<a href="https://colab.research.google.com/github/VickkiMars/NLP_Mastery/blob/main/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [3]:
import opendatasets as od
# Fetch the Kaggle dataset into ./positive-and-negative-sentences.
# The call prompts interactively for a Kaggle username and API key.
od.download(
    'https://www.kaggle.com/datasets/chaitanyarahalkar/positive-and-negative-sentences'
)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: 
Your Kaggle username: victorthamartian
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/chaitanyarahalkar/positive-and-negative-sentences
Downloading positive-and-negative-sentences.zip to ./positive-and-negative-sentences


100%|██████████| 484k/484k [00:00<00:00, 43.2MB/s]







In [6]:
import unicodedata
import string

In [7]:
# Characters that survive unicodeToAscii: ASCII letters plus space and . , ; '
all_letters = "".join((string.ascii_letters, " .,;'"))

In [95]:
def unicodeToAscii(s):
    """Strip accents from `s` and drop every character not listed in `all_letters`.

    The string is NFD-decomposed so that accents become separate combining
    marks (Unicode category 'Mn'); those marks are discarded, and of the
    remaining characters only the ones present in `all_letters` are kept.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from decomposition, e.g. the accent of "é".
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

In [8]:
def load_data(filepath):
  """Read the positive and negative sentence files as two lists of strings.

  Args:
    filepath: Prefix prepended verbatim to "positive.txt" and "negative.txt".
      For a directory it must end with a path separator, e.g.
      'positive-and-negative-sentences/'.

  Returns:
    A `(positive, negative)` tuple of lists, one sentence per element.
    Blank lines are dropped, so a trailing newline at the end of a file does
    not produce an empty "sentence" that would later receive a label.

  Raises:
    OSError: if either file cannot be opened.
  """
  def _read_sentences(name):
    # latin-1 maps every byte to a code point, so decoding never fails on the
    # non-UTF-8 bytes in these files.
    with open(filepath + name, encoding='latin-1') as f:
      return [line for line in f.read().split('\n') if line.strip()]

  return _read_sentences("positive.txt"), _read_sentences("negative.txt")

In [9]:
def print_samples(no, filepath='positive-and-negative-sentences/'):
  """Print the first `no` positive/negative sentence pairs.

  Args:
    no: Number of pairs to print. Values larger than the shorter corpus are
      clamped, and values <= 0 print nothing.
    filepath: Prefix passed to `load_data`; defaults to the downloaded
      dataset directory.
  """
  positive, negative = load_data(filepath)
  # Clamp so that asking for more samples than exist cannot raise IndexError.
  count = max(0, min(no, len(positive), len(negative)))
  for pos_line, neg_line in zip(positive[:count], negative[:count]):
    print(f'Positive sample: {pos_line}')
    print(f'Negative sample: {neg_line}')
    print('\n')

In [10]:
# Keep both corpora as module-level lists (later cells read `positive` and `negative`),
# then print a few pairs as a quick sanity check.
positive, negative = load_data('positive-and-negative-sentences/')
print_samples(3)

Positive sample: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal . 
Negative sample: simplistic , silly and tedious . 


Positive sample: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co writer/director peter jackson's expanded vision of j . r . r . tolkien's middle earth . 
Negative sample: it's so laddish and juvenile , only teenage boys could possibly find it funny . 


Positive sample: effective but too tepid biopic
Negative sample: exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 




In [11]:
def max_length(positive, negative):
  """Return the length, in characters, of the longest line in either corpus.

  Returns 0 when both corpora are empty.
  """
  lengths = [len(line) for corpus in (positive, negative) for line in corpus]
  return max(lengths, default=0)

In [12]:
import pandas as pd

In [13]:
def create_dataset(filepath):
  """Load both corpora and return them as a column-oriented dict.

  Returns:
    {'text': [...], 'label': [...]} with all positive sentences first
    (label 0) followed by all negative sentences (label 1).
  """
  positive, negative = load_data(filepath)

  # Label convention: positive -> 0, negative -> 1.
  sentences = [*positive, *negative]
  targets = [0] * len(positive) + [1] * len(negative)

  return {'text': sentences, 'label': targets}

In [14]:
# Assemble the labelled sentences into a two-column DataFrame.
data = pd.DataFrame(create_dataset('positive-and-negative-sentences/'))
# Sanity check: number of rows in which every column is null.
print(data.isnull().all(axis=1).sum())
# Shuffle so positives and negatives are interleaved. The seed is fixed so that
# re-running the notebook writes the rows to data.csv in the same order every time.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.to_csv('data.csv', index=False)

0


In [49]:
import tensorflow as tf

In [50]:
BATCH_SIZE = 16
# Stream (text, label) rows from the CSV. The relative path is the same location that
# `data.to_csv('data.csv', ...)` wrote to, so the cell also works outside Colab's /content.
dataset = tf.data.experimental.CsvDataset(
    "data.csv",
    [tf.string, tf.int32],  # column types: sentence text, integer label
    header=True,
)

In [52]:
# Preview only a handful of rows; printing thousands of tensors floods the notebook output.
for text, label in dataset.take(5):
  print(text, label)

tf.Tensor(b'philosophically , intellectually and logistically a mess . ', shape=(), dtype=string) tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(b'it grabs you in the dark and shakes you vigorously for its duration . ', shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(b'more timely than its director could ever have dreamed , this quietly lyrical tale probes the ambiguous welcome extended by iran to the afghani refugees who streamed across its borders , desperate for work and food . ', shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(b"it's tough to tell which is in more abundant supply in this woefully hackneyed movie , directed by scott kalvert , about street gangs and turf wars in 1958 brooklyn -- stale cliches , gratuitous violence , or empty machismo . ", shape=(), dtype=string) tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(b"both lead performances are oscar size . quaid is utterly fearless as the tortured husband living a painful lie , and 

In [53]:
# Distinct whitespace-separated tokens per corpus. The comprehension must yield `word`;
# yielding `line` would count unique sentences instead of unique words.
unique_positive_words = {word for line in positive for word in line.split()}
unique_negative_words = {word for line in negative for word in line.split()}
# +2 leaves room for the padding and out-of-vocabulary entries that TextVectorization
# reserves inside `max_tokens` when output_mode="int".
total = VOCAB_SIZE = len(unique_positive_words | unique_negative_words) + 2

In [54]:
import re

In [55]:
def custom_standardization(input_data):
    """Lower-case the text, replace "<br />" with a space, and delete ASCII punctuation."""
    # Character class matching any single character from string.punctuation.
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text


In [57]:
# output_sequence_length is measured in tokens, not characters. The whitespace token count
# of the longest sentence is an upper bound on what the layer can emit (standardization
# only removes characters), so nothing is truncated and sequences are not padded out to
# the much larger character length.
MAX_SEQUENCE_LENGTH = max(
    (len(line.split()) for line in positive + negative), default=1
)
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

In [58]:
# Text-only view of the dataset: labels are dropped because this is what the
# vectorization layer is adapted on.
text_data = dataset.map(lambda sentence, _label: sentence)

In [59]:
# adapt() is documented to take a *batched* tf.data.Dataset; batching does not change the
# learned vocabulary, it only avoids processing one sentence per step.
vectorize_layer.adapt(text_data.batch(BATCH_SIZE))

In [68]:
def vectorize_text(text, label):
    """Map raw text to token ids and give the labels a trailing axis of size 1.

    A trailing axis is added to both inputs, so a batch of B strings / B labels
    becomes the (B, 1) string tensor fed to `vectorize_layer` and (B, 1) labels
    that line up with the model's single sigmoid output.
    """
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), tf.expand_dims(label, -1)

# Batch BEFORE vectorizing: without .batch() every element would be expanded into a
# batch of exactly one sentence, so BATCH_SIZE would never be applied during training.
train = (
    dataset
    .batch(BATCH_SIZE)
    .map(vectorize_text)
    .cache()
    .prefetch(buffer_size=10)
)

## Building the Model

In [None]:
# Input: variable-length sequences of int64 token ids.
inputs = tf.keras.Input(shape=(None,), dtype="int64")
# Embedding table with `total` rows, each a 128-dimensional vector.
x = tf.keras.layers.Embedding(total, 128)(inputs)
x = tf.keras.layers.Dropout(0.5)(x)


# 64-unit LSTM that returns its output at every timestep, wrapped in Bidirectional.
# Note: `lstm_x` is the layer object itself, not a tensor.
lstm_x = tf.keras.layers.LSTM(64, return_sequences=True)
x = tf.keras.layers.Bidirectional(lstm_x)(x)

# Two strided 1-D convolutions (128 filters, kernel size 5, stride 3, no padding),
# followed by a global max-pool over the sequence.
x = tf.keras.layers.Conv1D(128, 5, padding='valid', activation="relu", strides=3)(x)
x = tf.keras.layers.Conv1D(128, 5, padding='valid', activation='relu', strides=3)(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)

# Classifier head: hidden dense layer, dropout, then a single sigmoid unit to pair with
# the binary cross-entropy loss below.
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
epochs = 2
# NOTE(review): no validation data is passed to fit(), so the accuracy it reports is
# measured on the training data only.
model.fit(train, epochs=epochs)

Epoch 1/2
    348/Unknown [1m105s[0m 286ms/step - accuracy: 0.5059 - loss: 0.7032