## With Tensorflow Keras

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

2024-08-05 06:18:33.117401: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 06:18:33.117535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 06:18:33.254654: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
start_char = 1
oov_char = 2
index_from = 3

In [3]:
(x_train, y_train), _ = tf.keras.datasets.imdb.load_data(start_char=start_char, oov_char=oov_char, index_from=index_from)

In [4]:
x_train.shape

(25000,)

In [5]:
x_train.dtype

dtype('O')

In [6]:
y_train.shape

(25000,)

In [7]:
y_train[:5]

array([1, 0, 0, 1, 0])

In [8]:
df = pd.concat([pd.DataFrame({'X': x_train, 'Y': y_train})], ignore_index=True)
df.head()

Unnamed: 0,X,Y
0,"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...",1
1,"[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...",0
2,"[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...",0
3,"[1, 4, 18609, 16085, 33, 2804, 4, 2040, 432, 1...",1
4,"[1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...",0


In [9]:
word_index = tf.keras.datasets.imdb.get_word_index()

inverted_word_index = dict(
    (i + index_from, word) for (word, i) in word_index.items()
)

inverted_word_index[start_char] = ""
inverted_word_index[oov_char] = ""


In [10]:
print(len(word_index))
print(len(inverted_word_index))

88584
88586


In [11]:
encode = lambda text : [word_index[word] for word in text.split()]
decode = lambda inp : " ".join([inverted_word_index[idx] for idx in inp])

In [12]:
df["text"] = df["X"].apply(decode)

In [13]:
nlp = spacy.load("en_core_web_sm")
def remove_stop_words(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct]) 

In [14]:
df["pre_text"] = df["text"].apply(remove_stop_words)

In [16]:
nlp = spacy.load("en_core_web_lg")
df["vector"] = df["pre_text"].apply(lambda text: nlp(text).vector)

In [17]:
df.head()

Unnamed: 0,X,Y,text,pre_text,vector
0,"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...",1,this film was just brilliant casting location...,film brilliant cast location scenery story d...,"[-0.49241343, 0.22992784, -1.9614998, -2.44784..."
1,"[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...",0,big hair big boobs bad music and a giant safe...,big hair big boob bad music giant safety pin...,"[-0.4520517, 0.6301006, -1.7863605, -1.3567178..."
2,"[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...",0,this has to be one of the worst films of the ...,bad film 1990 friend watch film target audie...,"[-0.58676946, 1.1265762, -2.6766188, -1.314009..."
3,"[1, 4, 18609, 16085, 33, 2804, 4, 2040, 432, 1...",1,the scots excel at storytelling the tradition...,scot excel storytelle traditional sort year ...,"[-0.12821278, 0.26851207, -1.1525749, -1.29237..."
4,"[1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...",0,worst mistake of my life br br i picked this ...,bad mistake life br br pick movie target 5 f...,"[-0.37964636, 0.45462498, -2.716358, -0.844779..."


In [18]:
scaler = MinMaxScaler()

In [19]:
X = df["vector"].to_numpy()
X = np.array(X.tolist())
X.shape

(25000, 300)

In [20]:
X = scaler.fit_transform(X)

In [21]:
Y = df["Y"].to_numpy().astype(np.uint8)

In [21]:
X_train, X_test, y_train, y_test =  train_test_split(X, Y, test_size=0.2, random_state=42)

In [22]:
X_train.shape

(20000, 300)

In [23]:
y_train.shape

(20000,)

## With spacy vectors

In [24]:
input_size = 300
num_classes = 2
batch_size = 16
dropout = 0.2

In [28]:
model = Sequential([
    Dense(input_size, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(300, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(100, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(100, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(num_classes, activation="softmax")
])

In [29]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=40, validation_split=0.2, batch_size=32)

Epoch 1/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.5788 - loss: 0.7900 - val_accuracy: 0.5040 - val_loss: 0.8831
Epoch 2/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6911 - loss: 0.5829 - val_accuracy: 0.5020 - val_loss: 1.0949
Epoch 3/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6653 - loss: 0.6069 - val_accuracy: 0.5038 - val_loss: 0.8976
Epoch 4/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7684 - loss: 0.4815 - val_accuracy: 0.5445 - val_loss: 0.6354
Epoch 5/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7946 - loss: 0.4534 - val_accuracy: 0.7107 - val_loss: 0.5150
Epoch 6/40
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7909 - loss: 0.4445 - val_accuracy: 0.5505 - val_loss: 0.6685
Epoch 7/40
[1m500/500[0m 

<keras.src.callbacks.history.History at 0x7aa52318c5e0>

## With TF-IDF

In [15]:
vectorizer = TfidfVectorizer()

In [16]:
text = df["pre_text"].to_numpy()
text

array(['  film brilliant cast location scenery story direction suit play imagine robert redford amazing actor director norman father come scottish island love fact real connection film witty remark film great brilliant buy film soon release retail recommend watch fly fishing amazing cry end sad know cry film good definitely congratulation little boy play norman paul brilliant child leave praise list think star play grow big profile film child amazing praise think story lovely true life share',
       '  big hair big boob bad music giant safety pin word well describe terrible movie love cheesy horror movie see hundred get bad plot paper thin ridiculous acting abomination script completely laughable good end showdown cop work killer damn terribly write clothe sicken funny equal measure hair big lot boob bounce man wear cut tee shirt stomach sicken man actually wear music synthesis trash play scene trashy music boob paramedic take away body gym close bereavement joke aside truly bad film 

In [23]:
X = vectorizer.fit_transform(text).astype(np.float32)
X.shape

(25000, 62100)

In [24]:
X.dtype

dtype('float32')

In [25]:
Y = df["Y"].to_numpy().astype(np.uint8)
Y.shape

(25000,)

In [26]:
X_train, X_test, y_train, y_test =  train_test_split(X, Y, test_size=0.4, random_state=42)

In [27]:
input_size = X.shape[1]
num_classes = 2
dropout = 0.2

In [29]:
model = Sequential([
    Dense(input_size, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(1000, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(1000, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(100, activation="relu"),
    Dropout(dropout),
    BatchNormalization(),
    
    Dense(num_classes, activation="softmax")
])

In [None]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=40, validation_split=0.2, batch_size=1)

Epoch 1/40




In [None]:
### Need more GPU