In [440]:
# Imports librairies
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import nltk
import os
import pandas as pd
from os import getcwd
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset


In [441]:
try:
    import datasets
except ModuleNotFoundError:
    !pip install datasets
    import datasets

In [442]:
try:
    from unidecode import unidecode
except ModuleNotFoundError:
    !pip install unidecode
    from unidecode import unidecode

In [443]:
#Load my data
Data_reddit = pd.read_csv("Reddit_Data.csv")
Data_twitter = pd.read_csv("Twitter_Data.csv")
Data_twitter.rename(columns = {'clean_text':'clean_comment'}, inplace = True)
Data = Data_reddit.append(Data_twitter).reset_index()

In [444]:
print(Data)

         index                                      clean_comment  category
0            0   family mormon have never tried explain them t...       1.0
1            1  buddhism has very much lot compatible with chr...       1.0
2            2  seriously don say thing first all they won get...      -1.0
3            3  what you have learned yours and only yours wha...       0.0
4            4  for your own benefit you may want read living ...       1.0
...        ...                                                ...       ...
200224  162975  why these 456 crores paid neerav modi not reco...      -1.0
200225  162976  dear rss terrorist payal gawar what about modi...      -1.0
200226  162977  did you cover her interaction forum where she ...       0.0
200227  162978  there big project came into india modi dream p...       0.0
200228  162979  have you ever listen about like gurukul where ...       1.0

[200229 rows x 3 columns]


In [445]:
#Rename our label
def rename(val):
    if val == -1:
        return "Negative"
    elif val == 0:
        return "Neutral"
    elif val ==1:
        return "Positive"

In [446]:
Data['category'] = Data['category'].apply(rename)
#shuffling the data
Data = Data.sample(frac=1)

In [447]:
#Drop null value
Data = Data.dropna()
Data.shape

(200118, 3)

In [448]:
#Select the comment
X= Data['clean_comment']

In [449]:
#Select the label
Y = pd.get_dummies(Data['category'],
                   columns=Data["category"]).values

In [450]:
#Create a vectorization layer
max_features = 1000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.
vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

In [451]:
#Adapt the vector
clean_comment = X['clean_comment'].values.tolist()
text_dataset = tf.data.Dataset.from_tensor_slices(clean_comment )
text_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [452]:
vectorize_layer.adapt(text_dataset.batch(64))

In [453]:
vocab = np.array(vectorize_layer.get_vocabulary())
vocab[:50]

array(['', '[UNK]', 'the', 'and', 'that', 'this', 'for', 'you', 'are',
       'they', 'not', 'have', 'with', 'but', 'will', 'was', 'people',
       'what', 'india', 'all', 'modi', 'bjp', 'has', 'can', 'like',
       'from', 'about', 'just', 'there', 'who', 'their', 'one', 'his',
       'good', 'how', 'more', 'don', 'would', 'now', 'your', 'same',
       'them', 'when', 'even', 'some', 'out', 'get', 'any', 'because',
       'only'], dtype='<U14')

In [454]:
#Create the embedding layers
SEED = 34

# set seed for reproducibility
tf.keras.utils.set_random_seed(SEED)

embedding = tf.keras.layers.Embedding(
        input_dim = len(vocab) ,
        output_dim = 6,# Use masking to handle the variable sequence lengths
        mask_zero=True)

In [455]:
# set seed for reproducibility
tf.keras.utils.set_random_seed(SEED)

model = tf.keras.Sequential([
    vectorize_layer,
    embedding,
    tf.keras.layers.Normalization(axis=None),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(units = 64, activation = 'relu'),
    tf.keras.layers.Dense(3, activation= "sigmoid")
])

In [456]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [457]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_10 (Text  (None, 4)                0         
 Vectorization)                                                  
                                                                 
 embedding_10 (Embedding)    (None, 4, 6)              6000      
                                                                 
 normalization_2 (Normalizat  (None, 4, 6)             3         
 ion)                                                            
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 4, 6)             0         
 lDropout1D)                                                     
                                                                 
 lstm_14 (LSTM)              (None, 100)               42800     
                                                     

In [458]:
from sklearn.model_selection import train_test_split
train_x, test_x , train_y, test_y = train_test_split(X, Y, test_size=.3)


In [459]:
# dictionary to keep history output from fit calls
logs = {}

# directory in which model checkpoints and logs are saved
LOG_DIR = 'logs'

def best_model_path(model_name):
    base_dir  = os.path.join(LOG_DIR, model_name)
    return os.path.join(base_dir, 'best_val_accuracy.ckpt')

def callback_list(model_name):
    base_dir  = os.path.join(LOG_DIR, model_name)
    tb_cb = tf.keras.callbacks.TensorBoard(base_dir)
    ckpt = tf.keras.callbacks.ModelCheckpoint(
         best_model_path(model_name),
         monitor='val_accuracy',
         mode='max', 
         verbose=0,
         save_best_only=True)
    backup_dir = os.path.join(base_dir, 'backup_checkpoint')
    bkp = tf.keras.callbacks.BackupAndRestore(
        backup_dir)
    return [tb_cb, ckpt, bkp]

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
MODEL_NAME = 'LSTM'
history = model.fit(train_x, train_y, epochs=30,
                    batch_size=32,
                    validation_split=0.15, 
                    callbacks=callback_list(MODEL_NAME))

Epoch 18/30
 617/3721 [===>..........................] - ETA: 37s - loss: 0.8904 - accuracy: 0.5505

In [None]:
#Test our model
from datasets import load_dataset
best_m=tf.keras.models.load_model( best_model_path('LSTM'))
my_review=[unidecode("J'ai détesté ce livre.")]
best_m.predict(my_review)

In [None]:
tf.keras.utils.set_random_seed(SEED)

model2 = tf.keras.Sequential([
    vectorize_layer,
    embedding,
    tf.keras.layers.Normalization(axis=None),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.LSTM(64,return_sequences=True)
    tf.keras.layers.Dense(units = 64, activation = 'relu'),
    tf.keras.layers.Dense(units = 32, activation = 'relu'),
    tf.keras.layers.Dense(3, activation= "sigmoid")
])

In [None]:
MODEL_NAME = 'Stack2LSTM'
logs[MODEL_NAME] = model2.fit(
    # TODO complete the fit call
    train_x,train_y, epochs=50,
    validation_data=0.15,
    callbacks=callback_list(MODEL_NAME)
    )

In [None]:
#Test our model
from datasets import load_dataset
best_m=tf.keras.models.load_model( best_model_path('LSTM'))
my_review=[unidecode("J'ai détesté ce livre.")]
best_m.predict(my_review)

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs