# Second dataset

## Get Data

In [1]:
import pandas as pd

In [2]:
dfcsv = pd.read_csv("../Twitter_Data.csv")

In [3]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
dfcsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
dfcsv.isnull().sum()

clean_text    4
category      7
dtype: int64

## Data cleansing

In [6]:
import tensorflow as tf
import numpy as np
import re

2022-11-24 12:01:02.762730: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-24 12:01:02.883751: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-24 12:01:02.887276: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-24 12:01:02.887286: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

### Remove null values 

In [7]:
dfcsv.dropna(inplace=True) # removes rows with null values

In [8]:
dfcsv.isnull().sum()

clean_text    0
category      0
dtype: int64

In [9]:
import re
# Same cleaning function as other dataset
def clean_data(text):
    # Removing mentions
    text = re.sub(r"/b@","",text)
        
    # Remove links
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    
    # Remove digits & strings of digits
    text = re.sub(r"([0-9])","",text)
    
    # Remove new lines
    text = re.sub(r"\n","",text)
    
    # Removes all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    
    return text

In [10]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(clean_data)

In [11]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


## Lowercase text

In [12]:
def lowercase_data(data):
    data = data.lower()
    return data

In [13]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lowercase_data)

In [14]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


## Remove emojis

In [15]:
def remove_emoji(data):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  #face emojis
                           u"\U0001F300-\U0001F5FF"  #symbols
                           u"\U0001F680-\U0001F6FF"  #transport & map emojis
                           u"\U0001F1E0-\U0001F1FF"  # flags
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', data)

In [16]:
remove_emoji("🇬🇧")

''

In [17]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lowercase_data)

In [18]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


## Remove Stop words

In [19]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/adamo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
stopwords = stopwords.words("english")

In [21]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lambda words: ' '.join([word for word in words.split() if word not in (stopwords)]))
# Using lamba to have a neat inline function, also works better than using a written function

In [23]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,modi promised minimum government maximum gover...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp told rahul main camp...,1.0
3,asking supporters prefix chowkidar names modi ...,1.0
4,answer among powerful world leader today trump...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace n...,0.0
7,comes cabinet scholars like modi smriti hema t...,0.0
8,upcoming election india saga going important p...,1.0
9,gandhi gay modi,1.0


## Removing non enlgish words

In [24]:
nltk.download("words")

[nltk_data] Downloading package words to /home/adamo/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [25]:
Allwords = set(nltk.corpus.words.words())

In [26]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lambda w: ' '.join([ws for ws in w.split() if ws in (Allwords) or not ws.isalpha()]))

In [27]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,-1.0
1,talk nonsense continue drama vote,0.0
2,say vote welcome told main campaigner think relax,1.0
3,prefix great service confusion read crustal cl...,1.0
4,answer among powerful world leader today trump...,1.0
5,tho refresh comment karo,0.0
6,surat perform divine grace become,0.0
7,comes cabinet like smriti time introspect,0.0
8,upcoming election saga going important pair lo...,1.0
9,gay,1.0


## Lemmatization

In [28]:
import nltk 
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /home/adamo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adamo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [29]:
from nltk.stem import WordNetLemmatizer

In [30]:
lemmatizer = WordNetLemmatizer()

In [31]:
print(lemmatizer.lemmatize("dogs"))
print(lemmatizer.lemmatize("phones"))

dog
phone


In [32]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lemmatizer.lemmatize)

In [33]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,-1.0
1,talk nonsense continue drama vote,0.0
2,say vote welcome told main campaigner think relax,1.0
3,prefix great service confusion read crustal cl...,1.0
4,answer among powerful world leader today trump...,1.0
5,tho refresh comment karo,0.0
6,surat perform divine grace become,0.0
7,comes cabinet like smriti time introspect,0.0
8,upcoming election saga going important pair lo...,1.0
9,gay,1.0


## Encoding the label

In [34]:
def sentiment_encoder(data):
    if data == -1.0:
        data = 0
    elif data == 1.0:
        data = 2
    elif data == 0:
        data = 1
    else:
        data = None
        
    return data

In [35]:
dfcsv["category"] = dfcsv["category"].apply(sentiment_encoder)

In [36]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,0
1,talk nonsense continue drama vote,1
2,say vote welcome told main campaigner think relax,2
3,prefix great service confusion read crustal cl...,2
4,answer among powerful world leader today trump...,2


In [37]:
dfcsv.isnull().sum()

clean_text    0
category      0
dtype: int64

## Split data into training and testing

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Test size is 30% of total data from data set
x_train, x_test, y_train, y_test = train_test_split(dfcsv["clean_text"].values, dfcsv["category"].values,
                                                    test_size=0.3)

# Checking the sentiment has been split up appropriately 
print('sentiment Text: ', x_train[2])
print('sentiment: ', y_train[2])

sentiment Text:  devotee given name people going back leading country
sentiment:  1


## Tokenizing data

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
tokenizer = Tokenizer(num_words=100000000)
totalWords = [*x_train, *x_test]
tokenizer.fit_on_texts(totalWords)

In [42]:
wordindex = tokenizer.word_index
VocabSize= len(wordindex)
print("The number of unique words is: ", VocabSize)

The number of unique words is:  17954


In [43]:
print(wordindex)



In [44]:
len(x_test)

48891

In [45]:
train_seq = tokenizer.texts_to_sequences(x_train) # Compares all the data to vocab and assigns a integer reference for the vocab 
test_seq = tokenizer.texts_to_sequences(x_test)
print(test_seq)

[[59, 286], [42, 3123, 1001], [1357, 46, 579, 153, 2192, 1916, 653, 5572, 579, 62, 1379], [8, 305, 278, 1172, 1364], [510, 3432, 79, 627, 101, 79, 267, 44, 266, 462, 73, 191], [129, 236, 486, 517, 2, 1283, 1115, 255, 2], [232], [40, 998, 9783, 22, 2604, 106, 53, 22, 9445, 303], [347, 56, 933, 22, 986, 85, 148, 986], [2960, 1756, 584, 78, 393, 381, 3642, 5560, 2572, 2225, 67, 271], [8739, 1196, 1159, 3962, 990, 103, 468, 1184, 12, 3962, 60, 53, 4, 923, 3143], [448, 55], [1187, 1000, 271, 476, 49, 60, 179, 73, 80, 285, 290, 16, 12, 285, 759, 16, 1362], [1325, 96, 461, 461], [1196, 1702, 964, 115, 5083, 5855, 924, 31, 18, 335, 58, 346, 256, 548, 4361, 707, 1743, 583, 669], [292, 2347, 7503, 811], [426, 697], [618, 2917, 4685, 234, 1269, 1724, 4771, 2732, 43, 584, 442, 785, 2989, 33, 477, 38, 1, 38], [4452, 2281, 377, 1835, 308, 166, 197, 2219, 186, 98, 4369, 358, 16572], [153, 118, 32, 462, 202, 6, 17, 15, 2260, 1278, 31, 18], [1444, 769, 431, 1, 1825, 1, 48, 365, 148, 1506, 1494, 38], [4

In [46]:
len(test_seq)

48891

In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [48]:
# PAD sequences so they are all the same length as the inputs must all be of same length

train_padded = pad_sequences(train_seq)

print(train_padded[0])

# Check size of each padded sequence

TrainingPadShape = train_padded.shape[1] # for use when defining the model shape 
print(TrainingPadShape)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0  881   22  293  142   89  949 1643   17
  398   72  523]
31


In [49]:
test_padded = pad_sequences(test_seq,maxlen=TrainingPadShape)
print(test_padded.shape[1])

31


In [50]:
print(y_train)

[1 2 1 ... 2 2 0]


# Data changes

In [51]:
from keras.utils import to_categorical

In [52]:
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

In [53]:
print(y_train)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]]


## Constructing the model

In [54]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model

In [55]:
DimentionEmbeddingLayer = 10
DimentionLSTMLayer = 30

inputLayer = Input(shape=(TrainingPadShape, ))
modelLayers = Embedding(VocabSize +1, DimentionEmbeddingLayer)(inputLayer) # Vocabsize + 1 as indexing starts from 0
modelLayers = LSTM(DimentionLSTMLayer, return_sequences=True)(modelLayers)
modelLayers = GlobalMaxPooling1D()(modelLayers)
modelLayers = Dense(64, activation="relu")(modelLayers)
modelLayers = Dense(3, activation="softmax")(modelLayers)

model = Model(inputLayer, modelLayers)

2022-11-24 12:01:13.714005: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-24 12:01:13.714027: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-24 12:01:13.714046: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (adamo-Surface-Pro-7): /proc/driver/nvidia/version does not exist
2022-11-24 12:01:13.714210: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Compling the model

In [56]:
model.compile(optimizer="RMSprop",loss="CategoricalCrossentropy",metrics=["accuracy"])

## Fitting data

In [57]:
fittingdata= model.fit(train_padded, y_train, epochs=1,validation_data=(test_padded, y_test))



### Tesing tracking weights

In [58]:
from keras.callbacks import ModelCheckpoint

In [59]:
checkpoint1 = ModelCheckpoint("Final_model.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model.fit(train_padded, y_train, epochs=25,validation_data=(test_padded, y_test),callbacks=[checkpoint1])

Epoch 1/25
Epoch 1: val_accuracy improved from -inf to 0.87171, saving model to Final_model.hdf5
Epoch 2/25
Epoch 2: val_accuracy improved from 0.87171 to 0.87576, saving model to Final_model.hdf5
Epoch 3/25
Epoch 3: val_accuracy improved from 0.87576 to 0.87634, saving model to Final_model.hdf5
Epoch 4/25
Epoch 4: val_accuracy improved from 0.87634 to 0.87662, saving model to Final_model.hdf5
Epoch 5/25
Epoch 5: val_accuracy did not improve from 0.87662
Epoch 6/25
Epoch 6: val_accuracy did not improve from 0.87662
Epoch 7/25
Epoch 7: val_accuracy did not improve from 0.87662
Epoch 8/25
Epoch 8: val_accuracy improved from 0.87662 to 0.87789, saving model to Final_model.hdf5
Epoch 9/25
Epoch 9: val_accuracy did not improve from 0.87789
Epoch 10/25
Epoch 10: val_accuracy did not improve from 0.87789
Epoch 11/25
Epoch 11: val_accuracy did not improve from 0.87789
Epoch 12/25
Epoch 12: val_accuracy did not improve from 0.87789
Epoch 13/25
Epoch 13: val_accuracy did not improve from 0.87789

## Load model two

In [62]:
# Load model as model
from tensorflow import keras
model = keras.models.load_model('Final_model.hdf5')

## Evalutating model

In [63]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 31)]              0         
                                                                 
 embedding (Embedding)       (None, 31, 10)            179550    
                                                                 
 lstm (LSTM)                 (None, 31, 30)            4920      
                                                                 
 global_max_pooling1d (Globa  (None, 30)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 64)                1984      
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                             

In [64]:
score = model.evaluate(test_padded, y_test)
print("Model Loss: ", score[0])
print("Model accuracy", score[1])

Model Loss:  0.3893398940563202
Model accuracy 0.8778916597366333


## Predicting data

In [65]:
def predict_sentiment(text):
    text_sequence = tokenizer.texts_to_sequences(text)

    text_padded = pad_sequences(text_sequence, maxlen=TrainingPadShape)
    
    predicted_sentiment = model.predict(text_padded)
    if predicted_sentiment[0][0] > predicted_sentiment[0][1] and predicted_sentiment[0][0] > predicted_sentiment[0][2]:
        print("Negative")
    elif predicted_sentiment[0][1] > predicted_sentiment[0][0] and predicted_sentiment[0][1] > predicted_sentiment[0][2]:
        print("Neutral")
    else:
        print("Positive")

In [66]:
text = []
userinput = input("Text to test sentiment... ")
text.append(userinput)
predict_sentiment(text)

Text to test sentiment... 
Neutral


In [67]:
text = ['Today I have had such a productive day! I watered the plants and went to the gym.']
predict_sentiment(text)

Neutral


In [68]:
text = ['Done nothing, not feeling great, need a sleep.']
predict_sentiment(text)

Positive


In [69]:
text = ['Mathieu Debuchy']
predict_sentiment(text)

Neutral


In [70]:
text = ['Bad day']
predict_sentiment(text)

Negative


In [71]:
text = ['WHAT A WIN']
predict_sentiment(text)

Positive


In [72]:
text = ['Drone war 1']
predict_sentiment(text)

Neutral
