# Second dataset

## Get Data

In [1]:
import pandas as pd

In [2]:
dfcsv = pd.read_csv("../Twitter_Data.csv")

In [3]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
dfcsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
dfcsv.isnull().sum()

clean_text    4
category      7
dtype: int64

## Data cleansing

In [6]:
import tensorflow as tf
import numpy as np
import re

2022-10-31 21:14:48.689678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-31 21:14:49.005046: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-31 21:14:49.071856: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-31 21:14:49.071885: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

### Remove null values 

In [7]:
dfcsv.dropna(inplace=True) # removes rows with null values

In [8]:
dfcsv.isnull().sum()

clean_text    0
category      0
dtype: int64

In [9]:
import re
# Same cleaning function as other dataset
def clean_data(text):
    # Removing mentions
    text = re.sub(r"/b@","",text)
        
    # Remove links
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    
    # Remove digits & strings of digits
    text = re.sub(r"([0-9])","",text)
    
    # Remove new lines
    text = re.sub(r"\n","",text)
    
    # Removes all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    
    return text

In [10]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(clean_data)

In [11]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


## Lowercase text

In [12]:
def lowercase_data(data):
    data = data.lower()
    return data

In [13]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lowercase_data)

In [14]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


## Remove emojis

In [15]:
def remove_emoji(data):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  #face emojis
                           u"\U0001F300-\U0001F5FF"  #symbols
                           u"\U0001F680-\U0001F6FF"  #transport & map emojis
                           u"\U0001F1E0-\U0001F1FF"  # flags
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', data)

In [16]:
remove_emoji("🇬🇧")

''

In [17]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lowercase_data)

In [18]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


## Remove Stop words

In [19]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/adamo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
stopwords = stopwords.words("english")

In [21]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lambda words: ' '.join([word for word in words.split() if word not in (stopwords)]))
# Using lamba to have a neat inline function, also works better than using a written function

In [23]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,modi promised minimum government maximum gover...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp told rahul main camp...,1.0
3,asking supporters prefix chowkidar names modi ...,1.0
4,answer among powerful world leader today trump...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace n...,0.0
7,comes cabinet scholars like modi smriti hema t...,0.0
8,upcoming election india saga going important p...,1.0
9,gandhi gay modi,1.0


## Removing non enlgish words

In [24]:
nltk.download("words")

[nltk_data] Downloading package words to /home/adamo/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [25]:
Allwords = set(nltk.corpus.words.words())

In [26]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lambda w: ' '.join([ws for ws in w.split() if ws in (Allwords) or not ws.isalpha()]))

In [27]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,-1.0
1,talk nonsense continue drama vote,0.0
2,say vote welcome told main campaigner think relax,1.0
3,prefix great service confusion read crustal cl...,1.0
4,answer among powerful world leader today trump...,1.0
5,tho refresh comment karo,0.0
6,surat perform divine grace become,0.0
7,comes cabinet like smriti time introspect,0.0
8,upcoming election saga going important pair lo...,1.0
9,gay,1.0


## Lemmatization

In [28]:
import nltk 
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /home/adamo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adamo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [29]:
from nltk.stem import WordNetLemmatizer

In [30]:
lemmatizer = WordNetLemmatizer()

In [31]:
print(lemmatizer.lemmatize("dogs"))
print(lemmatizer.lemmatize("phones"))

dog
phone


In [32]:
dfcsv["clean_text"] = dfcsv["clean_text"].apply(lemmatizer.lemmatize)

In [33]:
dfcsv.head(10)

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,-1.0
1,talk nonsense continue drama vote,0.0
2,say vote welcome told main campaigner think relax,1.0
3,prefix great service confusion read crustal cl...,1.0
4,answer among powerful world leader today trump...,1.0
5,tho refresh comment karo,0.0
6,surat perform divine grace become,0.0
7,comes cabinet like smriti time introspect,0.0
8,upcoming election saga going important pair lo...,1.0
9,gay,1.0


## Encoding the label

In [34]:
def sentiment_encoder(data):
    if data == -1.0:
        data = 0
    elif data == 1.0:
        data = 1
    elif data == 0:
        data = 0.5
    else:
        data = None
        
    return data

In [35]:
dfcsv["category"] = dfcsv["category"].apply(sentiment_encoder)

In [36]:
dfcsv.head()

Unnamed: 0,clean_text,category
0,minimum government maximum governance begin di...,0.0
1,talk nonsense continue drama vote,0.5
2,say vote welcome told main campaigner think relax,1.0
3,prefix great service confusion read crustal cl...,1.0
4,answer among powerful world leader today trump...,1.0


In [37]:
dfcsv.isnull().sum()

clean_text    0
category      0
dtype: int64

## Split data into training and testing

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Test size is 30% of total data from data set
x_train, x_test, y_train, y_test = train_test_split(dfcsv["clean_text"].values, dfcsv["category"].values,
                                                    test_size=0.3)

# Checking the sentiment has been split up appropriately 
print('sentiment Text: ', x_train[2])
print('sentiment: ', y_train[2])

sentiment Text:  doubt fixed baar sarkar big fan sir
sentiment:  1.0


## Tokenizing data

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
tokenizer = Tokenizer(num_words=100000000)
totalWords = [*x_train, *x_test]
tokenizer.fit_on_texts(totalWords)

In [42]:
wordindex = tokenizer.word_index
VocabSize= len(wordindex)
print("The number of unique words is: ", VocabSize)

The number of unique words is:  17954


In [43]:
print(wordindex)



In [44]:
len(x_test)

48891

In [45]:
train_seq = tokenizer.texts_to_sequences(x_train) # Compares all the data to vocab and assigns a integer reference for the vocab 
test_seq = tokenizer.texts_to_sequences(x_test)
print(test_seq)

[[92, 388, 211, 3141, 32, 889, 3], [449, 1368, 261, 395, 67, 38, 8, 1079, 463, 221, 39, 115], [61, 563, 1001, 581, 573, 573], [265, 242, 6026, 3, 284, 176, 3543, 16, 213, 2369, 355, 353, 319], [82, 11, 1536, 4834], [57, 338, 204], [1933, 753, 48, 23, 120], [788, 24, 733, 7, 6361, 227, 8, 21, 16], [664, 17, 13, 639, 5469, 152, 6421, 219, 637, 5307, 4311, 2467], [10, 1104, 5, 582, 301], [69, 21, 481, 1800], [280, 177, 1088, 296, 85, 5, 518, 2, 179], [82, 11, 3, 88, 59], [1361, 76, 4720, 1932, 685, 46, 16612, 1241, 1219, 1235, 3146, 841, 4, 392, 780, 434, 1476, 4341, 1520, 841, 3146], [402, 333, 86, 16, 268, 126, 469, 1154, 2600, 5199, 202, 36], [542, 255, 6, 13, 315, 8, 911, 314, 365, 58, 398, 38, 376, 1926, 2334, 955, 3625, 6524, 1251, 428], [534, 387, 182, 382, 21, 1042], [7692], [13], [1319, 617, 1019, 48, 147, 660, 2, 3952, 26], [1856, 205, 49, 65, 582, 71], [13, 75, 1355, 126, 42, 167, 533, 2456, 2713, 3], [596, 193, 779], [2, 4831, 4772, 5], [126, 1, 224, 1391, 10, 287, 280, 393, 3

In [46]:
len(test_seq)

48891

In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [48]:
# PAD sequences so they are all the same length as the inputs must all be of same length

train_padded = pad_sequences(train_seq)

print(train_padded[0])

# Check size of each padded sequence

TrainingPadShape = train_padded.shape[1] # for use when defining the model shape 
print(TrainingPadShape)

[   0    0    0    0    0    0    0    0    0    0    0    0  519  891
 2751  110  891 2751    3 3577 2035 2531  128 2531   85  570  102  198
  183  804 1672]
31


In [49]:
test_padded = pad_sequences(test_seq,maxlen=TrainingPadShape)
print(test_padded.shape[1])

31


## Constructing the model

In [50]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model

In [67]:
DimentionEmbeddingLayer = 10
DimentionLSTMLayer = 30

inputLayer = Input(shape=(TrainingPadShape, ))
modelLayers = Embedding(VocabSize +1, DimentionEmbeddingLayer)(inputLayer) # Vocabsize + 1 as indexing starts from 0
modelLayers = LSTM(DimentionLSTMLayer, return_sequences=True)(modelLayers)
modelLayers = GlobalMaxPooling1D()(modelLayers)
modelLayers = Dense(32, activation="relu")(modelLayers)
modelLayers = Dense(1, activation="sigmoid")(modelLayers)

model = Model(inputLayer, modelLayers)

## Compling the model

In [68]:
model.compile(optimizer="RMSprop",loss="binary_crossentropy",metrics=["accuracy"])

## Fitting data

In [65]:
from keras.callbacks import ModelCheckpoint

### Fitting data once

In [66]:
fittingdata= model.fit(train_padded, y_train, epochs=1,validation_data=(test_padded, y_test))



### Fitting data for best model

In [None]:
checkpoint1 = ModelCheckpoint("Final_model.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model.fit(train_padded, y_train, epochs=150,validation_data=(test_padded, y_test),callbacks=[checkpoint1])

## Evalutating model

In [None]:
# Load model as model
from tensorflow import keras
model = keras.models.load_model('best_model1.hdf5')

In [None]:
model.summary()

In [None]:
score = model.evaluate(test_padded, y_test)
print("Model Loss: ", score[0])
print("Model accuracy", score[1])

## Predicting data

In [None]:
def predict_sentiment(text):
    text_sequence = tokenizer.texts_to_sequences(text)

    text_padded = pad_sequences(text_sequence, maxlen=TrainingPadShape)
    
    predicted_sentiment = model.predict(text_padded)
    if predicted_sentiment < 0.3:
        print("Negative")
    elif predicted_sentiment > 0.6:
        print("Positive")
    else:
        print("Neutral")
    print(predicted_sentiment)

In [None]:
text = ['Today I have had such a productive day! I watered the plants and went to the gym.']
predict_sentiment(text)

In [None]:
text = ['Done nothing, not feeling great, need a sleep.']
predict_sentiment(text)

In [None]:
text = ['Mathieu Debuchy']
predict_sentiment(text)

In [None]:
text = ['Bad day']
predict_sentiment(text)

In [None]:
text = ['WHAT A WIN']
predict_sentiment(text)

In [None]:
text = ['Drone war 1']
predict_sentiment(text)