In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,SimpleRNN,LSTM,GRU,Bidirectional

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tweet dataset from a hard-coded absolute path under /content/drive/MyDrive;
# the path has to be adjusted when the notebook runs in a different environment.
df = pd.read_csv('/content/drive/MyDrive/NLP(Classes)/train.csv/train.csv')

In [3]:
# Preview the first rows; the frame has the columns id, label and tweet.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Check column dtypes and non-null counts before any cleaning is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(31962, 3)

In [6]:
# Class balance check: number of rows per label value.
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [7]:
# Take the raw tweet text column as the starting point for cleaning.
tweets=df['tweet']

**DATA CLEANING**

In [8]:
# Normalise the tweet text: lowercase, drop @handles, turn '#' into a space,
# then keep only lowercase letters and whitespace.
tweets = tweets.str.lower()

# removing twitter handles (@user)
tweets = tweets.str.replace(r"@\w*", "", regex=True)

# replace hash signs with a space; this has to run BEFORE the character filter
# below, which would otherwise delete every '#' first and turn this step into a no-op
tweets = tweets.str.replace("#", " ", regex=False)

# removing special characters and numbers
# (raw string: "\s" is not a valid escape sequence in a normal string literal)
tweets = tweets.str.replace(r"[^a-z\s]", "", regex=True)

# total number of word tokens that remain after cleaning
count_words = tweets.str.findall(r'(\w+)').str.len()
print(count_words.sum())

383088


In [9]:
# Display the raw tweet column of df for comparison with the cleaned `tweets` Series.
df['tweet']

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object

In [10]:
# Sample of the cleaned text.
tweets.head()

0      when a father is dysfunctional and is so sel...
1      thanks for lyft credit i cant use cause they...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                 factsguide society now    motivation
Name: tweet, dtype: object

In [11]:
# Features are the cleaned tweet strings; the target is the `label` column of df.
X = tweets
y = df['label']

**DIVIDING INTO TRAINING AND TESTING DATA**

In [12]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified although the label counts are very
# uneven — consider passing stratify=y.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [13]:
# Put the training tweets and their labels side by side in one frame
# (columns: tweet, label) so rows can later be filtered by class.
df3=pd.concat([xtrain,ytrain],axis=1)
df3.head()

Unnamed: 0,tweet,label
18168,russian default position when faced with accus...,0
15286,if u looked at one of their hairlines its a...,0
4964,wowfinally i see you at southcitymall fellin...,0
5373,im always bereft when i finish somethi...,0
24201,you hispanic amp feel like the are stomping ...,1


**DATA AUGMENTATION**

In [14]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[?25l[K     |▉                               | 10 kB 24.3 MB/s eta 0:00:01[K     |█▋                              | 20 kB 30.8 MB/s eta 0:00:01[K     |██▍                             | 30 kB 24.0 MB/s eta 0:00:01[K     |███▎                            | 40 kB 17.9 MB/s eta 0:00:01[K     |████                            | 51 kB 9.9 MB/s eta 0:00:01[K     |████▉                           | 61 kB 9.6 MB/s eta 0:00:01[K     |█████▋                          | 71 kB 8.1 MB/s eta 0:00:01[K     |██████▌                         | 81 kB 8.9 MB/s eta 0:00:01[K     |███████▎                        | 92 kB 9.0 MB/s eta 0:00:01[K     |████████                        | 102 kB 8.9 MB/s eta 0:00:01[K     |█████████                       | 112 kB 8.9 MB/s eta 0:00:01[K     |█████████▊                      | 122 kB 8.9 MB/s eta 0:00:01[K     |██████████▌                     | 133 kB 8.9 MB/s eta 0:00:01[K 

In [15]:
# Word-level augmenters from nlpaug (SynonymAug is used in the augmentation cell).
import nlpaug.augmenter.word as naw
import nltk
# NLTK resources fetched up front — presumably needed by the WordNet-based
# synonym augmenter (POS tagging and synonym lookup); TODO confirm against nlpaug docs.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [16]:
# Oversample the minority class (label 1) with WordNet synonym replacement:
# each minority training tweet is augmented with n=14.
aug = naw.SynonymAug(aug_src='wordnet')

aug_texts = [
    variant
    for text in df3.loc[df3['label'] == 1, 'tweet'].tolist()
    for variant in aug.augment(text, n=14)
]

print(len(aug_texts))

# Append the synthetic rows (all labelled 1) below the original training rows
# and renumber the index from 0.
data_resampled_nlpaug = pd.concat(
    [df3, pd.DataFrame({'tweet': aug_texts}).assign(label=1)],
    axis=0,
    ignore_index=True,
)

22302


In [17]:
# Class counts of the training frame after augmentation.
data_resampled_nlpaug['label'].value_counts()

1    23895
0    20780
Name: label, dtype: int64

**TRAINING DATA PRE-PROCESSING**

In [18]:
#Tokenization
# The word index is fitted on the augmented training tweets only; the test
# tweets are not passed to fit_on_texts.
tok=Tokenizer()
tok.fit_on_texts(data_resampled_nlpaug['tweet'])

In [19]:
#Vocabulary length
# Number of entries in the fitted tokenizer's index_word mapping; the Embedding
# layers further down use vocab_len+1 as their input dimension.
vocab_len=len(tok.index_word)
vocab_len

35240

In [20]:
#Text to sequence
# Encode every training tweet with the fitted tokenizer (one sequence per tweet).
train_sequence=tok.texts_to_sequences(data_resampled_nlpaug['tweet'])

In [21]:
# Length (number of tokens) of every encoded training tweet.
doc_len = [len(doc) for doc in train_sequence]

In [22]:
# 0.99 quantile of the training sequence lengths.
np.quantile(doc_len,0.99)

25.0

In [23]:
# Fixed sequence length passed to pad_sequences and to the Embedding layers;
# 25 equals the 0.99 quantile of the training sequence lengths computed above.
max_len=25

In [24]:
#Padding
# Bring every training sequence to max_len entries; the displayed matrix shows
# zeros filling the front of short sequences.
train_matrix=sequence.pad_sequences(train_sequence,maxlen=max_len)
train_matrix

array([[   0,    0,    0, ...,    9,    2, 1123],
       [   0,    0,    0, ...,   66,  215,  102],
       [   0,    0,    0, ...,  454,    8, 3295],
       ...,
       [   0,    0,    0, ..., 5145,  350, 5146],
       [   0,    0,    0, ...,  237,  698, 5146],
       [   0,    0,    0, ..., 5145,  350, 5146]], dtype=int32)

**TEST DATA PRE-PROCESSING**

In [25]:
# Encode the held-out tweets with the tokenizer fitted on the training data and
# pad them to the same max_len as the training matrix.
test_sequence=tok.texts_to_sequences(xtest)
test_matrix=sequence.pad_sequences(test_sequence,maxlen=max_len)
test_matrix

array([[   0,    0,    0, ...,  130,  204, 2747],
       [   0,    0,    0, ..., 8988, 8988, 7533],
       [   0,    0,    0, ...,  669,  110, 2209],
       ...,
       [   0,    0,    0, ...,  198, 3125,  302],
       [   0,    0,    0, ..., 3188,  580, 1704],
       [   0,    0,    0, ...,   11,   33, 7626]], dtype=int32)

**RNN**

In [26]:
#Single layer RNN
# Embedding (50-dim, zero index masked) -> SimpleRNN(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    SimpleRNN(128),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 50)            1762050   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               22912     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,793,283
Trainable params: 1,793,283
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [28]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ffce207d0>

In [29]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [30]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [31]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      8940
           1       0.60      0.59      0.60       649

    accuracy                           0.95      9589
   macro avg       0.79      0.78      0.78      9589
weighted avg       0.95      0.95      0.95      9589



In [32]:
#Bidirectional RNN
# Embedding (50-dim, zero index masked) -> Bidirectional SimpleRNN(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    Bidirectional(SimpleRNN(128)),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               45824     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 1,824,387
Trainable params: 1,824,387
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [34]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ffcd42090>

In [35]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [36]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [37]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      8940
           1       0.66      0.59      0.63       649

    accuracy                           0.95      9589
   macro avg       0.82      0.78      0.80      9589
weighted avg       0.95      0.95      0.95      9589



In [38]:
#Multiple layer RNN
# Two stacked SimpleRNN layers: the first returns the full sequence so the second
# (64 units) can consume it; followed by tanh Dense(64) and a sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    SimpleRNN(128, return_sequences=True),
    SimpleRNN(64),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 25, 128)           22912     
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 64)                12352     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 1,801,539
Trainable params: 1,801,539
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [40]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ffc5d6690>

In [41]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [42]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [43]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      8940
           1       0.56      0.58      0.57       649

    accuracy                           0.94      9589
   macro avg       0.77      0.77      0.77      9589
weighted avg       0.94      0.94      0.94      9589



**LSTM**

In [44]:
#Single layer LSTM
# Embedding (50-dim, zero index masked) -> LSTM(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    LSTM(128),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 1,862,019
Trainable params: 1,862,019
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [46]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ff6bf7250>

In [47]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [48]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [49]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      8940
           1       0.63      0.60      0.62       649

    accuracy                           0.95      9589
   macro avg       0.80      0.79      0.79      9589
weighted avg       0.95      0.95      0.95      9589



In [50]:
#Bidirectional LSTM
# Embedding (50-dim, zero index masked) -> Bidirectional LSTM(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(128)),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               183296    
_________________________________________________________________
dense_8 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 1,961,859
Trainable params: 1,961,859
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [52]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ff4ceb810>

In [53]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [54]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [55]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      8940
           1       0.69      0.61      0.65       649

    accuracy                           0.96      9589
   macro avg       0.83      0.80      0.81      9589
weighted avg       0.95      0.96      0.95      9589



In [56]:
#Multi-layer LSTM
# Two stacked LSTM(128) layers: the first returns the full sequence so the second
# can consume it; followed by tanh Dense(64) and a sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 128)           91648     
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 1,993,603
Trainable params: 1,993,603
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [58]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ff262a4d0>

In [59]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [60]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [61]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      8940
           1       0.62      0.62      0.62       649

    accuracy                           0.95      9589
   macro avg       0.80      0.80      0.80      9589
weighted avg       0.95      0.95      0.95      9589



**GRU**

In [62]:
#GRU
# Embedding (50-dim, zero index masked) -> GRU(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    GRU(128),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
gru (GRU)                    (None, 128)               69120     
_________________________________________________________________
dense_12 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 65        
Total params: 1,839,491
Trainable params: 1,839,491
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [64]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ff15231d0>

In [65]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [66]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [67]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      8940
           1       0.62      0.59      0.60       649

    accuracy                           0.95      9589
   macro avg       0.79      0.78      0.79      9589
weighted avg       0.95      0.95      0.95      9589



In [68]:
#Bidirectional GRU
# Embedding (50-dim, zero index masked) -> Bidirectional GRU(128) -> tanh Dense(64) -> sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    Bidirectional(GRU(128)),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               138240    
_________________________________________________________________
dense_14 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 65        
Total params: 1,916,803
Trainable params: 1,916,803
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [70]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9fee4997d0>

In [71]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [72]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [73]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      8940
           1       0.65      0.60      0.62       649

    accuracy                           0.95      9589
   macro avg       0.81      0.79      0.80      9589
weighted avg       0.95      0.95      0.95      9589



In [74]:
#Multi-layer GRU
# Two stacked GRU(128) layers: the first returns the full sequence so the second
# can consume it; followed by tanh Dense(64) and a sigmoid output.
model = Sequential([
    Embedding(vocab_len + 1, 50, input_length=max_len, mask_zero=True),
    GRU(128, return_sequences=True),
    GRU(128),
    Dense(64, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 25, 50)            1762050   
_________________________________________________________________
gru_2 (GRU)                  (None, 25, 128)           69120     
_________________________________________________________________
gru_3 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 1,938,563
Trainable params: 1,938,563
Non-trainable params: 0
_________________________________________________________________


In [75]:
# Adam optimizer with binary cross-entropy loss; no extra metrics are requested.
model.compile(optimizer="adam",loss="binary_crossentropy")

In [76]:
# Train on the padded, augmented training matrix for 15 epochs with batches of 64.
model.fit(train_matrix,data_resampled_nlpaug['label'],epochs=15,batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9fef0d46d0>

In [77]:
# Model outputs for the padded test tweets.
ypred=model.predict(test_matrix)

In [78]:
# Convert the outputs into hard 0/1 labels with a 0.5 cut-off.
ypred = np.where(ypred >= 0.5,1,0)

In [79]:
# Per-class precision / recall / F1 on the held-out test labels.
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      8940
           1       0.65      0.60      0.63       649

    accuracy                           0.95      9589
   macro avg       0.81      0.79      0.80      9589
weighted avg       0.95      0.95      0.95      9589

