In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Bidirectional

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the training CSV on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/NLP(Classes)/train.csv/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows to confirm the columns (id, label, tweet) loaded as expected.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Class balance of the full dataset: number of tweets per label value.
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [None]:
# Working handle on the raw tweet text; later cells rebind `tweets` to cleaned
# versions, while df['tweet'] keeps the original text.
tweets=df['tweet']

**DATA CLEANING**

In [None]:
# Normalise the raw tweets into lowercase text made only of letters and
# whitespace. Vectorised `.str` methods are used throughout so that missing
# values propagate as NaN instead of raising inside `re.sub`.
tweets = tweets.str.lower()

# removing twitter handles (@user)
tweets = tweets.str.replace(r"@\w*", "", regex=True)

# remove hash tags: turn '#' into a space *before* stripping symbols, so the
# tag text survives as its own word (e.g. "love#happy" -> "love happy").
# The symbol filter below deletes '#', so running this step after it would
# never match anything.
tweets = tweets.str.replace("#", " ", regex=False)

# removing special characters and numbers (raw string so `\s` is a regex
# whitespace class rather than an invalid Python string escape)
tweets = tweets.str.replace(r"[^a-z\s]", "", regex=True)

# total number of word tokens left after cleaning
count_words = tweets.str.count(r"\w+")
print(count_words.sum())


383088


In [None]:
# Raw tweet column, shown for comparison with the cleaned `tweets` series.
df['tweet']

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object

In [None]:
# Spot-check the first few tweets after cleaning.
tweets.head()

0      when a father is dysfunctional and is so sel...
1      thanks for lyft credit i cant use cause they...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                 factsguide society now    motivation
Name: tweet, dtype: object

In [None]:
# Features are the cleaned tweet texts; the target is the binary label column.
X, y = tweets, df["label"]

**DIVIDING INTO TRAINING AND TESTING DATA**

In [None]:
# Hold out 30% of the tweets for evaluation; the fixed seed keeps the split
# reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Put the training texts and their labels side by side in one frame.
df3 = pd.concat([xtrain, ytrain], axis="columns")
df3.head()

Unnamed: 0,tweet,label
18168,russian default position when faced with accus...,0
15286,if u looked at one of their hairlines its a...,0
4964,wowfinally i see you at southcitymall fellin...,0
5373,im always bereft when i finish somethi...,0
24201,you hispanic amp feel like the are stomping ...,1


In [None]:
# Class balance of the training split only.
df3['label'].value_counts()

0    20780
1     1593
Name: label, dtype: int64

**TRAINING DATA PRE-PROCESSING**

In [None]:
# Tokenization: learn the word -> integer index mapping from the training
# tweets only, so the test split has no influence on the vocabulary.
tok = Tokenizer()
tok.fit_on_texts(texts=df3["tweet"])

In [None]:
# Vocabulary length: number of distinct words the tokenizer has indexed.
vocab_len = len(tok.word_index)
vocab_len

31821

In [None]:
# Text to sequence: encode each training tweet as a list of word indices.
train_sequence = tok.texts_to_sequences(texts=df3["tweet"])

**PRE-TRAINED GLOVE EMBEDDINGS**

In [None]:
# Pre-trained GloVe word vectors obtained through gensim's downloader. They are
# used further down to fill the embedding matrix; no data augmentation happens here.
import gensim.downloader as api
embeddings_ap = api.load("glove-twitter-200")  # load glove vectors



In [None]:
# Create an embedding matrix for the tokenized text.
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Row 0 (the padding index) and rows of words absent from the pre-trained
# vocabulary stay all-zero.
embedding_dim = embeddings_ap.vector_size  # width of the loaded vectors
embedding_matrix = np.zeros((vocab_len + 1, embedding_dim))

for word, i in tok.word_index.items():
    # Explicit membership test instead of a bare `except: pass`: only a
    # genuinely missing word is skipped, and any other error still surfaces.
    if word in embeddings_ap:
        embedding_matrix[i] = embeddings_ap[word]

In [None]:
# Number of tokens in every encoded training tweet.
doc_len = [len(encoded) for encoded in train_sequence]

In [None]:
# 99th percentile of the training sequence lengths (basis for the padding length).
np.quantile(doc_len,0.99)

25.0

In [None]:
# Padded sequence length: the 99th percentile of the training tweet lengths,
# rounded up to a whole number of tokens. Deriving it from `doc_len` keeps it
# in sync with the data instead of hard-coding the value seen in one run.
max_len = int(np.ceil(np.quantile(doc_len, 0.99)))

In [None]:
# Padding: bring every training sequence to exactly `max_len` entries so the
# result is a rectangular integer matrix.
train_matrix = sequence.pad_sequences(sequences=train_sequence, maxlen=max_len)
train_matrix

array([[    0,     0,     0, ...,    10,     2,  1330],
       [    0,     0,     0, ...,    71,   213,   164],
       [    0,     0,     0, ...,   289,     8,  1672],
       ...,
       [    0,     0,     0, ..., 31819, 31820, 31821],
       [    0,     0,     0, ...,  2308,   214,  6564],
       [    0,     0,     0, ...,    30,     5,   252]], dtype=int32)

**TEST DATA PRE-PROCESSING**

In [None]:
# Encode the held-out tweets with the tokenizer fitted on the training data,
# then pad them to the same length as the training matrix.
test_sequence = tok.texts_to_sequences(texts=xtest)
test_matrix = sequence.pad_sequences(sequences=test_sequence, maxlen=max_len)
test_matrix

array([[   0,    0,    0, ...,  116,  141, 1175],
       [   0,    0,    0, ..., 5139, 5139, 3641],
       [   0,    0,    0, ...,  877,   59,  996],
       ...,
       [   0,    0,    0, ...,  237, 2563,  236],
       [   0,    0,    0, ..., 1394,  328, 1044],
       [   0,    0,    0, ...,   17,   23, 3774]], dtype=int32)

**Bidirectional LSTM**

In [None]:
# Network: GloVe-initialised embedding -> bidirectional LSTM -> small dense
# head -> single sigmoid unit for the binary label.
model = Sequential([
    # mask_zero=True makes downstream layers skip the zero padding positions.
    Embedding(
        vocab_len + 1,
        200,
        weights=[embedding_matrix],
        input_length=max_len,
        mask_zero=True,
    ),
    Bidirectional(LSTM(16)),
    Dense(8, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 200)           6364400   
_________________________________________________________________
bidirectional (Bidirectional (None, 32)                27776     
_________________________________________________________________
dense (Dense)                (None, 8)                 264       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 6,392,449
Trainable params: 6,392,449
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
)

In [None]:
# Train on the padded training sequences for 10 passes over the data.
model.fit(x=train_matrix, y=df3["label"], epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efd0035bed0>

In [None]:
# Predicted probabilities (sigmoid outputs) for the held-out tweets.
ypred = model.predict(x=test_matrix)

In [None]:
# Turn probabilities into hard 0/1 labels using a 0.5 decision threshold.
ypred = (ypred >= 0.5).astype(int)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      8940
           1       0.79      0.64      0.71       649

    accuracy                           0.96      9589
   macro avg       0.88      0.81      0.84      9589
weighted avg       0.96      0.96      0.96      9589

