In [126]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# library to visualize text data
from wordcloud import WordCloud
#a collection of words that don’t provide any meaning to a sentence
from nltk.corpus import stopwords
#used to convert different forms of words into a single item but still keeping the context intact.
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Conv1D, Input, Layer, Dense, Activation, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#used to convert different forms of words into a single item but still keeping the context intact.
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

nlp= spacy.load('en_core_web_sm')

In [127]:
# load Dataset

df= pd.read_csv(r"D:\Datasets\Twitter Tweets\training.1600000.processed.noemoticon.csv", encoding='latin-1')

In [128]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [129]:
# Split the data in X and Y

X= df[df.columns[5]]
y= df[df.columns[0]]

In [130]:
y.tail()

19994    4
19995    4
19996    4
19997    4
19998    4
Name: 0, dtype: int64

In [131]:
X.head()

0    is upset that he can't update his Facebook by ...
1    @Kenichan I dived many times for the ball. Man...
2      my whole body feels itchy and like its on fire 
3    @nationwideclass no, it's not behaving at all....
4                        @Kwesidei not the whole crew 
Name: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D, dtype: object

In [132]:
X.shape

(19999,)

In [133]:
# split trhe dataset into train and test data
trainset1x, trainset2x, trainset1y, trainset2y = train_test_split(X, y, test_size= 0.02, random_state= 42)

In [134]:
(trainset1x.shape,  trainset1y.shape),(trainset2x.shape, trainset2y.shape)

(((19599,), (19599,)), ((400,), (400,)))

In [135]:
# Here we will be using the smaller train set for preprocessing and modeling

In [136]:
# get dummies for y of train set 2 -- this is done to use it in rnn netweork
trainset2y=pd.get_dummies(trainset2y)

In [137]:
# Data Preprocessing 1st way
def data_preprocess_1st_way(doc):
    
    corpus= []
    doc= nlp(doc)
    
    # regex pattern
    pattern= r'[^a-zA-Z0-9\s]'  # keep letters, digit and whitespace
    
    for word in doc:
        
        # remove special characters using regex sub()
        clean_word= re.sub(pattern, '', word)
        
        # convert to lower case
        clean_word= word.lower()
        
        # two ways of doing tokenization
        clean_word= word.split()
        
        # perform lemmetazation and remove stop words
        lemma= WordNetLemmatizer()
        clean_word= [ lemma.lemmatize(i) for i in clean_word if i not in set(stopwords.words('english'))]
        
        # join the words to form corpus
        corpus.append(' '.join(str(x) for i in clean_word))
       
    return corpus

In [138]:
# Load glove model to convert word into vector

def loadGloveModel(gloveFile):
    print('Loading glove model')
    f= open(gloveFile, 'r', encoding='utf-8')
    print(f)
    model= {}
    
    for line in f:
        splitLine= line.split()
        word= splitLine[0]
        embedding= [float(val) for val in splitLine[1:]]
        model[word]= embedding
        
    print("Done.",len(model)," words loaded!")
        
    return model

In [139]:
# save the glove model
model= loadGloveModel(r"D:\Datasets\glove.6B\glove.6B.300d.txt")

Loading glove model
<_io.TextIOWrapper name='D:\\Datasets\\glove.6B\\glove.6B.300d.txt' mode='r' encoding='utf-8'>
Done. 400000  words loaded!


In [140]:
len(model['the'])

300

In [141]:
len(model.keys())

400000

In [142]:
# vectorize the sentence
def sent_Vectorize(sentence, model):
    sent_vect= np.zeros(300)
    numw= 0
    for word in sentence.split():
        try:
            sent_vect= np.add(sent_vect, model[str(word)])
            numw+=1
            
        except:
            pass
    
    return sent_vect
        

In [143]:
# Data Preprocessing 2nd way
def data_preprocess_2nd_way(document):
    
    corpus= []
    
    for word in document:
        
        # convert to lower case
        clean_word= word.lower()
        
        # lemmatize the word
        lemma= WordNetLemmatizer()
        clean_word= lemma.lemmatize(clean_word)
        clean_word= str(clean_word)
        corpus.append(sent_Vectorize(clean_word, model))
    
    # getting input and output in proper sequence
    cleanVector= np.array(corpus)
    cleanVector= cleanVector.reshape(len(cleanVector), 300, 1)
        
    # tokenize the word
    tokenizer= Tokenizer(num_words= 16000)
    tokenizer.fit_on_texts(document)
    sequences= tokenizer.texts_to_sequences(document)

    word_index= tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    padded_data= pad_sequences(sequences, maxlen= 15, padding= 'post')
    print(padded_data.shape)
    
    # reshape the data and prepare to train
    data= padded_data.reshape(len(cleanVector), 15, 1)
    
    
    return data, tokenizer, word_index

In [144]:
# We will be using 2nd way for preprocessing
data, tokenizer, word_index= data_preprocess_2nd_way(trainset2x)

Found 1873 unique tokens.
(400, 15)


In [145]:
# spli the data into train and test
trainx, validx, trainy, validy = train_test_split(data, trainset2y, test_size=0.3,random_state=42 )


In [146]:
# calculate the number of words
nb_words= len(tokenizer.word_index)+1
nb_words

1874

In [147]:
# obtain the embedding matrix for embedding layer
embedding_matrix= np.zeros((nb_words, 300))

for word, i in word_index.items():
    embedding_vector= model.get(word)
    
#     print(len(embedding_vector))
    if embedding_vector is not None:
        embedding_matrix[i]= embedding_vector
    
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 346


In [148]:
embedding_matrix.shape

(1874, 300)

In [149]:
trainy=np.array(trainy)
validy=np.array(validy)

In [157]:
# building a simple RNN model

def modelBuild():
    
    model= keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape= (15,1)))
    keras.layers.Embedding(nb_words, 15, input_length= 15, weights= [embedding_matrix], trainable= False)
    
    model.add(keras.layers.SimpleRNN(units= 100, activation= 'relu', use_bias=True))
    model.add(Dense(units=1000, input_dim=2000, activation= 'relu'))
    model.add(keras.layers.Dense(units=500, input_dim=1000, activation='relu'))
    model.add(keras.layers.Dense(units=2, input_dim=500,activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [160]:
#compiling the model
finalmodel = modelBuild()
finalmodel.fit(trainx, trainy, epochs=50, batch_size=120,validation_data=(validx,validy))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1ea18e99550>