# Importing libraries

In [None]:
import zipfile
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop


In [None]:
# Fetch the NLTK stop-word corpus used later by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the WordNet corpus that WordNetLemmatizer needs for lemmatization.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocessing the dataset

In [None]:
# Load the tweet CSV. The file is read as having no header row (header=None),
# so its six columns get the placeholder names '1'..'6'; the label and text
# columns are renamed in a later cell.
data= pd.read_csv('training.1600000.processed.noemoticon.csv',  encoding='ISO-8859-1', names=['1','2','3','4','5', '6'], header =None )

In [None]:
# Preview the first rows of the raw frame to check that the columns parsed as expected.
data.head()

Unnamed: 0,1,2,3,4,5,6
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Report how many rows were loaded.
print(f'The number of records in the dataset is {len(data)} record.')

The number of records in the dataset is 1600000 record.


In [None]:
# Check for null values: per-column null counts summed into one grand total
# (0 means no cell in the frame is missing).
data.isnull().sum().sum() 

0

In [None]:
# Give the label column ('1') and the tweet-text column ('6') descriptive names;
# the remaining placeholder-named columns are dropped in a later cell.
data = data.rename(columns = {'1': 'targ', '6': 'texts'})

In [None]:
# Recode the positive label from 4 to 1 so the target is binary {0, 1}.
# A single .loc call with a boolean row mask assigns directly into `data`;
# the chained form data['targ'][mask] = 1 writes through an intermediate
# object and raises SettingWithCopyWarning (and may not update `data` at all
# under pandas copy-on-write).
data.loc[data['targ'] == 4, 'targ'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Separate the rows by label value: targ == 1 goes to data_pos, targ == 0 to data_neg.
data_pos = data.loc[data['targ'] == 1]
data_neg = data.loc[data['targ'] == 0]

In [None]:
# Keep only the first 50,000 rows of each class (positional, not a random sample).
data_pos = data_pos.head(50000)
data_neg = data_neg.head(50000)

In [None]:
# Stack the two class subsets into one frame (positives first, then negatives).
# ignore_index is not set, so each row keeps its original index label.
data = pd.concat([data_pos, data_neg])

In [None]:
# Display the combined, class-balanced frame.
data

Unnamed: 0,targ,2,3,4,5,texts
800000,1,1467822272,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,ersle,I LOVE @Health4UandPets u guys r the best!!
800001,1,1467822273,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,becca210,im meeting up with one of my besties tonight! ...
800002,1,1467822283,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,Wingman29,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,1,1467822287,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,katarinka,Being sick can be really cheap when it hurts t...
800004,1,1467822293,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,_EmilyYoung,@LovesBrooklyn2 he has that effect on everyone
...,...,...,...,...,...,...
49995,0,1678337109,Sat May 02 06:22:31 PDT 2009,NO_QUERY,Jessica_567,@mileycyrus so i have the same insomnia prob a...
49996,0,1678337116,Sat May 02 06:22:31 PDT 2009,NO_QUERY,whouwit077,20 mintues late for my meeting starting @ 8 h...
49997,0,1678337128,Sat May 02 06:22:31 PDT 2009,NO_QUERY,AmyPR,@kentucky_derby super excited! Are you tweetin...
49998,0,1678337159,Sat May 02 06:22:32 PDT 2009,NO_QUERY,cynthia_sue03,I WANT ANOTHER DAY OFF!!!! To much Sh#t to do...


In [None]:
# Drop the four placeholder-named columns, leaving only 'targ' and 'texts'.
# drop() returns a new frame, so `data` itself is left unchanged.
data_nw = data.drop(columns = ['2','3','4','5'])

data_nw.head()

Unnamed: 0,targ,texts
800000,1,I LOVE @Health4UandPets u guys r the best!!
800001,1,im meeting up with one of my besties tonight! ...
800002,1,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,1,Being sick can be really cheap when it hurts t...
800004,1,@LovesBrooklyn2 he has that effect on everyone


# Text preprocessing

In [None]:
# Load NLTK's English stop-word list and display it.
list_stp_wrds = stopwords.words('english')
list_stp_wrds

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Remove negation-style entries (those ending in "n't", "dn" or "sn") from the
# stop-word list -- presumably so that negations survive stop-word removal and
# stay available as a sentiment signal.
#
# A new list is built instead of calling .remove() inside a loop over the same
# list: removing while iterating shifts the remaining items left, so the entry
# right after each removed one is never examined (e.g. "didn't" survived
# because it directly follows 'didn').
list_stp_wrds = [
    stpwrd for stpwrd in list_stp_wrds
    if not stpwrd.endswith(("n't", "dn", "sn"))
]
list_stp_wrds      #updated

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
from nltk.tokenize import word_tokenize


In [None]:
# Fetch the Punkt tokenizer models that word_tokenize depends on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def Preprocessing(tw):
    """Clean one tweet and return its remaining lemmas as a single string.

    Steps, in order: remove URLs, remove @mentions, replace known emoticons
    with a descriptive word, blank out every non-alphanumeric character,
    tokenize, lower-case, drop stop words and one-character tokens, lemmatize,
    and join the survivors with single spaces.

    Parameters
    ----------
    tw : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated, lower-cased lemmas; '' when nothing survives cleaning.

    Notes
    -----
    Reads the module-level ``list_stp_wrds`` and uses ``word_tokenize`` and
    ``WordNetLemmatizer``, so the NLTK 'punkt' and 'wordnet' resources must be
    available. Tokens are lower-cased so that capitalised stop words such as
    'Being' or 'To' are filtered too.
    """
    emoji = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
             ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
             ':-@': 'shocked', ':@': 'shocked', ':-$': 'confused', ':\\': 'annoyed',
             ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
             '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
             '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
             ';-)': 'wink', 'O:-)': 'angel', 'O*-)': 'angel', '(:-D': 'gossip', '=^.^=': 'cat'}

    links = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    mentions = r"@[^\s]+"
    nonalpha = '[^a-zA-Z0-9]'

    tw = re.sub(links, '', tw)
    tw = re.sub(mentions, '', tw)

    # Replace the longest emoticons first; otherwise 'O:-)' is consumed by the
    # shorter ':-)' and becomes 'Osmile' instead of 'angel'. The replacement is
    # padded with spaces so it never fuses with a neighbouring word
    # ('good:)' -> 'good smile', not 'goodsmile').
    for em in sorted(emoji, key=len, reverse=True):
        tw = tw.replace(em, ' ' + emoji[em] + ' ')

    tw = re.sub(nonalpha, " ", tw)

    # Nothing left to tokenize (tweet was only links, mentions or punctuation).
    if not tw.strip():
        return ''

    stop_words = set(list_stp_wrds)   # set membership instead of a list scan per token
    lemmatizer = WordNetLemmatizer()  # one instance per call, not one per token

    lemmas = []
    for wrd in word_tokenize(tw):
        wrd = wrd.lower()  # the stop-word list is lower-case
        if len(wrd) > 1 and wrd not in stop_words:
            lemmas.append(lemmatizer.lemmatize(wrd))

    return ' '.join(lemmas)

In [None]:
# Run Preprocessing() on every tweet, overwriting the 'texts' column of data_nw
# with the cleaned strings, then display the cleaned column.
data_nw['texts'] = data_nw['texts'].apply(Preprocessing)
data_nw['texts']

800000                                        LOVE guy best
800001    im meeting one besties tonight Cant wait GIRL ...
800002    Thanks Twitter add Sunisa got meet HIN show DC...
800003    Being sick really cheap hurt much eat real foo...
800004                                      effect everyone
                                ...                        
49995     insomnia prob slept hr woke 5am nd couldnt go ...
49996     20 mintues late meeting starting know going la...
49997     super excited Are tweeting event happening Onl...
49998     WANT ANOTHER DAY OFF To much Sh today Got quot...
49999                                  jacked umbrella cake
Name: texts, Length: 100000, dtype: object

In [None]:
# Model inputs and labels. These must come from `data_nw`, the frame whose
# 'texts' column was cleaned by Preprocessing(); `data` still holds the raw
# tweets, so reading from it silently bypasses all of the text preprocessing.
X = data_nw['texts']
y = data_nw['targ']

# Preparing the input features for training

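We convert each tweet into a sequence of integer word indices.
The tokenizer is limited to the most frequent words in the corpus (`num_words=1000`), and every sequence is padded or truncated to a fixed length of 500 tokens (`max_len=500`). Note that 500 is the sequence length, not a number of selected features: words are kept by frequency, not by how well they distinguish positive tweets from negative tweets.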

In [None]:
# Fixed length that every tweet's index sequence is padded/truncated to.
max_len = 500
# Tokenizer capped via num_words=1000 (per the Keras API this restricts the
# index sequences to the most frequent words seen during fitting).
tok = Tokenizer(num_words=1000)
# Learn the word -> index mapping from the tweets in X.
tok.fit_on_texts(X)
# Turn each tweet into a list of integer word indices.
sequences = tok.texts_to_sequences(X)
# Pad/truncate every list to max_len, giving one 2-D array with a row per tweet.
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:
# Check the padded matrix dimensions: (number of tweets, max_len).
sequences_matrix.shape

(100000, 500)

In total there are 100000 tweets, each represented as a padded sequence of 500 word indices.

In [None]:
from sklearn.model_selection import train_test_split


Splitting the data: 70% for training and 30% for testing.

In [None]:
# Hold out 30% of the rows as the test set; random_state=2 fixes the split so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.3, random_state=2)

In [None]:
def tensorflow_based_model(vocab_size=2000, embedding_dim=50, sequence_length=None):
    """Build (without compiling) the Embedding -> LSTM -> Dense binary classifier.

    Parameters
    ----------
    vocab_size : int, default 2000
        Number of rows in the embedding table. Must be greater than the largest
        word index that appears in the input sequences.
    embedding_dim : int, default 50
        Width of each word-embedding vector.
    sequence_length : int or None, default None
        Length of every padded input sequence. When None, the module-level
        ``max_len`` is used, matching the original zero-argument behaviour.

    Returns
    -------
    tensorflow.keras.models.Model
        Uncompiled model mapping a batch of index sequences to one sigmoid
        output per sequence.
    """
    if sequence_length is None:
        sequence_length = max_len

    # Integer word-index sequences of fixed length.
    inputs = Input(name='inputs', shape=(sequence_length,))
    # Learned word embeddings. The sequence length is already fixed by the
    # Input layer, so the deprecated `input_length` argument is not passed.
    layer = Embedding(vocab_size, embedding_dim)(inputs)
    # The LSTM consumes the embedded sequence and emits one 64-d vector per tweet.
    layer = LSTM(64)(layer)
    # Fully connected block: 256 units, ReLU, then dropout at rate 0.5.
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    # Single output unit with sigmoid activation for the positive/negative decision.
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

In [None]:
# Build the network with the function defined above.
model = tensorflow_based_model()
# Configure training: binary cross-entropy loss, RMSprop with its default
# settings, and accuracy reported as the metric.
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])  

In [None]:
# Train for 10 epochs in mini-batches of 100; 10% of the training rows are set
# aside for validation (validation_split=0.1). The returned History object is
# kept in `history`.
history=model.fit(X_train,Y_train,batch_size=100,epochs=10, validation_split=0.1)
print('Training finished !!')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training finished !!


In [None]:
# Evaluate on the held-out test split. The model was compiled with
# metrics=['accuracy'], and the next cell reads accr1[1] as the accuracy.
accr1 = model.evaluate(X_test,Y_test)




In [None]:
# Report test-set accuracy (second entry of the evaluate() result), rounded to two decimals.
print(f'Test set\n  Accuracy: {accr1[1]:0.2f}')

Test set
  Accuracy: 0.77
