<a href="https://colab.research.google.com/github/Nithesh-b/Twitter_Sentiment/blob/post-viva/CustomerAnalytics_LSTMusingKeras_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



###  Binary Classification
## Use Keras to vectorize the clean tweets and apply an LSTM

#### 1. We will focus on the following 3 columns
   ###### a) Tweet id
   ###### b) Clean-tweet
   ###### c) class (derived from TextBlob)
   

### Script flow 
##### Step-1
###### ***Read the data from the folder (Tweet id, Clean-tweet and class (derived from TextBlob))
###### ***Split the data into Train and Test
###### ***Write the Train and Test data into separate csv files

##### Step-2
###### ***Read the train and test csv files
###### ***Vectorize the clean tweet text of both the train and test files using the Keras Tokenizer
###### ***Build the LSTM model on train and apply it on test




In [28]:


import numpy as np 
import pandas as pd 

#import re
#from termcolor import colored
#import nltk 
#import matplotlib.pyplot as plt
#%matplotlib inline
# Folder holding the cleaned-tweet CSV read below; the train/test CSVs are written here too


dataFolder = "/content/sample_data/outputs"


In [29]:
# Load the pre-processed tweets (id, cleaned text and class label) from the data folder
cleaned_tweets_path = dataFolder + "/Final_PreProcessing_Group33_Cleaned_Tweets.csv"
tweet_df = pd.read_csv(cleaned_tweets_path)

In [30]:
# Preview the first five rows to confirm which columns were loaded
tweet_df.head(5)

Unnamed: 0,tweet_id,Tweet,class
0,neu-GG-Tweet-11942,ahh repli random follow do not how sad haha,1
1,neu-GG-Tweet-11941,awwww did not get hero,0
2,neu-GG-Tweet-11940,oh realli what bummer,1
3,neu-GG-Tweet-11937,wtf whyd flag oop shitti mous click star,1
4,neu-GG-Tweet-11930,stick intel drive one given decent review anan...,1


In [31]:
# Class balance of the full data set (number of tweets per label)
class_counts = tweet_df['class'].value_counts()
print(class_counts)

1    2801
0    2623
Name: class, dtype: int64


In [32]:
# Stratified 80/20 train/test split of the tweet text, class label and tweet id
from sklearn.model_selection import train_test_split

# The three columns are split in one call so text, label and id stay row-aligned;
# the fixed random_state makes the split repeatable and stratify keeps the class ratio.
labels = tweet_df['class']
split_parts = train_test_split(
    tweet_df['Tweet'],
    labels,
    tweet_df['tweet_id'],
    test_size=0.20,
    random_state=100,
    stratify=labels,
)
X_train, X_test, y_train, y_test, tweet_id_train, tweet_id_test = split_parts


    


## Store the Train and Test data as csv files so that they can be reused by different ML models

In [33]:
# Bundle the training ids, tweet text and labels into one data frame
train_columns = {
    'tweet_id': tweet_id_train,
    'Tweet': X_train,
    'class': y_train,
}
train_dataset = pd.DataFrame(train_columns)

# Class balance of the training split
train_class_counts = train_dataset['class'].value_counts()
print(train_class_counts)



1    2241
0    2098
Name: class, dtype: int64


In [34]:
# Bundle the test ids, tweet text and labels into one data frame
test_columns = {
    'tweet_id': tweet_id_test,
    'Tweet': X_test,
    'class': y_test,
}
test_dataset = pd.DataFrame(test_columns)

# Class balance of the test split
test_class_counts = test_dataset['class'].value_counts()
print(test_class_counts)





1    560
0    525
Name: class, dtype: int64


In [35]:
# Write both splits to the data folder; index=False leaves the row index out of the files
for split_frame, csv_name in (
    (train_dataset, "/LSTM_train_Data.csv"),
    (test_dataset, "/LSTM_test_Data.csv"),
):
    split_frame.to_csv(dataFolder + csv_name, index=False)


### Keras Library 

In [36]:
# Import Tensorflow
import os
import tensorflow
# Select TensorFlow as the Keras backend. This is set before `import keras` below,
# presumably because Keras reads KERAS_BACKEND at import time (confirm for the installed version).
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
from keras.models import Sequential
# Layers used by the LSTM classifier built further down
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

## Keras Tokenizer
###### There are many ways to convert text into numeric vectors, such as the count vectorizer and the TF-IDF vectorizer; Keras also provides a Tokenizer that serves the same purpose.

In [37]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import pandas as pd


In [38]:
# Folder the train/test CSVs are read from (same value as set at the top of the notebook)
dataFolder = "/content/sample_data/outputs"

# Read the two splits back from disk, train first and test second
train_data, test_data = (
    pd.read_csv(dataFolder + csv_name)
    for csv_name in ("/LSTM_train_Data.csv", "/LSTM_test_Data.csv")
)


In [39]:
# Confirm the reloaded training frame exposes the columns used below
train_data.columns

Index(['tweet_id', 'Tweet', 'class'], dtype='object')

In [40]:
# Tokenization: turn every tweet into a fixed-length sequence of integer word indices

# num_words caps the vocabulary to the most frequent words; tokens are split on spaces
tokenizer = Tokenizer(num_words=2500, split=' ')

# Cast to str so every entry is handed to the tokenizer as text
train_texts = train_data['Tweet'].astype(str).values
test_texts = test_data['Tweet'].astype(str).values

# fit_on_texts updates the tokenizer's internal vocabulary from a list of texts.
# It is fitted on the training tweets only.
tokenizer.fit_on_texts(train_texts)

# texts_to_sequences transforms each text into a sequence of integers: every word is
# replaced with its corresponding integer value from the word_index dictionary.
# https://stackoverflow.com/questions/51956000/what-does-keras-tokenizer-method-exactly-do
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# The longest training sequence fixes the common length for both splits
max_len = max(len(sequence) for sequence in train_sequences)

# Pad every tweet to max_len so all rows have the shape the embedding layer expects
# https://www.kaggle.com/shivam001/twitter-tweets-classification-rnn-keras
train_tweets = pad_sequences(train_sequences, maxlen=max_len)
test_tweets = pad_sequences(test_sequences, maxlen=max_len)


In [41]:
# Inspect the padded integer sequences produced for the training tweets
train_tweets

array([[   0,    0,    0, ...,  354,  289,  859],
       [   0,    0,    0, ...,    3,   34,  686],
       [   0,    0,    0, ...,    3,    8, 1368],
       ...,
       [   0,    0,    0, ...,  124,   31,   99],
       [   0,    0,    0, ...,   32,   10,  154],
       [   0,    0,    0, ...,   36,  226, 1754]], dtype=int32)

In [42]:
# Building the model

model = Sequential()
model.add(Embedding(2500, 128, input_length = train_tweets.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout = 0.2))
model.add(Dense(2, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 23, 128)           320000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 23, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 714,754
Trainable params: 714,754
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Training the model

# One-hot encode the 0/1 class labels for the two-unit softmax output.
# to_categorical with an explicit num_classes always returns a float array with exactly
# two columns, whereas pd.get_dummies derives both the columns and the dtype from the
# data it is given. Assumes the labels are the integers 0 and 1, as the class counts show.
train_labels = keras.utils.to_categorical(train_data['class'].values, num_classes=2)

# validation_split=0.2 holds out part of the training rows to monitor validation metrics
history = model.fit(
    train_tweets,
    train_labels,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [44]:
# Testing the model

# One-hot encode the 0/1 test labels with a fixed width of two columns so the target
# always matches the network's two-unit output, even if a class were missing from the
# test split (pd.get_dummies would then produce a single column).
# Assumes the labels are the integers 0 and 1.
test_labels = keras.utils.to_categorical(test_data['class'].values, num_classes=2)

# evaluate returns the loss followed by the compiled metrics (accuracy here)
score, accuracy = model.evaluate(test_tweets, test_labels, batch_size=128)
print("Test accuracy: {}".format(accuracy))

Test accuracy: 0.8903225660324097


# End of LSTM