In [None]:
# Mount Google Drive into the Colab runtime at the relative path "TwitterSupport"
from google.colab import drive
drive.mount("TwitterSupport")

Mounted at TwitterSupport


In [None]:
# importing required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import string

In [None]:
# Project root inside the mounted Drive; the dataset, saved model and tokenizer paths are all built from it
ROOT_DIR="TwitterSupport/MyDrive/TwitterSupport/"

In [None]:
# Load the tweets CSV from the project's dataset folder into a DataFrame
data = pd.read_csv(ROOT_DIR+'dataset/Tweets.csv')

In [None]:
# Preview the first rows of the raw dataset
data.head()


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# Count the null entries in each column
print("data_is_null \n",data.isnull().sum())

data_is_null 
 tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64


In [None]:
 # Keep only the necessary columns: the tweet text and its sentiment label
data = data[['text','airline_sentiment']]

In [None]:
# Peek at the raw tweet texts before any cleaning
data['text'].values

array(['@VirginAmerica What @dhepburn said.',
       "@VirginAmerica plus you've added commercials to the experience... tacky.",
       "@VirginAmerica I didn't today... Must mean I need to take another trip!",
       ...,
       '@AmericanAir Please bring American Airlines to #BlackBerry10',
       "@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??",
       '@AmericanAir we have 8 ppl so we need 2 know how many seats are on the next flight. Plz put us on standby for 4 people on the next flight?'],
      dtype=object)

In [None]:
# Summary (count / unique / top / freq) of the two kept columns
data.describe()

Unnamed: 0,text,airline_sentiment
count,14640,14640
unique,14427,3
top,@united thanks,negative
freq,6,9178


In [None]:
# Drop the neutral tweets so only positive and negative ones remain.
# .copy() makes the result an independent frame, so the later column assignment does not write into a slice of the original.
data = data[data.airline_sentiment != "neutral"].copy()


In [None]:
# Same summary after the neutral tweets have been removed
data.describe()

Unnamed: 0,text,airline_sentiment
count,11541,11541
unique,11381,2
top,@AmericanAir thanks,negative
freq,5,9178


In [None]:
import nltk
# nltk.download('stopwords')
from nltk import PorterStemmer

In [None]:
# Cleaning Text

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Load NLTK's English stop-word list once and return it as a frozenset."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def clean_text(txt):
    """
    Normalise one tweet into the space-separated stemmed form fed to the tokenizer.

    Steps, in order: lower-case, rewrite "nt" to "not", drop @mentions, replace
    non-word characters with spaces, strip punctuation, drop isolated single
    letters, collapse whitespace, drop tokens starting with "http", remove
    English stop words, Porter-stem what is left, and finally delete digits.

    The stop-word set is cached and a single stemmer is reused for the whole
    tweet: re-reading the stop-word list and constructing a stemmer for every
    word is loop-invariant work.

    Args:
        txt: raw tweet text (str).

    Returns:
        The cleaned text (str). It may contain double spaces where a
        digits-only token was removed.
    """
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()

    txt = txt.lower()
    # NOTE(review): this rewrites every "nt" that follows a word character, not
    # only contractions, e.g. "different" -> "differenot". It also never matches
    # "n't", because the apostrophe sits between the two letters. Kept as is so
    # the text stays identical wherever this cleaning is applied; fix every copy
    # of this routine and retrain in one go.
    txt = re.sub(r"(?<=\w)nt", "not", txt)
    txt = re.sub(r"(@\S+)", "", txt)  # remove @mentions (hashtags only lose their '#' in the next step)
    txt = re.sub(r'\W', ' ', txt)  # replace every non-word character, apostrophes included, with a space
    txt = txt.translate(str.maketrans('', '', string.punctuation))  # removes what \W keeps, i.e. underscores
    txt = re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)  # drop isolated single letters (it's -> "it s" -> "it")
    txt = re.sub(r'\s+', ' ', txt)  # collapse runs of whitespace into one space
    # NOTE(review): punctuation is already gone at this point, so only the token
    # starting with "http" is removed; the rest of a URL survives as separate words.
    txt = re.sub(r"(http\S+|http)", "", txt)
    # stem & remove stop words
    txt = ' '.join(stemmer.stem(word=word) for word in txt.split(" ") if word not in stop_words)
    # remove digits; a token made only of digits leaves a double space behind
    txt = ''.join(ch for ch in txt if not ch.isdigit()).strip()
    return txt

In [None]:
# Clean every tweet with clean_text, overwriting the 'text' column.
# Re-running this cell cleans the already-cleaned text again.
data['text'] = data['text'].apply(clean_text)

In [None]:
# Peek at the cleaned tweet texts
data['text'].values

array(['plu ad commerci experi tacki',
       'realli aggress blast obnoxi enotertainmenot guest face amp littl recours',
       'realli big bad thing', ..., 'thank got differenot flight chicago',
       'leav  minut late flight warn commun unotil  minut late flight call shitti custom svc',
       'money chang flight answer phone suggest make commitmenot'],
      dtype=object)

In [None]:
# Turn each cleaned tweet into a sequence of integer word ids.
# Only the `max_fatures` most frequent words get an id.
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')

cleaned_tweets = data['text'].values
tokenizer.fit_on_texts(cleaned_tweets)

# Left-pad with zeros so that every row has the length of the longest sequence
X = pad_sequences(tokenizer.texts_to_sequences(cleaned_tweets))

In [None]:
# (number of tweets, padded sequence length)
X.shape

(11541, 21)

In [None]:
import time

# Network hyper-parameters
embed_dim = 128
lstm_out = 196
input_len = 21

# Embedding -> spatial dropout -> LSTM -> 2-way softmax.
# The input length is taken from the padded training matrix.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length = X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])




In [None]:
# The labels are one-hot over two classes and the output layer is a 2-unit softmax,
# so categorical cross-entropy is the matching loss. It is also the loss the
# BinaryInference class compiles its copy of the network with.
start=time.time()
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print("Time to compile model:",time.time()-start)


Time to compile model: 0.01389765739440918


In [None]:
# summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None" line
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 21, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 21, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# One-hot encode the sentiment label, one column per class
sentiment_dummies = pd.get_dummies(data['airline_sentiment'])
Y = sentiment_dummies.values

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)

for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(9809, 21) (9809, 2)
(1732, 21) (1732, 2)


In [None]:
from tqdm import tqdm
batch_size = 32
# model.fit only returns once training has finished and hands back a History object.
# Wrapping that in tqdm gives it nothing to iterate over (it just prints an empty "0it" bar),
# so keep the history for later inspection instead. verbose=2 already logs one line per epoch.
history = model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
307/307 - 54s - loss: 0.3280 - accuracy: 0.8669
Epoch 2/7
307/307 - 28s - loss: 0.1883 - accuracy: 0.9236
Epoch 3/7
307/307 - 29s - loss: 0.1541 - accuracy: 0.9384
Epoch 4/7
307/307 - 29s - loss: 0.1348 - accuracy: 0.9476
Epoch 5/7
307/307 - 29s - loss: 0.1168 - accuracy: 0.9534
Epoch 6/7
307/307 - 28s - loss: 0.1021 - accuracy: 0.9594
Epoch 7/7
307/307 - 29s - loss: 0.0905 - accuracy: 0.9622


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Keep the last `validation_size` test rows for the per-class check in the next cell.
# These rows are a subset of the test split, not a separately held-out validation set.
validation_size = 1500
tail = slice(-validation_size, None)
X_validate, Y_validate = X_test[tail], Y_test[tail]

# Overall loss and accuracy on the full test split
score, acc = model.evaluate(X_test, Y_test, batch_size = batch_size, verbose = 2)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

55/55 - 2s - loss: 0.2771 - accuracy: 0.9059
score: 0.28
acc: 0.91


In [None]:
# Per-class hit counts on the validation slice.
# All rows are predicted in one batched call instead of one model.predict call per row.
pred_labels = np.argmax(model.predict(X_validate, verbose = 0), axis=1)
true_labels = np.argmax(Y_validate, axis=1)

# Label index 0 is counted as negative, anything else as positive
is_neg = true_labels == 0
is_hit = pred_labels == true_labels

neg_cnt = int(is_neg.sum())
pos_cnt = int((~is_neg).sum())
neg_correct = int((is_hit & is_neg).sum())
pos_correct = int((is_hit & ~is_neg).sum())

In [None]:
# Share of correctly predicted rows within each true class, as a percentage
print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 73.75886524822694 %
neg_acc 94.08866995073892 %


In [None]:
twt = ['Meetings: ram is a bad man.']
# Clean the tweet the same way as the training text, otherwise unstemmed words
# miss the tokenizer's vocabulary
twt = [clean_text(t) for t in twt]
# Vectorize the tweet with the already-fitted tokenizer
twt = tokenizer.texts_to_sequences(twt)
# Pad to the sequence length the model was trained on (the width of X), not a hard-coded value
twt = pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 0)[0]
# Output index 0 is the negative class, index 1 the positive class
if np.argmax(sentiment) == 0:
    print("negative")
else:
    print("positive")

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0 459 113 754]]
negative


In [None]:
import pickle

# Save the trained model as an HDF5 file under the project root
model.save(ROOT_DIR+"binaryClassificationModel.h5")

# Save the fitted tokenizer next to it, so inference can reuse the same word-to-id mapping
with open(ROOT_DIR+'tokenizerBinaryClassification.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle
import tensorflow.keras.models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from keras.layers.recurrent import LSTM
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#Text Preprocessing
import re
from nltk.corpus import stopwords

class BinaryInference:
    """Classify a tweet as a complaint (negative) or not, using the saved tokenizer and LSTM weights.

    The constructor loads both artifacts from ROOT_DIR. `predict_complaint`
    then cleans, tokenizes and pads a tweet and runs it through the network.
    """

    # Architecture hyper-parameters. They must match the network whose weights
    # are loaded, and INPUT_LEN is also the length every tweet is padded to.
    MAX_FEATURES = 2000
    EMBED_DIM = 128
    LSTM_OUT = 196
    INPUT_LEN = 21

    def __init__(self):
        self.load_models()

    def get_model(self):
        """Build and compile an untrained network with the training architecture.

        Returns:
            A compiled Sequential model: Embedding -> SpatialDropout1D -> LSTM -> Dense(2, softmax).
        """
        model = Sequential()
        model.add(Embedding(self.MAX_FEATURES, self.EMBED_DIM, input_length=self.INPUT_LEN))
        model.add(SpatialDropout1D(0.4))
        model.add(LSTM(self.LSTM_OUT, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def clean_text(self, txt):
        """Clean a tweet exactly as the training text was cleaned.

        Delegates to the notebook-level `clean_text`, so training and inference
        share one cleaning routine and cannot drift apart.

        Args:
            txt: raw tweet text (str).

        Returns:
            The cleaned text (str).
        """
        return clean_text(txt)

    def load_models(self):
        """Load the pickled tokenizer and the saved weights into `self.tokenizer` and `self.model`."""
        # pickle can execute arbitrary code while loading: only open a tokenizer
        # file that was written by this notebook.
        with open(ROOT_DIR+'tokenizerBinaryClassification.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        self.model = self.get_model()
        self.model.load_weights(ROOT_DIR+"binaryClassificationModel.h5")

    def predict_complaint(self, text):
        """Predict whether a tweet is a complaint.

        Prints the predicted label ("negative" or "positive") as a side effect.

        Args:
            text: raw tweet text (str).

        Returns:
            True if the negative class (index 0) has the highest score, False otherwise.
        """
        # Vectorize the cleaned tweet with the pre-fitted tokenizer
        cleaned = self.clean_text(text)
        sequences = self.tokenizer.texts_to_sequences([cleaned])
        # Pad to the length the network was built with, so the input really has
        # the shape the embedding layer expects
        padded = pad_sequences(sequences, maxlen=self.INPUT_LEN, dtype='int32', value=0)
        scores = self.model.predict(padded, batch_size=1, verbose=0)[0]

        is_negative = int(np.argmax(scores)) == 0
        print("negative" if is_negative else "positive")
        return is_negative


In [None]:
# Build the inference helper; its constructor loads the saved tokenizer and model weights
bi = BinaryInference()

In [None]:
# Classify a sample tweet; the method prints the label and returns True when it is negative
bi.predict_complaint("americanair leaving over 20 minutes late flight no warnings or communication until we were 15 minutes late flight thats called shitty customer svc")

negative


True