In [1]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from keras.callbacks import ModelCheckpoint
import pickle
from sklearn.metrics import confusion_matrix,classification_report
import os


# Notebook-wide hyper-parameters.
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input size.
max_words = 2000
# Size of each embedding vector produced by the Embedding layer.
embed_dim = 128
# Number of units in the LSTM layer.
lstm_out = 196
# Mini-batch size used when fitting the model.
batch_size=32



In [2]:
class ReadData:
    """Loader for the airline tweet sentiment dataset."""

    def __init__(self):
        pass

    def read_airline(self, path='airline_sentiment_analysis.csv'):
        """Read the airline sentiment CSV and keep only the modelling columns.

        Parameters
        ----------
        path : str or path-like, optional
            Location of the CSV file. Defaults to the file name that was
            previously hard-coded, resolved against the working directory.

        Returns
        -------
        pandas.DataFrame
            Frame holding exactly the columns ``airline_sentiment`` and
            ``text``, in that order.

        Raises
        ------
        FileNotFoundError
            If ``path`` does not exist.
        KeyError
            If either required column is missing from the file.
        """
        data = pd.read_csv(path)
        return data[['airline_sentiment', 'text']]

In [3]:
class Encoding_sentiment():
    """Dummy-encode the ``airline_sentiment`` column of a data frame."""

    def __init__(self, data):
        # Frame that must contain an ``airline_sentiment`` column.
        self.data = data

    def encoding(self):
        """Return the dummy-encoded sentiment labels as a NumPy array.

        The first category level is dropped, so a two-class column yields a
        single indicator column.
        """
        sentiment = self.data['airline_sentiment']
        indicators = pd.get_dummies(sentiment, drop_first=True)
        return indicators.values

In [4]:
class Preprocessing():
    """Text-cleaning and tokenisation steps used to prepare tweets for the model.

    Each step works on the ``text`` argument it receives. The result is also
    stored on ``self.text`` so callers that read it back from the instance
    keep working.
    """

    def __init__(self):
        pass

    def get_lower_regex(self, text):
        """Lower-case every entry and strip all but letters, digits and whitespace.

        Parameters
        ----------
        text : pandas.Series of str
            Raw text entries.

        Returns
        -------
        pandas.Series
            Cleaned entries, same index as ``text``.
        """
        # Raw string avoids the invalid "\s" escape. "A-Z" replaces the
        # original "A-z" range, which also let [ \ ] ^ _ and ` slip through.
        self.text = text.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
        return self.text

    def remove_company_tag(self, text):
        """Drop the first whitespace-delimited token of every entry.

        Parameters
        ----------
        text : iterable of str
            Entries whose leading token (the airline handle) is removed.

        Returns
        -------
        pandas.Series
            Remaining text per entry, on a fresh default index. Entries that
            are empty or contain a single token become ``''`` instead of
            raising ``IndexError``.
        """
        remainder = []
        for entry in text:
            parts = entry.split(maxsplit=1)
            remainder.append(parts[1] if len(parts) > 1 else '')
        self.text = pd.Series(remainder)
        return self.text

    def tokenization_padding(self, text):
        """Fit a tokenizer on ``text``, persist it, and return padded sequences.

        Parameters
        ----------
        text : pandas.Series of str
            Cleaned entries to fit the tokenizer on.

        Returns
        -------
        numpy.ndarray
            Integer matrix produced by ``pad_sequences`` with its default
            settings.

        Side effects
        ------------
        Writes the fitted tokenizer to ``models/tokenizer.pickle``,
        overwriting any previous file.
        """
        tokenizer = Tokenizer(num_words=max_words, split=' ')
        tokenizer.fit_on_texts(text.values)
        # Create the output directory on demand so this step does not depend
        # on a separate setup cell having run first.
        os.makedirs('models', exist_ok=True)
        with open('models/tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        sequences = tokenizer.texts_to_sequences(text.values)
        self.text = pad_sequences(sequences)
        return self.text
        
        

In [5]:
class Train_Process(Preprocessing):
    """Preprocessing pipelines for the training set and for inference-time text."""

    def __init__(self):
        # Width of the padded training matrix. Set by dataset_train_preprocess
        # so inference inputs can be padded to the same length.
        self.max_len = None

    def dataset_train_preprocess(self, text):
        """Clean, tokenise and pad the training text.

        Parameters
        ----------
        text : pandas.DataFrame or pandas.Series
            Either a frame with a ``text`` column (what the notebook passes)
            or the text column itself. Previously the argument was ignored
            and the notebook-level ``data`` frame was read instead.

        Returns
        -------
        numpy.ndarray
            Padded integer sequences, one row per entry.
        """
        if isinstance(text, pd.DataFrame):
            text = text['text']
        self.text = text
        self.text = self.get_lower_regex(self.text)
        self.text = self.remove_company_tag(self.text)
        self.text = self.tokenization_padding(self.text)
        # Remember the training width for single_test_preprocess.
        self.max_len = self.text.shape[1]
        return self.text

    def single_test_preprocess(self, text):
        """Prepare inference text with the tokenizer fitted during training.

        Parameters
        ----------
        text : str or pandas.Series of str
            Text to classify. A bare string is wrapped into a one-element
            Series.

        Returns
        -------
        numpy.ndarray
            Padded integer sequences. They are padded to the training width
            when ``dataset_train_preprocess`` ran on this instance, otherwise
            to the longest sequence in ``text``.

        Raises
        ------
        FileNotFoundError
            If ``models/tokenizer.pickle`` has not been written yet.
        """
        if isinstance(text, str):
            text = pd.Series([text])
        self.text = text
        self.text = self.get_lower_regex(self.text)
        # NOTE(review): unlike the training path, the leading token is not
        # removed here - confirm whether inference text carries an airline tag.
        # Load the persisted tokenizer instead of fitting a new one. Re-fitting
        # on the inference text would produce word ids unrelated to the
        # trained embedding and would overwrite the saved tokenizer.
        # pickle.load runs arbitrary code, so only load a file written by
        # this notebook.
        with open('models/tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        sequences = tokenizer.texts_to_sequences(self.text.values)
        self.text = pad_sequences(sequences, maxlen=self.max_len)
        return self.text
        

In [16]:
class architecture():
    """Factory for the LSTM sentiment classifier."""

    def __init__(self):
        pass

    def lstm_model(self, input_length=None):
        """Build and compile the Embedding -> SpatialDropout1D -> LSTM -> Dense model.

        Parameters
        ----------
        input_length : int, optional
            Length of each padded input sequence. When omitted, it falls back
            to ``X.shape[1]`` from the notebook namespace, which is what the
            method always used before.

        Returns
        -------
        keras.models.Sequential
            Model compiled with binary cross-entropy, the Adam optimiser and
            an accuracy metric.

        Raises
        ------
        NameError
            If ``input_length`` is omitted and no global ``X`` exists.
        """
        if input_length is None:
            # Backward-compatible fallback to the padded training matrix.
            input_length = X.shape[1]
        model = Sequential()
        model.add(Embedding(max_words, embed_dim, input_length=input_length))
        model.add(SpatialDropout1D(0.4))
        model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
        # Single sigmoid unit: the labels are one binary indicator column.
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

In [7]:
class Train_Data_Split():
    """Split features and labels into a training part and a held-out part."""

    def __init__(self, text, outcome):
        # Feature matrix and matching labels; the two are split in parallel.
        self.features = text
        self.outcome = outcome

    def train_split(self, test_size=0.34, random_state=42):
        """Split the stored data into training and held-out subsets.

        Parameters
        ----------
        test_size : float or int, optional
            Forwarded to ``train_test_split``; defaults to the 0.34 that was
            previously hard-coded.
        random_state : int, optional
            Seed forwarded to ``train_test_split``; defaults to 42.

        Returns
        -------
        tuple
            ``(X_train, x_test, Y_train, y_test)``.
        """
        X_train, x_test, Y_train, y_test = train_test_split(
            self.features, self.outcome, test_size=test_size, random_state=random_state
        )
        return X_train, x_test, Y_train, y_test
    
class Valid_Data_Split(Train_Data_Split):
    """Halve the stored data into a test part and a validation part."""

    def valid_test_split(self):
        """Return ``(X_test, X_valid, Y_test, Y_valid)`` from a 50/50 split with seed 42."""
        halves = train_test_split(
            self.features,
            self.outcome,
            test_size=0.5,
            random_state=42,
        )
        X_test, X_valid, Y_test, Y_valid = halves
        return X_test, X_valid, Y_test, Y_valid
    
        

In [29]:
class Train_model:
    """Runs model training with best-checkpoint saving."""

    def __init__(self):
        pass

    def fit(self, model, train_data=None, valid_data=None, epochs=7):
        """Fit ``model`` and save the best weights to ``models/model.h5``.

        Parameters
        ----------
        model : compiled Keras model
            Model to train.
        train_data : tuple, optional
            ``(features, labels)`` for training. Falls back to the notebook
            globals ``X_train`` and ``Y_train`` when omitted, as before.
        valid_data : tuple, optional
            ``(features, labels)`` for validation. Falls back to the notebook
            globals ``X_valid`` and ``Y_valid`` when omitted.
        epochs : int, optional
            Number of training epochs; defaults to the 7 that was previously
            hard-coded.

        Returns
        -------
        The history object returned by ``model.fit``.
        """
        if train_data is None:
            train_data = (X_train, Y_train)
        if valid_data is None:
            valid_data = (X_valid, Y_valid)
        features, labels = train_data

        # The checkpoint callback cannot write into a missing directory.
        os.makedirs('models', exist_ok=True)
        # Keep only the weights with the highest validation accuracy.
        checkpt = ModelCheckpoint('models/model.h5', monitor='val_accuracy',
                                  save_best_only=True, mode='max', verbose=0)
        history = model.fit(features, labels,
                            validation_data=tuple(valid_data),
                            epochs=epochs, batch_size=batch_size,
                            verbose=0, callbacks=[checkpt])
        return history

In [9]:
class Testing:
    """Prints evaluation reports for a fitted binary classifier."""

    def __init__(self):
        pass

    def testing_metrics(self, fitted_model=None, features=None, labels=None):
        """Print the classification report and confusion matrix.

        Parameters
        ----------
        fitted_model : Keras model, optional
            Model to evaluate. Falls back to the notebook global ``model``.
        features : array-like, optional
            Evaluation inputs. Falls back to the notebook global ``X_test``.
        labels : array-like, optional
            True labels. Falls back to the notebook global ``Y_test``.

        Returns
        -------
        None
            The reports are printed, not returned.
        """
        if fitted_model is None:
            fitted_model = model
        if features is None:
            features = X_test
        if labels is None:
            labels = Y_test

        # Sequential.predict_classes was removed from Keras. For a single
        # sigmoid output, thresholding the probability at 0.5 is equivalent.
        probabilities = fitted_model.predict(features, batch_size=32, verbose=0)
        predict = (probabilities > 0.5).astype('int32')
        print(classification_report(labels, predict))
        print(confusion_matrix(labels, predict))

In [17]:
# exist_ok=True keeps this cell re-runnable; a bare makedirs raises
# FileExistsError once the directory has been created.
os.makedirs('models', exist_ok=True)
# Load the sentiment/text columns and preview the first rows.
data=ReadData().read_airline()
data.head()

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [18]:
# Dummy-encode the airline_sentiment column into the label array Y.
Y=Encoding_sentiment(data).encoding()

In [19]:
# Display the encoded labels (one indicator column).
Y

array([[1],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]], dtype=uint8)

In [20]:
# Create the preprocessing pipeline object used to build the model inputs.
Preprocess_object=Train_Process()

In [21]:
# Clean, tokenise and pad the tweet text; X is the padded integer matrix.
X=Preprocess_object.dataset_train_preprocess(data)
# Peek at the first padded sequence.
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,  552,  487, 1246,    1,    2,  166])

In [22]:
# Build and compile the LSTM classifier. X must already be defined: called
# without arguments, lstm_model takes the input sequence length from X.shape[1].
model=architecture().lstm_model()

In [23]:
# Print the layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 31, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 1)                 197       
Total params: 510,997
Trainable params: 510,997
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Keep 66% of the rows for training; the remaining 34% is held out.
X_train, x_test_valid, Y_train, y_test_valid= Train_Data_Split(X,Y).train_split()

In [25]:
# Split the held-out rows evenly into a test set and a validation set.
X_test, X_valid, Y_test, Y_valid = Valid_Data_Split(x_test_valid,y_test_valid).valid_test_split()

In [30]:
# Train the model. fit() is called with only the model, so the training and
# validation arrays come from the notebook-level X_train, Y_train, X_valid, Y_valid.
training=Train_model()
history=training.fit(model)

In [31]:
# Print the classification report and confusion matrix for the test set.
# testing_metrics has no return statement, so Test is None.
Test=Testing().testing_metrics()

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1582
           1       0.76      0.80      0.78       380

    accuracy                           0.91      1962
   macro avg       0.86      0.87      0.86      1962
weighted avg       0.92      0.91      0.91      1962

[[1487   95]
 [  75  305]]
