In [10]:
# Airline sentiment analysis with an LSTM, built with TensorFlow and Keras


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


In [17]:
# Object-oriented training pipeline: load the data, build the model, and train it (a separate Inference class handles prediction)
class SentimentAnalysis:
    """Train and evaluate an LSTM sentiment classifier on a CSV of labelled texts.

    The CSV at ``data_path`` must contain a ``text`` column and an
    ``airline_sentiment`` column. Constructing the object reads and cleans
    the data, fits the tokenizer, builds the padded input matrix and compiles
    the model; :meth:`train` fits the model and prints hold-out metrics.

    Attributes:
        data_path: Path the CSV was read from.
        data: DataFrame with rows lacking text or label removed and the
            ``text`` column lower-cased and stripped of punctuation.
        max_features: Vocabulary cap shared by the tokenizer and embedding.
        tokenizer: Keras ``Tokenizer`` fitted on the cleaned texts.
        X: Padded integer sequences, one row per row of ``data``.
        max_len: Padded sequence length (``X.shape[1]``).
        labels: Sorted class names; position ``i`` matches output unit ``i``.
        embed_dim: Embedding dimension.
        lstm_out: Number of LSTM units.
        model: Compiled Keras model.
    """

    TEXT_COLUMN = 'text'
    LABEL_COLUMN = 'airline_sentiment'
    # Raw string and an explicit A-Z range: the previous 'A-z' range also
    # matched the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')

    def __init__(self, data_path):
        """Load the CSV, preprocess the text and build the (untrained) model.

        Args:
            data_path: Path to a CSV with ``text`` and ``airline_sentiment``
                columns.

        Raises:
            ValueError: If fewer than two distinct labels remain after
                dropping incomplete rows.
        """
        self.data_path = data_path
        self.data = pd.read_csv(data_path)
        # Only the two columns the model consumes decide whether a row is usable.
        self.data = self.data.dropna(subset=[self.TEXT_COLUMN, self.LABEL_COLUMN])
        self.data[self.TEXT_COLUMN] = (
            self.data[self.TEXT_COLUMN].astype(str).map(self._clean_text)
        )

        self.labels = sorted(self.data[self.LABEL_COLUMN].unique())
        if len(self.labels) < 2:
            raise ValueError(
                "need at least two sentiment classes, found: %r" % (self.labels,)
            )

        self.max_features = 2000
        self.tokenizer = Tokenizer(num_words=self.max_features, split=' ')
        self.tokenizer.fit_on_texts(self.data[self.TEXT_COLUMN].values)
        self.X = self.tokenizer.texts_to_sequences(self.data[self.TEXT_COLUMN].values)
        self.X = pad_sequences(self.X)
        self.max_len = self.X.shape[1]
        self.embed_dim = 128
        self.lstm_out = 196

        self.model = keras.Sequential([
            layers.Embedding(self.max_features, self.embed_dim, input_length=self.max_len),
            layers.SpatialDropout1D(0.4),
            layers.LSTM(self.lstm_out, dropout=0.2, recurrent_dropout=0.2),
            # One unit per class; softmax yields the probability distribution
            # that categorical cross-entropy is defined over.
            layers.Dense(len(self.labels), activation='softmax'),
        ])

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()

    @classmethod
    def _clean_text(cls, text):
        """Lower-case ``text`` and drop everything except ASCII letters, digits and whitespace."""
        return cls._NON_ALNUM.sub('', str(text).lower())

    def train(self, epochs=7, batch_size=32, test_size=0.2, random_state=42):
        """Fit the model on a train split and print metrics for the held-out split.

        Prints the evaluation loss, the accuracy, the model's metric names,
        the confusion matrix and the classification report.

        Args:
            epochs: Number of training epochs.
            batch_size: Batch size used for fitting and evaluation.
            test_size: Fraction of rows held out for evaluation.
            random_state: Seed for the train/test split.
        """
        # Fix the column order to ``self.labels`` and request floats: recent
        # pandas versions return boolean indicator columns by default.
        targets = pd.get_dummies(
            pd.Categorical(self.data[self.LABEL_COLUMN], categories=self.labels),
            dtype='float32',
        ).values
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, targets, test_size=test_size, random_state=random_state
        )
        self.model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=2)
        score, acc = self.model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
        print(score)
        print(acc)
        print(self.model.metrics_names)
        y_true = np.argmax(Y_test, axis=1)
        y_pred = np.argmax(self.model.predict(X_test), axis=1)
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred))



In [18]:
# Build the pipeline from the CSV in the working directory, then fit and evaluate it.
path = "airline_sentiment_analysis.csv"
sentiment_analysis = SentimentAnalysis(data_path=path)
sentiment_analysis.train()



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 32, 128)           256000    
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 32, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_4 (LSTM)               (None, 196)               254800    
                                                                 
 dense_5 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
289/289 - 48s - loss: 0.3182 - accuracy: 0.8683 - 48s/epoch - 167ms/step
Epoch 2/7
289/289 - 39s - 

In [23]:
#Inference class
class Inference:
    """Classify raw text with a trained sentiment model and its fitted tokenizer.

    Attributes:
        model: Object exposing ``predict``; its output row is arg-maxed.
        tokenizer: Object exposing ``texts_to_sequences``.
        labels: Class names indexed by the model's output unit.
        max_len: Length every input sequence is padded or truncated to.
    """

    # Raw string and an explicit A-Z range: the previous 'A-z' range also
    # matched the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')
    # Sequence length used when the model does not expose a fixed input length.
    DEFAULT_MAX_LEN = 32

    def __init__(self, model, tokenizer, max_len=None, labels=("negative", "positive")):
        """Store the model and tokenizer and settle the padding length.

        Args:
            model: Trained model used for prediction.
            tokenizer: Tokenizer that was fitted on the training texts.
            max_len: Padding length. When ``None`` it is read from
                ``model.input_shape[1]`` if that is a positive integer,
                otherwise ``DEFAULT_MAX_LEN`` is used.
            labels: Class names in output-unit order.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.labels = tuple(labels)
        self.max_len = self._infer_max_len(model) if max_len is None else int(max_len)

    @classmethod
    def _infer_max_len(cls, model):
        """Return the model's fixed input sequence length, or ``DEFAULT_MAX_LEN``."""
        try:
            shape = model.input_shape
        except (AttributeError, ValueError, RuntimeError):
            # Model has no (or not yet a) defined input shape.
            return cls.DEFAULT_MAX_LEN
        if isinstance(shape, (tuple, list)) and len(shape) >= 2:
            length = shape[1]
            if isinstance(length, (int, np.integer)) and length > 0:
                return int(length)
        return cls.DEFAULT_MAX_LEN

    @classmethod
    def _clean_text(cls, text):
        """Lower-case ``text`` and drop everything except ASCII letters, digits and whitespace."""
        return cls._NON_ALNUM.sub('', text.lower())

    def predict_label(self, text):
        """Return the predicted class name for ``text``.

        Returns:
            The entry of ``labels`` at the arg-max output index, or ``None``
            when that index has no corresponding label.
        """
        sequences = self.tokenizer.texts_to_sequences([self._clean_text(text)])
        padded = pad_sequences(sequences, maxlen=self.max_len)
        scores = self.model.predict(padded, batch_size=1, verbose=2)[0]
        index = int(np.argmax(scores))
        return self.labels[index] if index < len(self.labels) else None

    def predict(self, text):
        """Print the predicted class name for ``text`` (nothing if it has no label)."""
        label = self.predict_label(text)
        if label is not None:
            print(label)

In [26]:

# Run the in-memory model on a few hand-written reviews.
inference = Inference(sentiment_analysis.model, sentiment_analysis.tokenizer)
for sample in (
    "This airline is super awesome!",
    "This airline is very interesting!",
    "This airline is very bad!",
):
    inference.predict(sample)


1/1 - 0s - 51ms/epoch - 51ms/step
positive
1/1 - 0s - 38ms/epoch - 38ms/step
positive
1/1 - 0s - 36ms/epoch - 36ms/step
negative


In [28]:
# Persist the trained model to disk.
sentiment_analysis.model.save('sentiment_analysis.h5')

# Read it back to confirm the saved file is usable.
model = keras.models.load_model('sentiment_analysis.h5')

# Repeat the sample predictions with the reloaded model; the tokenizer still
# comes from the in-memory pipeline.
inference = Inference(model, sentiment_analysis.tokenizer)
for sample in (
    "This airline is super awesome!",
    "This airline is very interesting!",
    "This airline is very bad!",
):
    inference.predict(sample)

1/1 - 1s - 603ms/epoch - 603ms/step
positive
1/1 - 0s - 90ms/epoch - 90ms/step
positive
1/1 - 0s - 100ms/epoch - 100ms/step
negative


The accuracy of the model is 0.967.