In [45]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from keras.callbacks import ModelCheckpoint

In [2]:
data=pd.read_csv('airline_sentiment_analysis.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [3]:
data=data[['airline_sentiment','text']]

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11541 entries, 0 to 11540
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   airline_sentiment  11541 non-null  object
 1   text               11541 non-null  object
dtypes: object(2)
memory usage: 180.5+ KB


In [6]:
data['text'][0]

"@VirginAmerica plus you've added commercials to the experience... tacky."

In [9]:
Y = pd.get_dummies(data['airline_sentiment'],drop_first=True).values

In [12]:
Y

array([[1],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]], dtype=uint8)

In [13]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [33]:
data.iloc[1]

airline_sentiment                                             negative
text                 its really aggressive to blast obnoxious enter...
Name: 1, dtype: object

In [18]:
data['text']= [x.split(maxsplit=1)[1] for x in data['text']]


In [20]:
data.head()

Unnamed: 0,airline_sentiment,text
0,positive,plus youve added commercials to the experience...
1,negative,its really aggressive to blast obnoxious enter...
2,negative,and its a really big bad thing about it
3,negative,seriously would pay 30 a flight for seats that...
4,positive,yes nearly every time i fly vx this ear worm w...


In [21]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [25]:
X = tokenizer.texts_to_sequences(data['text'].values)

In [40]:
X = pad_sequences(X)
print(X[75])

[   0    0    0    0    0    0    0    0    0    0    0    0   15 1383
   11  285  205   29   26   16   17  493    1   62  237   10  176  321
   53   15   17]


In [41]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 31, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 1)                 197       
Total params: 510,997
Trainable params: 510,997
Non-trainable params: 0
_________________________________________________________________
None


In [44]:
X_train, x_test, Y_train, y_test = train_test_split(X,Y, test_size = 0.34, random_state = 42)
X_valid, X_test , Y_valid, Y_test = train_test_split(x_test,y_test, test_size = 0.5, random_state = 42)

print(X_train.shape,Y_train.shape)
print(X_valid.shape,Y_valid.shape)
print(X_test.shape,Y_test.shape)

(7617, 31) (7617, 1)
(1962, 31) (1962, 1)
(1962, 31) (1962, 1)


In [50]:
checkpt= ModelCheckpoint('model.h5',monitor='val_accuracy', save_best_only=True, mode='max',verbose=1)

In [51]:
batch_size = 32
history= model.fit(X_train, Y_train,validation_data=(X_valid,Y_valid), epochs = 7, batch_size=batch_size, verbose = 1,callbacks=[checkpt])

Epoch 1/7
Epoch 00001: val_accuracy improved from -inf to 0.91590, saving model to model.h5
Epoch 2/7
Epoch 00002: val_accuracy improved from 0.91590 to 0.92100, saving model to model.h5
Epoch 3/7
Epoch 00003: val_accuracy did not improve from 0.92100
Epoch 4/7
Epoch 00004: val_accuracy did not improve from 0.92100
Epoch 5/7
Epoch 00005: val_accuracy improved from 0.92100 to 0.92202, saving model to model.h5
Epoch 6/7
Epoch 00006: val_accuracy did not improve from 0.92202
Epoch 7/7
Epoch 00007: val_accuracy did not improve from 0.92202


In [52]:
model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)



[0.4567536413669586, 0.9062181711196899]