In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import np_utils
from google.colab import files
import numpy as np

In [5]:
Tweets = pd.read_csv("Tweets.csv")
Tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [6]:
Tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [7]:
Tweets = Tweets[Tweets['airline_sentiment_confidence'] > 0.8]

In [8]:
token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values)

In [9]:
X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding="post", maxlen=100)

In [11]:
print(X)


[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [12]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [13]:
y = np_utils.to_categorical(y)
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
X_test

array([[13, 65, 23, ...,  0,  0,  0],
       [12, 85,  1, ...,  0,  0,  0],
       [ 8, 15, 29, ...,  0,  0,  0],
       ...,
       [ 8, 11, 67, ...,  0,  0,  0],
       [12, 11,  7, ...,  0,  0,  0],
       [16, 20, 33, ...,  0,  0,  0]], dtype=int32)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
X_test

array([[13,  7, 69, ...,  0,  0,  0],
       [18,  3, 61, ...,  0,  0,  0],
       [12, 29,  5, ...,  0,  0,  0],
       ...,
       [ 8, 89,  6, ...,  0,  0,  0],
       [12,  7, 10, ...,  0,  0,  0],
       [16,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [16]:
modelo = Sequential()
modelo.add(Embedding(input_dim= len(token.word_index), output_dim=128, input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh',
                recurrent_activation='sigmoid', unroll=False, use_bias=True))
modelo.add(Dense(units=3,activation="softmax"))

In [17]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(modelo.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 1,894,047
Trainable params: 1,894,047
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
modelo.fit(X_train, y_train, epochs=10, batch_size=30,verbose=True,validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1191012980>

In [21]:
loss, accuracy = modelo.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.8120669722557068
Accuracy:  0.7049075961112976


In [22]:
prev = modelo.predict(X_test)
print(prev)

[[0.72002596 0.14561068 0.13436343]
 [0.72002596 0.14561066 0.13436343]
 [0.72002596 0.14561066 0.13436343]
 ...
 [0.72002596 0.14561068 0.13436343]
 [0.7200259  0.14561068 0.13436341]
 [0.7200259  0.14561068 0.13436341]]
