In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import tensorflow
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [2]:
pd.set_option('display.max_colwidth',200)
df=pd.read_csv('../input/tweet-1/train.csv',encoding='iso-8859-1')

In [3]:
df.isnull().sum()

ItemID           0
Sentiment        0
SentimentText    0
dtype: int64

In [4]:
df

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! T_T
...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hope you're able to find something.
99985,99997,1,"@cupcake__ arrrr we both replied to each other over different tweets at the same time , i'll see you then, Duno where the hell Kateyy is!"
99986,99998,0,@CuPcAkE_2120 ya i thought so
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had more fun with me.


In [5]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 

In [6]:
#Remove twitter handles (@VirginAmerica| and @AmericanAir)
df['SentimentTexts'] = np.vectorize(remove_pattern)(df['SentimentText'], "@[\w]*")
df=df.drop(["SentimentText"],axis=1)
df

Unnamed: 0,ItemID,Sentiment,SentimentTexts
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! T_T
...,...,...,...
99984,99996,0,seems like a repeating problem hope you're able to find something.
99985,99997,1,"arrrr we both replied to each other over different tweets at the same time , i'll see you then, Duno where the hell Kateyy is!"
99986,99998,0,ya i thought so
99987,99999,1,Yes. Yes. I'm glad you had more fun with me.


In [7]:
## Get the Independent Features

X=df.drop('Sentiment',axis=1)
X.shape


(99989, 2)

In [8]:
## Get the Dependent features
y=df['Sentiment']
y.shape

(99989,)

In [9]:
### Vocabulary size
voc_size=30000

In [10]:
X['SentimentTexts'][3]

"          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)..."

In [11]:

#Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(X)):
    tweet = re.sub('[^a-zA-Z]', ' ', X["SentimentTexts"][i])
    tweet = tweet.lower()
    tweet = tweet.split()
    
    tweet = [ps.stem(word) for word in tweet if not word in stopwords.words('english')]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

In [12]:
corpus[1:7]

['miss new moon trailer',
 'omg alreadi',
 'omgaga im sooo im gunna cri dentist sinc supos get crown put min',
 'think mi bf cheat',
 'worri much',
 'juuuuuuuuuuuuuuuuussssst chillin']

In [13]:
#ONE-HOT REPRESENTATION
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr[1:20]

[[16400, 3381, 2652, 16905],
 [1206, 5960],
 [6216,
  9124,
  13211,
  9124,
  25401,
  13014,
  21146,
  23702,
  28138,
  5056,
  9378,
  10045,
  8571],
 [29059, 18675, 4349, 15882],
 [22899, 22754],
 [19802, 16610],
 [22447, 17080, 28016, 29950, 13665],
 [12532, 4146, 24875, 16400, 5960],
 [9383, 15822, 2371],
 [14017, 29059, 13551],
 [13125, 15332, 24470, 8684],
 [27856, 20577, 11946],
 [23528, 29060, 27361, 11959],
 [27378, 3153, 9243],
 [15395, 5607, 23588, 9580],
 [17160, 25057, 15507, 20390, 11799, 11691, 5082, 22880, 312, 18721],
 [23588, 2625, 11094, 22142, 8065, 11445, 11548, 19298],
 [607, 13445, 9537, 19454],
 [27657, 12842, 13394, 20161, 24053, 22142, 19838, 14786, 11117]]

In [14]:
#Embedding Representing
sent_length=40
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[12898 25634 15134 ...     0     0     0]
 [16400  3381  2652 ...     0     0     0]
 [ 1206  5960     0 ...     0     0     0]
 ...
 [15959 22884     0 ...     0     0     0]
 [10761 10761  7792 ...     0     0     0]
 [ 2210 10761     0 ...     0     0     0]]


In [15]:
embedded_docs[2]

array([1206, 5960,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [16]:
## Creating model
embedding_vector_features=200
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(200))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 200)           6000000   
_________________________________________________________________
lstm (LSTM)                  (None, 200)               320800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 6,321,001
Trainable params: 6,321,001
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
len(embedded_docs),y.shape #same same dimension

(99989, (99989,))

In [18]:
x=np.array(embedded_docs)
Y=np.array(y)

In [19]:
x.shape,Y.shape

((99989, 40), (99989,))

In [20]:
#Training the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, Y, test_size=0.25, random_state=0)

In [21]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f43b4128c10>

In [22]:
#Adding Dropout Layer
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=200
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(200))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [23]:
y_pred=model.predict_classes(X_test)

In [24]:
#Performance metrics and Accuracy
from sklearn.metrics import confusion_matrix,accuracy_score

confusion_matrix(y_test,y_pred)

array([[    0, 10949],
       [    0, 14049]])

In [25]:
accuracy_score(y_test,y_pred)

0.5620049603968318