# LSTM Model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [117]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [118]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam

---

# 2. Load cleaned tweets dataset

In [119]:
df = pd.read_csv('./cleaned_tweets.csv')

In [120]:
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats bummer shoulda got david carr third...,awww that bummer shoulda got david carr third day,awww that bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...,upset cant updat facebook text might cri resul...,upset cant updat facebook text might cri resul...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see,behav im mad cant see,behav im mad cant see


# 3. Drop text

In [121]:
df = df[['sentiment', 'Snowball_Stem']]

In [122]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,awww that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see


# 4. Drop rows with NaN

In [123]:
df.isna().sum()

sentiment           0
Snowball_Stem    7661
dtype: int64

In [124]:
df = df.dropna()

In [125]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

---

# 5. Reduce dataframe size

In [126]:
df[df.sentiment != 0].shape

(796018, 2)

In [127]:
df[df.sentiment == 0].shape

(796321, 2)

In [128]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [129]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [130]:
reduced_df = pd.concat([df[df.sentiment != 0][:200000], df[df.sentiment == 0][:200000]])

In [131]:
reduced_df.shape

(400000, 2)

In [132]:
df = reduced_df

---

# 5. Split dataset into training and test data

In [133]:
X = df['Snowball_Stem']

In [134]:
y = df['sentiment']

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

- 75% training data
- 25% test data

In [136]:
X_train.shape

(300000,)

In [137]:
y_train.shape

(300000,)

In [138]:
X_test.shape

(100000,)

In [139]:
y_test.shape

(100000,)

---

# 6. Collection of all unique words in corpus

In [1]:
# Count of all unique words

def count_unique_words(tweets):
    unique = Counter()
    for tweet in tweets:
        for word in tweet.split():
            unique[word] += 1
    return unique


In [2]:
word_count = count_unique_words(X_train)

NameError: name 'X_train' is not defined

In [142]:
len(word_count)

78734

---

# 7. LSTM Model

## 7.1. Max number of words in a sequence

In [143]:
max_seq_length = 40

## 7.2. Tokenizing the words

In [144]:
tokenizer = Tokenizer(num_words=len(word_count))
tokenizer.fit_on_texts(X_train)

In [145]:
word_index = tokenizer.word_index

In [146]:
# Index for each word in tokenizer
len(word_index)

77981

## 7.3. Convert to sequences

In [147]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [148]:
X_train_seq[0]

[18, 24, 55, 13]

## 7.4. Padding sequences

In [149]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [150]:
X_train_pad[0]

array([18, 24, 55, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)

## 7.5 Performing tokenization and padding for test set

In [151]:
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [152]:
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding="post", truncating="post")

## 7.6 Understanding training and testing data

In [153]:
X_train_seq[0]

[18, 24, 55, 13]

In [154]:
X_train_pad[0]

array([18, 24, 55, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)

In [155]:
X_test_seq[0]

[52, 111]

In [156]:
X_test_pad[0]

array([ 52, 111,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0], dtype=int32)

In [157]:
X_train_pad.shape

(300000, 40)

In [158]:
X_test_pad.shape

(100000, 40)

In [159]:
y_train.shape

(300000,)

In [160]:
y_test.shape

(100000,)

## 7.7 Training the model

In [168]:
model = Sequential()

# Maps each word to a finite vector
model.add(Embedding(len(word_count), 5, input_length=max_seq_length))

model.add(LSTM(32, dropout=0.1))

model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=3e-2)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])


In [169]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 40, 5)             393670    
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                4864      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 398,567
Trainable params: 398,567
Non-trainable params: 0
_________________________________________________________________


In [170]:
model.fit(X_train_pad, y_train, epochs=7, validation_data=(X_test_pad, y_test))

Epoch 1/7

KeyboardInterrupt: 

---

# 8. Evaluating model

## 8.1. Create test dataset

In [98]:
test_df = pd.read_csv('./cleaned_tweets.csv')

In [99]:
test_df = pd.concat([test_df[test_df.sentiment != 0][100000:200000], test_df[test_df.sentiment == 0][100000:200000]])

In [100]:
test_df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
900000,1,Thanks for pointing out the crucial problems @...,thanks pointing crucial problems taken care cc,thank point crucial problem taken care cc,thank point crucial problem taken care cc
900001,1,please ignore cheesey music,please ignore cheesey music,pleas ignor cheesey music,pleas ignor cheesey music
900002,1,just got home from a meeting with the girls......,got home meeting girls maaaaaan im exhausted g...,got home meet girl maaaaaan im exhaust goodnig...,got home meet girl maaaaaan im exhaust goodnig...
900003,1,@db0y8199 lol thats the kind im eating!!,lol thats kind im eating,lol that kind im eat,lol that kind im eat
900004,1,victory for the bulldogs was celebrated by 3 w...,victory bulldogs celebrated white chocolate ch...,victori bulldog celebr white chocol cheesecak ...,victori bulldog celebr white chocol cheesecak ...


In [101]:
test_df = test_df[['sentiment', 'Snowball_Stem']]

In [102]:
test_df.head()

Unnamed: 0,sentiment,Snowball_Stem
900000,1,thank point crucial problem taken care cc
900001,1,pleas ignor cheesey music
900002,1,got home meet girl maaaaaan im exhaust goodnig...
900003,1,lol that kind im eat
900004,1,victori bulldog celebr white chocol cheesecak ...


## 8.2. Drop rows with NaN

In [103]:
test_df.isna().sum()

sentiment           0
Snowball_Stem    1013
dtype: int64

In [104]:
test_df = test_df.dropna()

In [105]:
test_df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

## 8.2. Tokenization and padding

In [106]:
test_tweet = test_df['Snowball_Stem']

In [107]:
test_label = test_df['sentiment']

In [108]:
test_tweet.head()

900000            thank point crucial problem taken care cc
900001                            pleas ignor cheesey music
900002    got home meet girl maaaaaan im exhaust goodnig...
900003                                 lol that kind im eat
900004    victori bulldog celebr white chocol cheesecak ...
Name: Snowball_Stem, dtype: object

In [109]:
test_label.head()

900000    1
900001    1
900002    1
900003    1
900004    1
Name: sentiment, dtype: int64

In [110]:
test_tweet_seq = tokenizer.texts_to_sequences(test_tweet)

In [111]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [112]:
test_tweet_pad[0]

array([   15,   478, 11272,   376,   907,   358,  2853,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)

## 8.3. Evaluate

In [113]:
scores = model.evaluate(test_tweet_pad, test_label)



In [114]:
scores

[0.5518655776977539, 0.7501444816589355]

In [115]:
loss, accuracy = scores

In [116]:
print("Loss on test set:", loss)
print("Accuracy achieve on test set:", accuracy)

Loss on test set: 0.5518655776977539
Accuracy achieve on test set: 0.7501444816589355


---

# Save model

In [94]:
model.save("./LSTM_train_81_val_76_81_test_75_acc.h5")