# Creating an Ensemble Model

## Installs and imports

### Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

### Import required libraries

In [2]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import numpy as np
from sklearn.metrics import accuracy_score
import pickle
from tensorflow import keras

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## Load cleaned tweets dataset

In [4]:
df = pd.read_csv('./cleaned_tweets.csv')

In [5]:
np.random.seed(450)
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...,upset can not updat facebook text might cri re...,upset can not updat facebook text might cri re...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cannot see,behav im mad can not see,behav im mad can not see


## Drop text

In [6]:
df = df[['sentiment', 'Snowball_Stem']]

In [7]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


## Drop rows with NaN

In [8]:
df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [9]:
df = df.dropna()

In [10]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

# 5. Reduce dataframe size

In [11]:
df[df.sentiment != 0].shape

(795860, 2)

In [12]:
df[df.sentiment == 0].shape

(796094, 2)

In [13]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [14]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [15]:
reduced_df = pd.concat([df[df.sentiment != 0][740000:], df[df.sentiment == 0][740000:]])

In [16]:
reduced_df.shape

(111954, 2)

In [17]:
#df = reduced_df

In [18]:
X = df['Snowball_Stem']

In [19]:
Y = df['sentiment']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [21]:
#X_test= reduced_df['Snowball_Stem']

In [22]:
#y_test = reduced_df['sentiment']

In [23]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)
#X_test.head()

In [24]:
#X_train.shape, X_test.shape, y_train.shape, y_test.shape
#y_test.head()

## Loading the models

In [45]:
svm_path = './SVM_UnigramBigram_75.pickle'
nb_path = './NB_UnigramBigram_78.pickle'
lstm_token_path = './LSTM_tokenizer.pickle'
lstm_path = './LSTM_train_75_val_78_test_79_acc.h5'
vectorizer_path = './UnigramBigram_vectorizer.pickle'
RF_path = './RFC_UnigramBigram_72.pickle'
# DT_path = 'DT_72.pickle'

In [47]:
svm = pickle.load(open(svm_path,'rb'))

In [48]:
nb = pickle.load(open(nb_path,'rb'))

In [49]:
tokenizer = pickle.load(open(lstm_token_path,'rb'))

In [50]:
lstm = keras.models.load_model(lstm_path)

In [51]:
vectorizer = pickle.load(open(vectorizer_path,'rb'))

In [52]:
RF = pickle.load(open(RF_path,'rb'))

In [53]:
DT = pickle.load(open(DT_path,'rb'))

# Encoding labels

In [39]:
Encoder = LabelEncoder()
y_test = Encoder.fit_transform(y_test)

## Running the models

In [40]:
xtest = vectorizer.transform(X_test)

### Naive bayes

In [41]:
NB3 = nb.predict(xtest)

In [42]:
NB_acc = accuracy_score(NB3, y_test)*100

In [43]:
NB_acc

77.87652422554392

### SVM

In [54]:
SVM_pred = svm.predict(xtest)

In [55]:
SVM_acc = accuracy_score(SVM_pred, y_test)*100

In [56]:
SVM_acc

77.36997756219392

### LSTM

# Count of all unique words

def count_unique_words(tweets):
    unique = Counter()
    for tweet in tweets:
        for word in tweet.split():
            unique[word] += 1
    return unique


word_count = count_unique_words(X_test)

In [60]:
max_seq_length = 20

tokenizer = Tokenizer(num_words=len(word_count))
tokenizer.fit_on_texts(X_test)

In [57]:
test_tweet_seq = tokenizer.texts_to_sequences(X_test)

In [61]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [62]:
test_tweet_pad[0]

array([   8,  582,  356,   28,  239,   80, 3031,    8,   32,   38,  279,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [63]:
lstm_pred = lstm.predict(test_tweet_pad)

In [64]:
LSTM_pred =[]
for i in lstm_pred:
    a = float(i)
    if a>=0.5:
        a = 1
    else:
        a = 0
    LSTM_pred.append(a)

In [65]:
LSTM_pred = np.array(LSTM_pred)

In [75]:
lstm_acc = accuracy_score(LSTM_pred, y_test)*100
lstm_acc

79.1798265781215

In [67]:
scores = lstm.evaluate(test_tweet_pad, y_test)
scores



[0.44447553157806396, 0.7917982935905457]

## Random Forest

In [68]:
RF_pred = RF.predict(xtest)

In [69]:
RF_acc = accuracy_score(RF_pred, y_test)*100
RF_acc

73.10654314566483

## Decision Tree

In [None]:
DT_pred = DT.predict(xtest)

In [None]:
len(DT_pred)

In [None]:
DT_acc = accuracy_score(DT_pred, y_test)*100
DT_acc

## Putting predictions in dataframe

In [97]:
pdf = pd.DataFrame(y_test,columns=['Expected'])

In [98]:
pdf['SVM Predictions'] = SVM_pred
pdf['Naive Bayes Predictions'] = NB3
pdf['LSTM Predictions'] = LSTM_pred
#pdf['Random Forest Predictions'] = RF_pred
#pdf['Decision Tree Predictions'] = DT_pred

In [99]:
pdf.head()[1:3]

Unnamed: 0,Expected,SVM Predictions,Naive Bayes Predictions,LSTM Predictions
1,1,1,1,1
2,0,0,1,1


In [81]:
lstm_acc

79.1798265781215

### Ensemble func

In [100]:
deno = (NB_acc + SVM_acc+ lstm_acc)
wNB = NB_acc/deno
wSVM = SVM_acc/deno
wLSTM = lstm_acc/deno
# wRF = RF_acc/deno
# wDT = DT_acc/deno
classWeights = [wSVM, wNB, wLSTM]
classWeights

[0.33003962524826075, 0.3322004177960988, 0.3377599569556406]

In [101]:
def ensemble_predict(a):
    pc = 0
    nc = 0
    for i in a:
        if i == 'Expected':
            continue
        if int(a[i]) ==1:
            pc+=1
        else:
            nc+=1
            
    p_pc = pc/(pc+nc)
    p_nc = 1-p_pc
    pscore =0
    nscore =0
    for w in classWeights:
        pscore+=w*pc
        nscore+=w*nc
    
    if pscore>nscore:
        return 1
    else:
        return 0

In [102]:
def ensemble(a):
    pc = 0
    nc = 0
    for i in a:
        if i == 'Expected':
            continue
        if int(a[i]) ==1:
            pc+=1
        else:
            nc+=1
            
#     p_pc = pc/(pc+nc)
#     p_nc = 1-p_pc
#     pscore =0
#     nscore =0
#     for w in classWeights:
#         pscore+=w*p_pc
#         nscore+=w*p_nc
    
    if pc>=nc:
        return 1
    else:
        return 0

In [103]:
l = len(pdf)
l

397989

In [104]:
ensPred = []
for i in range(l):
    ensPred.append(ensemble_predict(pdf[i:i+1]))

In [105]:
accuracy_score(ensPred, y_test)*100

79.35344946719633

In [106]:
ensPredi = []
for i in range(l):
    ensPredi.append(ensemble(pdf[i:i+1]))

In [107]:
accuracy_score(ensPredi, y_test)*100

79.35344946719633