# Creating an Ensemble Model

## Installs and imports

### Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

### Import required libraries

In [2]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import numpy as np
from sklearn.metrics import accuracy_score
import pickle
from tensorflow import keras

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam
from collections import Counter
from sklearn.model_selection import train_test_split

## Load cleaned tweets dataset

In [4]:
df = pd.read_csv('./cleaned_tweets.csv')

In [5]:
np.random.seed(450)
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats bummer shoulda got david carr third...,awww that bummer shoulda got david carr third day,awww that bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...,upset cant updat facebook text might cri resul...,upset cant updat facebook text might cri resul...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see,behav im mad cant see,behav im mad cant see


## Keep only the label and stemmed-text columns

In [6]:
df = df[['sentiment', 'Snowball_Stem']]

In [7]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,awww that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see


## Drop rows with NaN

In [8]:
df.isna().sum()

sentiment           0
Snowball_Stem    7661
dtype: int64

In [9]:
df = df.dropna()

In [10]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

## Reduce dataframe size

In [11]:
df[df.sentiment != 0].shape

(796018, 2)

In [12]:
df[df.sentiment == 0].shape

(796321, 2)

In [13]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [14]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [15]:
# Build a class-balanced sample: positional rows 640000-689999 from the rows whose
# sentiment is non-zero, followed by the same positional range from the zero-sentiment rows.
nonzero_rows = df[df.sentiment != 0]
zero_rows = df[df.sentiment == 0]
reduced_df = pd.concat([nonzero_rows[640000:690000], zero_rows[640000:690000]])

In [16]:
reduced_df.shape

(100000, 2)

In [17]:
#df = reduced_df

In [18]:
X = df['Snowball_Stem']

In [19]:
Y = df['sentiment']

In [20]:
X_test= reduced_df['Snowball_Stem']

In [21]:
y_test = reduced_df['sentiment']

In [22]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)
X_test.head()

1443331    haley like sing oreo mouth love nick love talk...
1443332    love weekend famili make popcorn clean mess wo...
1443333    world wrap orang ribbon sinc orang fave color ...
1443334                            hahaha someth amus ploiis
1443335    christiiiiin thanx homi ur town come tomorrow ...
Name: Snowball_Stem, dtype: object

In [23]:
#X_train.shape, X_test.shape, y_train.shape, y_test.shape
y_test.head()

1443331    1
1443332    1
1443333    1
1443334    1
1443335    1
Name: sentiment, dtype: int64

## Loading the models

In [24]:
# Locations of the pre-trained artefacts loaded in the following cells, given relative
# to the notebook's working directory.
# NOTE(review): the numbers embedded in the file names look like accuracy figures — confirm.
svm_path = './SVM_UnigramBigram_75.pickle'  # pickled SVM classifier
nb_path = './NB_UnigramBigram_78.pickle'  # pickled Naive Bayes classifier
lstm_path = './LSTM_train_79_val_76_test_74_acc.h5'  # Keras LSTM model file
vectorizer_path = './UnigramBigram_vectorizer.pickle'  # pickled text vectorizer used by NB and SVM

In [25]:
# Load the pickled SVM classifier; the context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open for the life of the kernel.
# Caution: unpickling can execute arbitrary code — only load files from a trusted source.
# Note: this rebinds `svm`, shadowing the `sklearn.svm` module imported at the top.
with open(svm_path, 'rb') as svm_file:
    svm = pickle.load(svm_file)

In [26]:
# Load the pickled Naive Bayes classifier; the context manager closes the file handle
# as soon as unpickling finishes.
# Caution: unpickling can execute arbitrary code — only load files from a trusted source.
with open(nb_path, 'rb') as nb_file:
    nb = pickle.load(nb_file)

In [27]:
lstm = keras.models.load_model(lstm_path)

In [28]:
# Load the pickled, already-fitted text vectorizer; the context manager closes the
# file handle as soon as unpickling finishes.
# Caution: unpickling can execute arbitrary code — only load files from a trusted source.
with open(vectorizer_path, 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

## Encoding labels

In [29]:
# Encode the test labels as consecutive integers with scikit-learn's LabelEncoder.
# The encoder is fit on y_test itself, and y_test is rebound to the encoded result,
# so from here on it is no longer the pandas Series taken from reduced_df.
# NOTE(review): the preview above suggests the labels are already 0/1, in which case
# this only changes the container type — confirm against the full label set.
Encoder = LabelEncoder()
y_test = Encoder.fit_transform(y_test)

## Running the models

### Naive Bayes

In [30]:
xtest = vectorizer.transform(X_test)

In [31]:
NB3 = nb.predict(xtest)

In [32]:
# Naive Bayes accuracy as a percentage. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; accuracy itself is symmetric, so the value is unchanged.
NB_acc = accuracy_score(y_test, NB3)*100

In [33]:
NB_acc

87.153

### SVM

In [34]:
SVM_pred = svm.predict(xtest)

In [35]:
# SVM accuracy as a percentage. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; accuracy itself is symmetric, so the value is unchanged.
SVM_acc = accuracy_score(y_test, SVM_pred)*100

### LSTM

In [36]:
# Count of all unique words

def count_unique_words(tweets):
    """Tally how often each whitespace-separated word occurs across ``tweets``.

    Args:
        tweets: Iterable of strings; each one is tokenised with ``str.split()``.

    Returns:
        A ``collections.Counter`` mapping each word to its number of occurrences.
        Its length is therefore the number of distinct words seen.
    """
    tally = Counter()
    for text in tweets:
        # Counter.update adds one per element of the token list.
        tally.update(text.split())
    return tally


In [37]:
word_count = count_unique_words(X_test)

In [38]:
max_seq_length = 40

In [39]:
# Build a Keras Tokenizer whose vocabulary cap equals the number of distinct words
# counted in X_test, then fit it on X_test.
# NOTE(review): because the tokenizer is fit here on the test tweets, the word indices
# are derived from X_test alone. The loaded LSTM presumably expects the indices of the
# tokenizer used at training time — TODO confirm. A mismatch would be consistent with the
# ~0.51 accuracy reported further down; if so, persist and load the training tokenizer.
tokenizer = Tokenizer(num_words=len(word_count))
tokenizer.fit_on_texts(X_test)

In [40]:
test_tweet_seq = tokenizer.texts_to_sequences(X_test)

In [41]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [42]:
test_tweet_pad[0]

array([ 4424,     7,   482,  4029,  1102,     6,   746,     6,   122,
       17303,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0])

In [43]:
lstm_pred = lstm.predict(test_tweet_pad)

In [44]:
# Threshold the LSTM outputs at 0.5 to obtain hard 0/1 labels (>= 0.5 -> 1, else 0).
# Done as one vectorised comparison instead of calling float() on each one-element
# row, which NumPy >= 1.25 deprecates for arrays with ndim > 0.
# Assumes the model emits a single score per sample (shape (n, 1) or (n,)) — TODO confirm.
lstm_scores = np.asarray(lstm_pred, dtype=float).reshape(-1)
# Kept as a plain list of Python ints, matching what the loop used to build.
LSTM_pred = (lstm_scores >= 0.5).astype(int).tolist()

In [45]:
LSTM_pred = np.array(LSTM_pred)

In [46]:
# LSTM accuracy. Arguments follow scikit-learn's documented (y_true, y_pred) order;
# accuracy is symmetric, so the value is unchanged.
# Note: this is a fraction on a 0-1 scale, whereas NB_acc and SVM_acc are multiplied by 100.
lstm_acc = accuracy_score(y_test, LSTM_pred)
lstm_acc

0.50842

In [47]:
scores = lstm.evaluate(test_tweet_pad, y_test)
scores



[0.9449368715286255, 0.5084199905395508]

## Putting predictions in dataframe

In [48]:
pdf = pd.DataFrame(y_test,columns=['Expected'])

In [56]:
# Preview rows 1-2 of the predictions frame.
# NOTE(review): the recorded output shows the three prediction columns, which are only
# added in the following cell — on a fresh Restart & Run All this preview would show
# just 'Expected'. Move this cell below the column assignments.
pdf.head()[1:3]

Unnamed: 0,Expected,SVM Predictions,Naive Bayes Predictions,LSTM Predictions
1,1,1,1,0
2,1,1,1,0


In [50]:
pdf['SVM Predictions'] = SVM_pred
pdf['Naive Bayes Predictions'] = NB3
pdf['LSTM Predictions'] = LSTM_pred

### Ensemble function

In [87]:
# Accuracy-proportional weights. NB_acc and SVM_acc are percentages (0-100), so the
# two classifier weights below sum to 1.
deno = (NB_acc + SVM_acc)
wNB = NB_acc/deno
wSVM = SVM_acc/deno
# lstm_acc is a fraction (0-1); rescale it to a percentage so that it is divided on the
# same scale as the other two accuracies. It stays relative to the NB+SVM denominator
# and is not part of classWeights.
wLSTM = (lstm_acc*100)/deno
# classWeights covers Naive Bayes and SVM only, in that order.
classWeights = [wNB, wSVM]
classWeights

[0.530967466796637, 0.469032533203363]

In [88]:
def ensemble_predict(a, weights=None):
    """Combine the per-model predictions of one sample into a single 0/1 label.

    Every column of ``a`` except ``'Expected'`` is treated as one model's vote:
    a value equal to 1 votes for class 1, anything else votes for class 0.

    Previously every entry of the global weight list multiplied the *same* vote
    counts, so the weights cancelled out and the result was a plain majority
    vote. That result is preserved as the default (all votes weigh 1.0); pass
    ``weights`` to make the vote genuinely weighted per model.

    Args:
        a: One-row DataFrame (e.g. ``pdf[i:i+1]``) or a plain mapping of
            column name -> prediction.
        weights: Optional mapping of column name -> weight. When given, it must
            contain a weight for every prediction column in ``a``.

    Returns:
        1 if the (weighted) votes for class 1 strictly exceed those for class 0,
        otherwise 0. Ties, and inputs without any prediction column, yield 0.

    Raises:
        KeyError: if ``weights`` is given but lacks a prediction column of ``a``.
    """
    pscore = 0.0
    nscore = 0.0
    for column in a:
        if column == 'Expected':
            # Ground-truth label, not a model vote.
            continue
        weight = 1.0 if weights is None else weights[column]
        value = a[column]
        # A one-row DataFrame yields a one-element Series; take the scalar explicitly
        # rather than relying on the deprecated int(Series) conversion.
        vote = int(value.iloc[0]) if hasattr(value, 'iloc') else int(value)
        if vote == 1:
            pscore += weight
        else:
            nscore += weight

    return 1 if pscore > nscore else 0

In [89]:
l = len(pdf)
l

100000

In [90]:
# Score every sample by handing a one-row slice of pdf to ensemble_predict.
ensPred = [ensemble_predict(pdf[row:row + 1]) for row in range(l)]

In [91]:
# Ensemble accuracy as a percentage. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, ensPred)*100

82.094