# Creating an Ensemble Model

## Installs and imports

### Install all required libraries

In [None]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

### Import required libraries

In [2]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import numpy as np
from sklearn.metrics import accuracy_score
import pickle
from tensorflow import keras

In [32]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## Load cleaned tweets dataset

In [4]:
df = pd.read_csv('./cleaned_tweets.csv')

In [5]:
np.random.seed(450)
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats bummer shoulda got david carr third...,awww that bummer shoulda got david carr third day,awww that bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...,upset cant updat facebook text might cri resul...,upset cant updat facebook text might cri resul...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see,behav im mad cant see,behav im mad cant see


## Drop text

In [6]:
df = df[['sentiment', 'Snowball_Stem']]

In [7]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,awww that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see


## Drop rows with NaN

In [8]:
df.isna().sum()

sentiment           0
Snowball_Stem    7661
dtype: int64

In [9]:
df = df.dropna()

In [10]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

# 5. Reduce dataframe size

In [11]:
df[df.sentiment != 0].shape

(796018, 2)

In [12]:
df[df.sentiment == 0].shape

(796321, 2)

In [13]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [14]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [59]:
reduced_df = pd.concat([df[df.sentiment != 0][740000:], df[df.sentiment == 0][740000:]])

In [60]:
reduced_df.shape

(112339, 2)

In [61]:
#df = reduced_df

In [80]:
X = df['Snowball_Stem']

In [81]:
Y = df['sentiment']

In [82]:
X_test= reduced_df['Snowball_Stem']

In [83]:
y_test = reduced_df['sentiment']

In [84]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)
X_test.head()

1543744    strang introduc boyfriend vladimir despit name...
1543745    get follow day use www tweeterfollow com add e...
1543746                  look forward russel visit next week
1543747       im time life well ok show need go come back nc
1543748                               im realli realli happi
Name: Snowball_Stem, dtype: object

In [85]:
#X_train.shape, X_test.shape, y_train.shape, y_test.shape
y_test.head()

1543744    1
1543745    1
1543746    1
1543747    1
1543748    1
Name: sentiment, dtype: int64

## Loading the models

In [99]:
svm_path = './SVM_UnigramBigram_75.pickle'
nb_path = './NB_UnigramBigram_78.pickle'
lstm_path = './LSTM_train_79_val_76_test_74_acc.h5'
vectorizer_path = './UnigramBigram_vectorizer.pickle'
RF_path = './RFC_UnigramBigram.pickle'
DT_path = 'DT_72.pickle'

In [69]:
svm = pickle.load(open(svm_path,'rb'))

In [70]:
nb = pickle.load(open(nb_path,'rb'))

In [71]:
#lstm = keras.models.load_model(lstm_path)

In [72]:
vectorizer = pickle.load(open(vectorizer_path,'rb'))

In [100]:
RF = pickle.load(open(RF_path,'rb'))

In [74]:
DT = pickle.load(open(DT_path,'rb'))

# Encoding labels

In [86]:
Encoder = LabelEncoder()
y_test = Encoder.fit_transform(y_test)

## Running the models

### Naive bayes

In [89]:
xtest = vectorizer.transform(X_test)

In [94]:
NB3 = nb.predict(xtest)

In [95]:
NB_acc = accuracy_score(NB3, y_test)*100

In [96]:
NB_acc

86.68227418794898

### SVM

In [97]:
SVM_pred = svm.predict(xtest)

In [98]:
SVM_acc = accuracy_score(SVM_pred, y_test)*100

### LSTM

In [None]:
# Count of all unique words

def count_unique_words(tweets):
    unique = Counter()
    for tweet in tweets:
        for word in tweet.split():
            unique[word] += 1
    return unique


In [None]:
word_count = count_unique_words(X_test)

In [None]:
max_seq_length = 40

In [None]:
tokenizer = Tokenizer(num_words=len(word_count))
tokenizer.fit_on_texts(X_test)

In [None]:
test_tweet_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [None]:
test_tweet_pad[0]

In [None]:
lstm_pred = lstm.predict(test_tweet_pad)

In [None]:
LSTM_pred =[]
for i in lstm_pred:
    a = float(i)
    if a>=0.5:
        a = 1
    else:
        a = 0
    LSTM_pred.append(a)

In [None]:
LSTM_pred = np.array(LSTM_pred)

In [None]:
lstm_acc = accuracy_score(LSTM_pred, y_test)
lstm_acc

In [None]:
scores = lstm.evaluate(test_tweet_pad, y_test)
scores

## Random Forest

In [101]:
RF_pred = RF.predict(xtest)

In [102]:
RF_acc = accuracy_score(RF_pred, y_test)*100
RF_acc

71.23260844408442

## Decision Tree

In [91]:
DT_pred = DT.predict(xtest)

In [92]:
len(DT_pred)

112339

In [93]:
DT_acc = accuracy_score(DT_pred, y_test)*100
DT_acc

72.14146467388886

## Putting predictions in dataframe

In [45]:
pdf = pd.DataFrame(y_test,columns=['Expected'])

In [47]:
pdf['SVM Predictions'] = SVM_pred
pdf['Naive Bayes Predictions'] = NB3
#pdf['LSTM Predictions'] = LSTM_pred
#pdf['Random Forest Predictions'] = RF_pred
pdf['Decision Tree Predictions'] = DT_pred

In [48]:
pdf.head()[1:3]

Unnamed: 0,Expected,SVM Predictions,Naive Bayes Predictions,Decision Tree Predictions
1,1,1,1,1
2,1,1,1,0


### Ensemble func

In [105]:
deno = (NB_acc + SVM_acc+RF_acc + DT_acc)
wNB = NB_acc/deno
wSVM = SVM_acc/deno
#wLSTM = lstm_acc/deno
wRF = RF_acc/deno
wDT = DT_acc/deno
classWeights = [wNB, wSVM,wRF, wDT]
classWeights

[0.28262781349315763,
 0.24990059353640307,
 0.23225413226138822,
 0.2352174607090511]

In [106]:
def ensemble_predict(a):
    pc = 0
    nc = 0
    for i in a:
        if i == 'Expected':
            continue
        if int(a[i]) ==1:
            pc+=1
        else:
            nc+=1
            
    p_pc = pc/(pc+nc)
    p_nc = 1-p_pc
    pscore =0
    nscore =0
    for w in classWeights:
        pscore+=w*pc
        nscore+=w*nc
    
    if pscore>nscore:
        return 1
    else:
        return 0

In [107]:
l = len(pdf)
l

100000

In [108]:
ensPred = []
for i in range(l):
    ensPred.append(ensemble_predict(pdf[i:i+1]))

In [109]:
accuracy_score(ensPred, y_test)*100

ValueError: Found input variables with inconsistent numbers of samples: [100000, 112339]