# Ensemble learning(voting)

In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
import time
start = time.time()


# reading data

In [3]:
train  = pd.read_csv('2TwitterDataset250k.csv')

In [4]:
train

Unnamed: 0,Tweet,Label
0,@darthrosenburg OMH. I'm so sorry to hear tha...,0
1,@helloimmiranda ...i resized it though,0
2,gotta wait till tomorrow to see the new moon t...,0
3,Dumb Fox announcers were saying Jeff was OK an...,0
4,All these BBQ's means I'm basically living on ...,0
...,...,...
249995,@MizFitOnline seriously! I had more free time ...,1
249996,Maybe if no one bothered revising then that'll...,1
249997,@missbossy good to know you're feeling better..,1
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1


# Removing Twitter Handles (@user)

In [5]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 
# remove twitter handles (@user)
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], "@[\w]*")

In [6]:
train['tidy_tweet']

0                        OMH.  I'm so sorry to hear that   
1                                   ...i resized it though 
2         gotta wait till tomorrow to see the new moon t...
3         Dumb Fox announcers were saying Jeff was OK an...
4         All these BBQ's means I'm basically living on ...
                                ...                        
249995     seriously! I had more free time when I worked...
249996    Maybe if no one bothered revising then that'll...
249997                good to know you're feeling better.. 
249998     it is  #andyhurleyday #andyhurleyday #andyhur...
249999    Morning.  Kids slept in Til 7:45am!!  Praise G...
Name: tidy_tweet, Length: 250000, dtype: object

# remove special characters, numbers, punctuations

In [7]:
# remove special characters, numbers, punctuations
train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

In [8]:
train['tidy_tweet'] = train['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# tokenisation 

In [9]:
tokenized_tweet1 = train['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet1.tail()

249995    [seriously, more, free, time, when, worked, of...
249996        [Maybe, bothered, revising, then, that, help]
249997                        [good, know, feeling, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [Morning, Kids, slept, Praise, Went, sleep, Ee...
Name: tidy_tweet, dtype: object

# stemning

In [10]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet1 = tokenized_tweet1.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet1.tail()

249995    [serious, more, free, time, when, work, offic,...
249996              [mayb, bother, revis, then, that, help]
249997                           [good, know, feel, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [morn, kid, slept, prais, went, sleep, eeeek, ...
Name: tidy_tweet, dtype: object

In [11]:
for i in range(len(tokenized_tweet1)):
    tokenized_tweet1[i] = ' '.join(tokenized_tweet1[i])

train['tidy_tweet'] = tokenized_tweet1

In [12]:
train.tail()

Unnamed: 0,Tweet,Label,tidy_tweet
249995,@MizFitOnline seriously! I had more free time ...,1,serious more free time when work offic week ju...
249996,Maybe if no one bothered revising then that'll...,1,mayb bother revis then that help
249997,@missbossy good to know you're feeling better..,1,good know feel better
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1,#andyhurleyday #andyhurleyday #andyhurleyday #...
249999,Morning. Kids slept in Til 7:45am!! Praise G...,1,morn kid slept prais went sleep eeeek suuuuper...


# TF-IDF Features

In [13]:
X = train['tidy_tweet'] 
y = train['Label']


In [14]:
#Importing TFIDF 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfidf = TfidfVectorizer(max_df=0.90, min_df=3, max_features=25000, stop_words='english')

In [15]:
#Fitting TFIDF to both training and test
x_train_tfidf =  tfidf.fit_transform(X) 


In [16]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import f1_score

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC

In [18]:
#sklearn.metrics.SCORERS.keys()

In [19]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression ,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [20]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.neighbors import KNeighborsClassifier

# D,R,S

In [21]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('Randomforest', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision tree', model2))
model3 = LinearSVC()
estimators.append(('svm', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators,voting="hard")
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.75284261 0.74719551 0.75315054 0.75382945 0.75319927 0.74935029
 0.74614114 0.75152052 0.75299646 0.75329491]

Maximum f1_Score :0.753829453624412 
Mean f1_score:0.7513520695046593


# D,R,L

In [22]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('Randomforest', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision tree', model2))
model3 = LogisticRegression(max_iter=1000)
estimators.append(('Logistic', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.75230018 0.75243243 0.75151466 0.75078509 0.7568383  0.7487767
 0.74750231 0.75197421 0.75273628 0.74858985]

Maximum f1_Score :0.756838302433602 
Mean f1_score:0.751345001884611


# D,R,N

In [23]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('Randomforest', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision tree', model2))
model3 = MultinomialNB()
estimators.append(('navie', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.74808915 0.74888415 0.75221024 0.74867436 0.75359081 0.74363505
 0.74264498 0.75038421 0.74915744 0.74742394]

Maximum f1_Score :0.7535908075327163 
Mean f1_score:0.7484694315979908


# D,R,K

In [24]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('Randomforest', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision tree', model2))
model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))

# create the ensemble model
ensemble = VotingClassifier(estimators,voting="hard")
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.71672469 0.71502851 0.71971266 0.71291544 0.72189547 0.71397398
 0.70936919 0.71734992 0.71721311 0.7098768 ]

Maximum f1_Score :0.7218954657322821 
Mean f1_score:0.7154059775640154


# D,S,L

In [25]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision tree', model2))
model3 = LinearSVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76716267 0.76712007 0.77049632 0.77159037 0.76963046 0.76559957
 0.76360218 0.76964132 0.76769333 0.76910403]

Maximum f1_Score :0.7715903728173666 
Mean f1_score:0.7681640313181355


# D,S,N

In [27]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('decision tree', model1))
model2 = LinearSVC()
estimators.append(('svm', model2))
model3 = MultinomialNB()
estimators.append(("navie bayes",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76431122 0.76373844 0.76671301 0.76501865 0.76494024 0.76030857
 0.75677812 0.76056896 0.76425646 0.76366082]

Maximum f1_Score :0.7667130089374379 
Mean f1_score:0.7630294486220341


# D,S,K

In [32]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('decision tree', model1))
model2 = LinearSVC()
estimators.append(('svm', model2))
model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.7329244  0.72740901 0.73397045 0.72955764 0.73157174 0.72794546
 0.71952911 0.73326347 0.72908168 0.72932678]

Maximum f1_Score :0.7339704458384204 
Mean f1_score:0.7294579738524246


# D,L,N

In [33]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('decision tree', model1))

model2 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model2))

model3 = MultinomialNB()
estimators.append(("navie bayes",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76631117 0.76721415 0.76902929 0.76894375 0.76945886 0.76325972
 0.75971564 0.76818991 0.76759983 0.76708971]

Maximum f1_Score :0.7694588584136397 
Mean f1_score:0.7666812027032336


# D,L,K

In [34]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('decision tree', model1))

model2 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.7370892  0.73443656 0.74027331 0.7342637  0.73877652 0.73128154
 0.72744714 0.74115146 0.73558368 0.73364779]

Maximum f1_Score :0.7411514570392296 
Mean f1_score:0.7353950893321342


# D,N,K

In [35]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('decision tree', model1))

model3 = MultinomialNB()
estimators.append(("navie bayes",model3))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.71622201 0.71923209 0.73373132 0.7255374  0.71989752 0.71603243
 0.71487919 0.72563896 0.71573082 0.72310942]

Maximum f1_Score :0.7337313180066896 
Mean f1_score:0.721001115830685


# R,S,L

In [36]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))
model2 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model2))
model3 = LinearSVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76737489 0.76796231 0.77146913 0.77109954 0.77146077 0.76515181
 0.76370065 0.76979351 0.76900859 0.77042802]

Maximum f1_Score :0.7714691270404542 
Mean f1_score:0.7687449212155683


# R,S,N

In [37]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model1))

model2 = LinearSVC()
estimators.append(('svm', model2))

model3 = MultinomialNB()
estimators.append(("navie bayes",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76394308 0.76458465 0.76540748 0.76513375 0.76467818 0.76019024
 0.75801911 0.76145    0.76588022 0.76370135]

Maximum f1_Score :0.7658802177858439 
Mean f1_score:0.7632988067923105



# R,S,K

In [38]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model1))

model2 = LinearSVC()
estimators.append(('svm', model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.73933649 0.73536045 0.74110605 0.74220226 0.73862504 0.73864959
 0.73298605 0.74235551 0.73895414 0.73987323]

Maximum f1_Score :0.7423555107526881 
Mean f1_score:0.7389448832654875



# R,L,N

In [21]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model1))

model2 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model2))

model3 = MultinomialNB()
estimators.append(("navie bayes",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76626544 0.76726221 0.76942675 0.76990341 0.77055023 0.76324061
 0.76087129 0.76807169 0.76807253 0.76824919]

Maximum f1_Score :0.7705502251811239 
Mean f1_score:0.7671913344840444


# R,L,K

In [21]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model1))

model2 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.7423164  0.74128547 0.74512261 0.74515039 0.74795612 0.74142426
 0.73878518 0.75056105 0.74637861 0.74331997]

Maximum f1_Score :0.7505610506192336 
Mean f1_score:0.7442300067615955


# R,N,K

In [22]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = RandomForestClassifier(n_estimators=10)
estimators.append(('randomForest', model1))

model2 = MultinomialNB()
estimators.append(("navie bayes",model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.72573044 0.72950573 0.74046803 0.73751958 0.72992763 0.72527473
 0.72754934 0.73673929 0.72689058 0.73479196]

Maximum f1_Score :0.7404680292835683 
Mean f1_score:0.7314397301058385


# S,L,N

In [22]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))

model2 = MultinomialNB()
estimators.append(("navie bayes",model2))

model3 = LinearSVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76484782 0.76655298 0.76711788 0.76854039 0.76671447 0.76242965
 0.75997646 0.76472438 0.76640338 0.76657762]

Maximum f1_Score :0.7685403873405763 
Mean f1_score:0.765388503961175


# S,L,K

In [26]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = LinearSVC()
estimators.append(('svm', model1))

model2 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.76156724 0.76106898 0.76470118 0.76567604 0.76176239 0.76024542
 0.75666216 0.76253781 0.76153937 0.76427095]

Maximum f1_Score :0.7656760417496613 
Mean f1_score:0.7620031541824102


# S,N,K

In [25]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = LinearSVC()
estimators.append(('svm', model1))

model2 = MultinomialNB()
estimators.append(("navie bayes",model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.74480676 0.74931378 0.75852881 0.75775546 0.74651833 0.74559009
 0.74689197 0.75181892 0.74645296 0.75388705]

Maximum f1_Score :0.7585288061690016 
Mean f1_score:0.7501564125130757


# L,N,K

In [23]:
seed = 7
kfold = model_selection.KFold(n_splits=10,shuffle=True,random_state=seed)
# create the sub models
estimators = []

model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))

model2 = MultinomialNB()
estimators.append(("navie bayes",model2))

model3 = KNeighborsClassifier(n_neighbors=3)
estimators.append(("KNN",model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train_tfidf, y, scoring="f1",cv=kfold)
print(f"{results}\n\nMaximum f1_Score :{results.max()} \nMean f1_score:{results.mean()}")

[0.74689441 0.7541682  0.76172001 0.76081949 0.75249939 0.74806423
 0.7521608  0.75656884 0.75014308 0.75679162]

Maximum f1_Score :0.7617200129324281 
Mean f1_score:0.7539830077131284


In [27]:
end = time.time()
min= (end-start)/60
print(f"Runtime of the program is {min} min")

Runtime of the program is 127.13094227711359 min
