# Ensemble learning (bagging)

In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
from wordcloud import WordCloud
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

# reading data

In [2]:
# Load the labelled tweet dataset from a CSV file located relative to the
# notebook's working directory. The preview further down shows `Tweet` and
# `Label` columns.
train  = pd.read_csv('2TwitterDataset250k.csv')


In [3]:
# Preview the loaded frame (the rendered output shows the first and last rows).
train

Unnamed: 0,Tweet,Label
0,@darthrosenburg OMH. I'm so sorry to hear tha...,0
1,@helloimmiranda ...i resized it though,0
2,gotta wait till tomorrow to see the new moon t...,0
3,Dumb Fox announcers were saying Jeff was OK an...,0
4,All these BBQ's means I'm basically living on ...,0
...,...,...
249995,@MizFitOnline seriously! I had more free time ...,1
249996,Maybe if no one bothered revising then that'll...,1
249997,@missbossy good to know you're feeling better..,1
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1


# Removing Twitter Handles (@user)

In [4]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is replaced by ''.

    Returns
    -------
    str
        The cleaned text. Text without any match is returned unchanged.

    Notes
    -----
    The substitution is done in one pass with ``pattern`` itself. Feeding
    each matched string back into ``re.sub`` would treat the match as a new
    regex: a short match such as ``@ab`` would also strip the prefix of a
    longer one (``@abc`` -> ``c``), and matches containing regex
    metacharacters would be misinterpreted.
    """
    return re.sub(pattern, '', input_txt)
# remove twitter handles (@user)
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence, which Python reports with a warning.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], r"@[\w]*")

In [5]:
# Inspect the tweets after the @handle removal step.
train['tidy_tweet']

0                        OMH.  I'm so sorry to hear that   
1                                   ...i resized it though 
2         gotta wait till tomorrow to see the new moon t...
3         Dumb Fox announcers were saying Jeff was OK an...
4         All these BBQ's means I'm basically living on ...
                                ...                        
249995     seriously! I had more free time when I worked...
249996    Maybe if no one bothered revising then that'll...
249997                good to know you're feeling better.. 
249998     it is  #andyhurleyday #andyhurleyday #andyhur...
249999    Morning.  Kids slept in Til 7:45am!!  Praise G...
Name: tidy_tweet, Length: 250000, dtype: object

# Remove special characters, numbers, and punctuation

In [6]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# special characters) with a space.
# regex=True is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string and nothing would be replaced.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z]", " ", regex=True)

In [7]:
def _keep_long_words(text):
    """Return ``text`` reduced to its whitespace-separated words longer than 3 characters."""
    return ' '.join(word for word in text.split() if len(word) > 3)


# Drop very short words (3 characters or fewer) from every tweet.
train['tidy_tweet'] = train['tidy_tweet'].map(_keep_long_words)

# tokenisation 

In [8]:
# Split each cleaned tweet on whitespace into a list of word tokens.
tokenized_tweet1 = train['tidy_tweet'].map(str.split)
tokenized_tweet1.tail()

249995    [seriously, more, free, time, when, worked, of...
249996        [Maybe, bothered, revising, then, that, help]
249997                        [good, know, feeling, better]
249998    [andyhurleyday, andyhurleyday, andyhurleyday, ...
249999    [Morning, Kids, slept, Praise, Went, sleep, Ee...
Name: tidy_tweet, dtype: object

# Stemming

In [9]:
# Import the one name that is used instead of `import *`, so the notebook
# namespace is not silently filled with everything the module exposes.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Replace every token by its Porter stem (see the output below, e.g.
# "feeling" -> "feel"). Re-running this cell stems the already-stemmed tokens
# again, so run it once per kernel session.
tokenized_tweet1 = tokenized_tweet1.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet1.tail()

249995    [serious, more, free, time, when, work, offic,...
249996              [mayb, bother, revis, then, that, help]
249997                           [good, know, feel, better]
249998    [andyhurleyday, andyhurleyday, andyhurleyday, ...
249999    [morn, kid, slept, prais, went, sleep, eeeek, ...
Name: tidy_tweet, dtype: object

In [10]:
# Re-join each list of stemmed tokens into one space-separated string.
# A single vectorised apply replaces the element-by-element Python loop: the
# loop assigned through integer *labels* (`series[i] = ...`), which is slow for
# a large frame and only correct while the index is the default 0..n-1 range.
tokenized_tweet1 = tokenized_tweet1.apply(' '.join)

train['tidy_tweet'] = tokenized_tweet1

In [11]:
# Check the last rows to confirm `tidy_tweet` now holds the stemmed text.
train.tail()

Unnamed: 0,Tweet,Label,tidy_tweet
249995,@MizFitOnline seriously! I had more free time ...,1,serious more free time when work offic week ju...
249996,Maybe if no one bothered revising then that'll...,1,mayb bother revis then that help
249997,@missbossy good to know you're feeling better..,1,good know feel better
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1,andyhurleyday andyhurleyday andyhurleyday andy...
249999,Morning. Kids slept in Til 7:45am!! Praise G...,1,morn kid slept prais went sleep eeeek suuuuper...


# TF-IDF Features

In [12]:
# Model inputs: the cleaned tweet text is the feature, `Label` is the target.
X, y = train['tidy_tweet'], train['Label']

In [13]:
# TF-IDF vectoriser used to turn the cleaned tweets into a sparse feature matrix.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',   # drop the built-in English stop-word list
    max_features=25000,     # cap on vocabulary size
    min_df=3,               # lower document-frequency cut-off
    max_df=0.90,            # upper document-frequency cut-off
)

In [14]:
# Fit the TF-IDF vectoriser on the complete `tidy_tweet` column and transform
# it in the same step. There is no separate test set at this point.
# NOTE(review): the vectoriser is fitted on all rows *before* the
# cross-validation splits below, so its vocabulary and IDF weights also see
# the rows that later act as validation folds. Consider fitting it inside each
# fold (e.g. via a Pipeline) if leakage-free scores are required.
x_train_tfidf =  tfidf.fit_transform(X) 


# Base Model LogisticRegression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is logistic regression. Prints the F1 score of every fold, the wall-clock
# time in minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base estimator is passed positionally because its keyword name
    # differs between scikit-learn releases. random_state makes the bootstrap
    # samples (and therefore the scores) reproducible across runs.
    model = BaggingClassifier(LogisticRegression(max_iter=1000), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')



1 of kfold 10
f1_score 0.7727912431587176

2 of kfold 10
f1_score 0.772265625

3 of kfold 10
f1_score 0.7661980680122517

4 of kfold 10
f1_score 0.7671660424469414

5 of kfold 10
f1_score 0.7655910642258764

6 of kfold 10
f1_score 0.7648346493628333

7 of kfold 10
f1_score 0.7694581664008712

8 of kfold 10
f1_score 0.7640361516491255

9 of kfold 10
f1_score 0.7670590075470222

10 of kfold 10
f1_score 0.7686913156971988
Execution time: 10.402401546637217 min
[0.7727912431587176, 0.772265625, 0.7661980680122517, 0.7671660424469414, 0.7655910642258764, 0.7648346493628333, 0.7694581664008712, 0.7640361516491255, 0.7670590075470222, 0.7686913156971988] 
 Max F1-Score 0.7727912431587176 
 Mean F1-Score 0.7678091333500838


# Base Model Decision Tree

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is a decision tree. Prints the F1 score of every fold, the wall-clock time
# in minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # random_state makes the bootstrap samples and the trees (and therefore
    # the scores) reproducible across runs.
    model = BaggingClassifier(DecisionTreeClassifier(), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
# Unit label fixed: the message previously ended in the typo ' miin'.
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')



1 of kfold 10
f1_score 0.7277289836888331

2 of kfold 10
f1_score 0.7226473461413221

3 of kfold 10
f1_score 0.7200229254513448

4 of kfold 10
f1_score 0.716602820115739

5 of kfold 10
f1_score 0.7230012894906512

6 of kfold 10
f1_score 0.7197632944228274

7 of kfold 10
f1_score 0.7256543651438792

8 of kfold 10
f1_score 0.7187129966110001

9 of kfold 10
f1_score 0.7264246603123099

10 of kfold 10
f1_score 0.7274570223807979
Execution time: 207.31587549448014 miin
[0.7277289836888331, 0.7226473461413221, 0.7200229254513448, 0.716602820115739, 0.7230012894906512, 0.7197632944228274, 0.7256543651438792, 0.7187129966110001, 0.7264246603123099, 0.7274570223807979] 
 Max F1-Score 0.7277289836888331 
 Mean F1-Score 0.7228015703758705


# Base Model Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import BaggingClassifier 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is multinomial Naive Bayes. Prints the F1 score of every fold, the
# wall-clock time in minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base estimator is passed positionally because its keyword name
    # differs between scikit-learn releases. random_state makes the bootstrap
    # samples (and therefore the scores) reproducible across runs.
    model = BaggingClassifier(MultinomialNB(), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.7584078499643904

2 of kfold 10
f1_score 0.7590342307997777

3 of kfold 10
f1_score 0.7425169235788271

4 of kfold 10
f1_score 0.7539773854393969

5 of kfold 10
f1_score 0.7448453608247423

6 of kfold 10
f1_score 0.7418046088932165

7 of kfold 10
f1_score 0.7515722026658228

8 of kfold 10
f1_score 0.7494035310959122

9 of kfold 10
f1_score 0.7522556540403037

10 of kfold 10
f1_score 0.7545071876737351
Execution time: 0.1309056043624878 min
[0.7584078499643904, 0.7590342307997777, 0.7425169235788271, 0.7539773854393969, 0.7448453608247423, 0.7418046088932165, 0.7515722026658228, 0.7494035310959122, 0.7522556540403037, 0.7545071876737351] 
 Max F1-Score 0.7590342307997777 
 Mean F1-Score 0.7508324934976125


# Base Model SVM


In [16]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is a linear SVM. Prints the F1 score of every fold, the wall-clock time in
# minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base estimator is passed positionally because its keyword name
    # differs between scikit-learn releases. random_state makes the bootstrap
    # samples (and therefore the scores) reproducible across runs.
    model = BaggingClassifier(LinearSVC(), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')



1 of kfold 10
f1_score 0.7628035538005923

2 of kfold 10
f1_score 0.7602821142721293

3 of kfold 10
f1_score 0.7529796308845218

4 of kfold 10
f1_score 0.7549548121135246

5 of kfold 10
f1_score 0.7541254125412541

6 of kfold 10
f1_score 0.7543011178942362

7 of kfold 10
f1_score 0.7559297912713472

8 of kfold 10
f1_score 0.7528888535917087

9 of kfold 10
f1_score 0.7559248269040554

10 of kfold 10
f1_score 0.7580320094842915
Execution time: 16.59644770224889 min
[0.7628035538005923, 0.7602821142721293, 0.7529796308845218, 0.7549548121135246, 0.7541254125412541, 0.7543011178942362, 0.7559297912713472, 0.7528888535917087, 0.7559248269040554, 0.7580320094842915] 
 Max F1-Score 0.7628035538005923 
 Mean F1-Score 0.7562222122757662


# Base Model KNN

In [15]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is a 3-nearest-neighbours classifier. Prints the F1 score of every fold,
# the wall-clock time in minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base estimator is passed positionally because its keyword name
    # differs between scikit-learn releases. random_state makes the bootstrap
    # samples (and therefore the scores) reproducible across runs.
    model = BaggingClassifier(KNeighborsClassifier(n_neighbors=3), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')



1 of kfold 10
f1_score 0.696564220503171

2 of kfold 10
f1_score 0.6965697346132128

3 of kfold 10
f1_score 0.6940348173316053

4 of kfold 10
f1_score 0.6971061542220826

5 of kfold 10
f1_score 0.6923592256972603

6 of kfold 10
f1_score 0.6152962488401891

7 of kfold 10
f1_score 0.625130707563611

8 of kfold 10
f1_score 0.6148400513251625

9 of kfold 10
f1_score 0.7010569933630648

10 of kfold 10
f1_score 0.6249014281959169
Execution time: 236.5935758392016 min
[0.696564220503171, 0.6965697346132128, 0.6940348173316053, 0.6971061542220826, 0.6923592256972603, 0.6152962488401891, 0.625130707563611, 0.6148400513251625, 0.7010569933630648, 0.6249014281959169] 
 Max F1-Score 0.7010569933630648 
 Mean F1-Score 0.6657859581655277


# Base Model Random Forest

In [16]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time
from numpy import mean

# Stratified 10-fold cross-validation of a bagging ensemble whose base model
# is a 10-tree random forest. Prints the F1 score of every fold, the
# wall-clock time in minutes, and the max / mean F1 over all folds.
start_time = time.time()

f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # split() yields *positional* indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only agrees with positions while
    # the Series has the default 0..n-1 index.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base estimator is passed positionally because its keyword name
    # differs between scikit-learn releases. random_state makes the bootstrap
    # samples and the forests (and therefore the scores) reproducible.
    model = BaggingClassifier(RandomForestClassifier(n_estimators=10), random_state=42)
    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)
print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')



1 of kfold 10
f1_score 0.7556677049244964

2 of kfold 10
f1_score 0.7573922337014608

3 of kfold 10
f1_score 0.7490020756825803

4 of kfold 10
f1_score 0.753225168183617

5 of kfold 10
f1_score 0.754234956569587

6 of kfold 10
f1_score 0.7531288246673774

7 of kfold 10
f1_score 0.7528683155345588

8 of kfold 10
f1_score 0.7498313157372494

9 of kfold 10
f1_score 0.755879903224527

10 of kfold 10
f1_score 0.7559509687623567
Execution time: 120.42282179196675 min
[0.7556677049244964, 0.7573922337014608, 0.7490020756825803, 0.753225168183617, 0.754234956569587, 0.7531288246673774, 0.7528683155345588, 0.7498313157372494, 0.755879903224527, 0.7559509687623567] 
 Max F1-Score 0.7573922337014608 
 Mean F1-Score 0.7537181466987811
