# Ensemble learning (sequential / boosting)

In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
from wordcloud import WordCloud
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

# reading data

In [2]:
# Load the tweet dataset from a CSV file in the working directory.
train  = pd.read_csv('2TwitterDataset250k.csv')

In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Tweet   250000 non-null  object
 1   Label   250000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.8+ MB


# Removing Twitter Handles (@user)

In [4]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex would misread metacharacters in the match and would let a short
    # match (e.g. "@ab") strip the prefix of a longer one (e.g. "@abc").
    return re.sub(pattern, '', input_txt)
# Remove Twitter handles (@user) from every tweet and store the result in a
# new column. The pattern is a raw string so that `\w` reaches the regex
# engine untouched instead of being parsed as an (invalid) string escape.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], r"@[\w]*")

In [5]:
# Display the handle-stripped tweets (pandas shows only the head and tail rows).
train['tidy_tweet']

0                        OMH.  I'm so sorry to hear that   
1                                   ...i resized it though 
2         gotta wait till tomorrow to see the new moon t...
3         Dumb Fox announcers were saying Jeff was OK an...
4         All these BBQ's means I'm basically living on ...
                                ...                        
249995     seriously! I had more free time when I worked...
249996    Maybe if no one bothered revising then that'll...
249997                good to know you're feeling better.. 
249998     it is  #andyhurleyday #andyhurleyday #andyhur...
249999    Morning.  Kids slept in Til 7:45am!!  Praise G...
Name: tidy_tweet, Length: 250000, dtype: object

# remove special characters, numbers, punctuations

In [6]:
# Replace every character that is not an ASCII letter or '#' with a space,
# which drops digits, punctuation and other special characters.
# regex=True is spelled out because recent pandas versions treat the pattern
# as a literal string by default, which would leave the text unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [7]:
# Keep only the words longer than three characters and re-join them with
# single spaces.
train['tidy_tweet'] = train['tidy_tweet'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

# tokenisation 

In [8]:
# Split each cleaned tweet on whitespace into a list of word tokens.
tokenized_tweet1 = train['tidy_tweet'].map(str.split)
tokenized_tweet1.tail()

249995    [seriously, more, free, time, when, worked, of...
249996        [Maybe, bothered, revising, then, that, help]
249997                        [good, know, feeling, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [Morning, Kids, slept, Praise, Went, sleep, Ee...
Name: tidy_tweet, dtype: object

# Stemming

In [9]:
# Import only the class that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem every token of every tweet. As the displayed tail shows, the stems
# come back lower-cased (e.g. "Maybe" -> "mayb").
tokenized_tweet1 = tokenized_tweet1.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
tokenized_tweet1.tail()

249995    [serious, more, free, time, when, work, offic,...
249996              [mayb, bother, revis, then, that, help]
249997                           [good, know, feel, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [morn, kid, slept, prais, went, sleep, eeeek, ...
Name: tidy_tweet, dtype: object

In [10]:
# Re-join each list of stemmed tokens into one space-separated string.
# A single vectorised apply replaces the per-row Python loop with item
# assignment, which was slow on 250k rows and relied on the Series having a
# default 0..n-1 index.
tokenized_tweet1 = tokenized_tweet1.apply(' '.join)

# Overwrite the working column with the stemmed text.
train['tidy_tweet'] = tokenized_tweet1

In [11]:
# Show the last rows to compare the raw Tweet text with the cleaned tidy_tweet.
train.tail()

Unnamed: 0,Tweet,Label,tidy_tweet
249995,@MizFitOnline seriously! I had more free time ...,1,serious more free time when work offic week ju...
249996,Maybe if no one bothered revising then that'll...,1,mayb bother revis then that help
249997,@missbossy good to know you're feeling better..,1,good know feel better
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1,#andyhurleyday #andyhurleyday #andyhurleyday #...
249999,Morning. Kids slept in Til 7:45am!! Praise G...,1,morn kid slept prais went sleep eeeek suuuuper...


# TF-IDF Features

In [12]:
# Model inputs: the cleaned, stemmed tweet text is the feature column and
# the integer Label column is the target.
X = train['tidy_tweet']
y = train['Label']

In [13]:
# Import the TF-IDF vectorizer (CountVectorizer is imported but not used in this cell).
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Vectorizer settings: max_df=0.90 and min_df=3 bound the document frequency
# of the terms that are kept, max_features=25000 caps the vocabulary size, and
# stop_words='english' removes scikit-learn's built-in English stop words.
# NOTE(review): the text was stemmed earlier, so some stop words may no longer
# match the built-in list — TODO confirm this is intended.
tfidf = TfidfVectorizer(max_df=0.90, min_df=3, max_features=25000, stop_words='english')
# Alternative configuration kept for reference (not active):
#   (min_df=3, max_features=100,
#    strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
#    ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
#    stop_words='english')

In [14]:
# Fit the TF-IDF vocabulary and IDF weights on the whole tidy_tweet column and
# transform it to a document-term matrix (no separate test set exists here).
# NOTE(review): the cross-validation cells split x_train_tfidf afterwards, so
# the held-out folds have already contributed to the vocabulary and IDF
# weights. Consider fitting the vectorizer inside each fold — TODO confirm.
x_train_tfidf =  tfidf.fit_transform(X) 


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC


# Naive Bayes

In [16]:
import time

from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of AdaBoost with a Multinomial Naive
# Bayes base learner. The F1 score of each held-out fold is collected in `f1`
# and the total wall-clock time is reported in minutes.
start_time = time.time()
f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # kf.split yields positional indices, so labels are selected with .iloc;
    # plain y[...] is label-based and only right for a default RangeIndex.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases; the
    # positional form works with both. random_state keeps runs repeatable.
    model = AdaBoostClassifier(MultinomialNB(), random_state=42)

    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)

print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.7026188979677352

2 of kfold 10
f1_score 0.7043956505310887

3 of kfold 10
f1_score 0.6972934172191022

4 of kfold 10
f1_score 0.6966480681293787

5 of kfold 10
f1_score 0.7002418077211707

6 of kfold 10
f1_score 0.6960792544706573

7 of kfold 10
f1_score 0.705094955241362

8 of kfold 10
f1_score 0.6977078575657616

9 of kfold 10
f1_score 0.6998150638870209

10 of kfold 10
f1_score 0.7007096963843278
Execution time: 1.2069494485855103 min
[0.7026188979677352, 0.7043956505310887, 0.6972934172191022, 0.6966480681293787, 0.7002418077211707, 0.6960792544706573, 0.705094955241362, 0.6977078575657616, 0.6998150638870209, 0.7007096963843278] 
 Max F1-Score 0.705094955241362 
 Mean F1-Score 0.7000604669117605


In [1]:
import sklearn as s
#s.metrics.SCORERS.keys()

# Logistic Regression

In [20]:
import time

from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of AdaBoost with a Logistic Regression
# base learner. The F1 score of each held-out fold is collected in `f1` and
# the total wall-clock time is reported in minutes.
start_time = time.time()
f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # kf.split yields positional indices, so labels are selected with .iloc;
    # plain y[...] is label-based and only right for a default RangeIndex.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases; the
    # positional form works with both. random_state keeps runs repeatable.
    model = AdaBoostClassifier(LogisticRegression(max_iter=1000), random_state=42)

    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)

print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.6978077571669478

2 of kfold 10
f1_score 0.7007182087029996

3 of kfold 10
f1_score 0.693721611410621

4 of kfold 10
f1_score 0.692129141169023

5 of kfold 10
f1_score 0.6967612015438832

6 of kfold 10
f1_score 0.6915895746208761

7 of kfold 10
f1_score 0.7013906447534766

8 of kfold 10
f1_score 0.6939279179609378

9 of kfold 10
f1_score 0.6967365573216097

10 of kfold 10
f1_score 0.6971462343802769
Execution time: 1.6111236174901327 min
[0.6978077571669478, 0.7007182087029996, 0.693721611410621, 0.692129141169023, 0.6967612015438832, 0.6915895746208761, 0.7013906447534766, 0.6939279179609378, 0.6967365573216097, 0.6971462343802769] 
 Max F1-Score 0.7013906447534766 
 Mean F1-Score 0.6961928849030652


# SVM

In [19]:
import time

from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of AdaBoost with a linear SVM base
# learner. The F1 score of each held-out fold is collected in `f1` and the
# total wall-clock time is reported in minutes.
start_time = time.time()
f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # kf.split yields positional indices, so labels are selected with .iloc;
    # plain y[...] is label-based and only right for a default RangeIndex.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases; the
    # positional form works with both. random_state keeps runs repeatable.
    # algorithm="SAMME" is kept from the original cell.
    # NOTE(review): recent scikit-learn releases deprecate the `algorithm`
    # argument — confirm against the installed version.
    model = AdaBoostClassifier(LinearSVC(), algorithm="SAMME", random_state=42)

    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)

print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.723742061553493

2 of kfold 10
f1_score 0.7520489392572839

3 of kfold 10
f1_score 0.7468974784602007

4 of kfold 10
f1_score 0.7364963503649636

5 of kfold 10
f1_score 0.7430352774050982

6 of kfold 10
f1_score 0.7429553539310124

7 of kfold 10
f1_score 0.7387292166348224

8 of kfold 10
f1_score 0.7440254287171839

9 of kfold 10
f1_score 0.7371703997731782

10 of kfold 10
f1_score 0.7303250288731233
Execution time: 3.727829134464264 min
[0.723742061553493, 0.7520489392572839, 0.7468974784602007, 0.7364963503649636, 0.7430352774050982, 0.7429553539310124, 0.7387292166348224, 0.7440254287171839, 0.7371703997731782, 0.7303250288731233] 
 Max F1-Score 0.7520489392572839 
 Mean F1-Score 0.739542553497036


# Decision Tree

In [16]:
import time

from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of AdaBoost with its default base
# learner (no estimator is passed). The F1 score of each held-out fold is
# collected in `f1` and the total wall-clock time is reported in minutes.
start_time = time.time()
f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # kf.split yields positional indices, so labels are selected with .iloc;
    # plain y[...] is label-based and only right for a default RangeIndex.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # random_state fixes the seed handed to the base learners so the reported
    # scores can be reproduced.
    model = AdaBoostClassifier(random_state=42)

    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)

print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.7203163618388533

2 of kfold 10
f1_score 0.7236549286555832

3 of kfold 10
f1_score 0.7204078276599244

4 of kfold 10
f1_score 0.7200554345674124

5 of kfold 10
f1_score 0.7215701745734293

6 of kfold 10
f1_score 0.7235718975705844

7 of kfold 10
f1_score 0.7218437933999073

8 of kfold 10
f1_score 0.7197101926560183

9 of kfold 10
f1_score 0.7234112502876681

10 of kfold 10
f1_score 0.7234424322977238
Execution time: 3.806368052959442 min
[0.7203163618388533, 0.7236549286555832, 0.7204078276599244, 0.7200554345674124, 0.7215701745734293, 0.7235718975705844, 0.7218437933999073, 0.7197101926560183, 0.7234112502876681, 0.7234424322977238] 
 Max F1-Score 0.7236549286555832 
 Mean F1-Score 0.7217984293507105


# Random Forest

In [16]:
import time

from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of AdaBoost (10 boosting rounds) with a
# 10-tree Random Forest base learner. The F1 score of each held-out fold is
# collected in `f1` and the total wall-clock time is reported in minutes.
start_time = time.time()
f1 = []
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(x_train_tfidf, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    xtr, xvl = x_train_tfidf[train_index], x_train_tfidf[test_index]
    # kf.split yields positional indices, so labels are selected with .iloc;
    # plain y[...] is label-based and only right for a default RangeIndex.
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases; the
    # positional form works with both. random_state makes the randomised
    # forests repeatable, and n_jobs=-1 builds the trees of each forest in
    # parallel (the recorded run of this cell took about 247 minutes).
    model = AdaBoostClassifier(
        RandomForestClassifier(n_estimators=10, n_jobs=-1),
        n_estimators=10,
        random_state=42,
    )

    model.fit(xtr, ytr)
    pred = model.predict(xvl)
    f = f1_score(yvl, pred)
    print("f1_score", f)
    f1.append(f)

print("Execution time: " + str((time.time() - start_time)/60) + ' min')
print(f'{f1} \n Max F1-Score {max(f1)} \n Mean F1-Score {mean(f1)}')


1 of kfold 10
f1_score 0.7462298696696461

2 of kfold 10
f1_score 0.7472205736894163

3 of kfold 10
f1_score 0.7401437249374676

4 of kfold 10
f1_score 0.743091417983995

5 of kfold 10
f1_score 0.7426551453260016

6 of kfold 10
f1_score 0.746723343960326

7 of kfold 10
f1_score 0.7429768066695642

8 of kfold 10
f1_score 0.7417150531263578

9 of kfold 10
f1_score 0.7441417866063995

10 of kfold 10
f1_score 0.7473497536945812
Execution time: 247.1474783062935 min
[0.7462298696696461, 0.7472205736894163, 0.7401437249374676, 0.743091417983995, 0.7426551453260016, 0.746723343960326, 0.7429768066695642, 0.7417150531263578, 0.7441417866063995, 0.7473497536945812] 
 Max F1-Score 0.7473497536945812 
 Mean F1-Score 0.7442247475663756
