<p style="background-color:#blue;color:black;font-size:22px;text-align:center;border-radius:10px 10px;font-weight:bold;border:2px solid black;">Natural Language Processing with Disaster Tweets<span style='font-size:28px; background-color:blue ;'></span></p>


<center><img src="https://github.com/Isharaneranjana/kaggle_gif/blob/main/NLP%20WITH%20DISASTER%20TWEETS.gif?raw=true"></center>

## <p style="background-color:#FC7D77;color:black;font-size:20px;text-align:center;border-radius:10px 10px;"> Introduction 🎯</p>
<font size="4">Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies are interested in programmatically monitoring Twitter. In this notebook, I am going to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. The dataset consists of 10,000 tweets that were hand-classified. </font>


In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
!pip install tokenization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from wordcloud import WordCloud
import gc
import re
import string
import operator
from collections import defaultdict
import tokenization
from wordcloud import STOPWORDS

In [None]:
# Read the labelled training tweets and the unlabelled test tweets.
DATA_DIR = '/kaggle/input/nlp-getting-started'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
train.head()

In [None]:
# Heatmap of the NaN mask: every coloured cell is a missing value in that column.
missing_mask = train.isna()
plt.figure(figsize=(8, 6))
sns.heatmap(missing_mask, cmap='Reds', cbar=False, yticklabels=False)
plt.xticks(fontsize=12, rotation=35)
plt.title("Missing values", fontsize=14)
plt.show()

In [None]:
# Class balance of the training set: share of tweets (pie) and absolute counts (bars).
fig, axes = plt.subplots(ncols=2, figsize=(12, 4))

# Build the label text from the data itself so the numbers shown can never
# drift from what is actually plotted.
class_names = {0: 'Not Disaster', 1: 'Disaster'}
target_counts = train.groupby('target').count()['id']
target_share = target_counts / target_counts.sum() * 100
pie_labels = [
    f'{class_names.get(t, str(t))} ({target_share.loc[t]:.0f}%)' for t in target_counts.index
]
bar_labels = [
    f'{class_names.get(t, str(t))} ({target_counts.loc[t]})' for t in target_counts.index
]

target_counts.plot(kind='pie', ax=axes[0], labels=pie_labels, colors=['lightcoral', 'lightskyblue'])
sns.countplot(x=train['target'], hue=train['target'], ax=axes[1], palette="RdBu")

axes[0].set_ylabel('')
axes[1].set_ylabel('')
# Pin the tick positions before relabelling them; setting labels on
# automatically placed ticks triggers a matplotlib warning.
axes[1].set_xticks(list(range(len(bar_labels))))
axes[1].set_xticklabels(bar_labels)
for ax in axes:
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

axes[0].set_title('Target Distribution in Training Set', fontsize=13)
axes[1].set_title('Target Count in Training Set', fontsize=13)

# Lay out the figure once all artists exist so labels and titles are accounted for.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the ten most frequent user-supplied locations.
top_locations = train.location.value_counts()[:10]

plt.figure(figsize=(9, 6))
ax = plt.axes()
ax.set_facecolor('white')
ax = top_locations.plot(kind='bar', color='lightcoral', linewidth=2, edgecolor='white')
ax.set_title('Location Count', fontsize=14)
ax.set_xlabel('Location', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.xaxis.set_tick_params(labelsize=12, rotation=30)
ax.yaxis.set_tick_params(labelsize=12)


In [None]:
def generate_ngrams(text, n_gram=1, stopwords=None):
    """Return the word n-grams of ``text`` after lowercasing and stopword removal.

    Parameters
    ----------
    text : str
        Raw text to tokenise.
    n_gram : int, default 1
        Number of consecutive tokens per n-gram.
    stopwords : container of str, optional
        Tokens to discard before n-grams are built. Defaults to the
        module-level ``STOPWORDS`` collection.

    Returns
    -------
    list of str
        Space-joined n-grams in order of appearance; empty when the text has
        fewer than ``n_gram`` remaining tokens.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # split() with no argument breaks on any whitespace run, so words separated
    # by newlines or tabs are not glued into a single token.
    tokens = [tok for tok in text.lower().split() if tok not in stopwords]
    # Zipping the token list against its own shifted copies yields sliding windows.
    windows = zip(*[tokens[offset:] for offset in range(n_gram)])
    return [' '.join(window) for window in windows]

N = 50
DISASTER_TWEETS = train['target'] == 1


def _count_ngrams(tweets, n_gram):
    """Tally how often each n-gram of size ``n_gram`` occurs across ``tweets``."""
    tally = defaultdict(int)
    for tweet in tweets:
        for gram in generate_ngrams(tweet, n_gram=n_gram):
            tally[gram] += 1
    return tally


def _frequency_table(tally):
    """Turn a tally into a two-column frame (0: n-gram, 1: count), most frequent first."""
    # Ascending stable sort followed by a reversal keeps the original tie order.
    return pd.DataFrame(sorted(tally.items(), key=lambda pair: pair[1])[::-1])


disaster_text = train[DISASTER_TWEETS]['text']
nondisaster_text = train[~DISASTER_TWEETS]['text']

# Unigrams
disaster_unigrams = _count_ngrams(disaster_text, 1)
nondisaster_unigrams = _count_ngrams(nondisaster_text, 1)
df_disaster_unigrams = _frequency_table(disaster_unigrams)
df_nondisaster_unigrams = _frequency_table(nondisaster_unigrams)

# Bigrams
disaster_bigrams = _count_ngrams(disaster_text, 2)
nondisaster_bigrams = _count_ngrams(nondisaster_text, 2)
df_disaster_bigrams = _frequency_table(disaster_bigrams)
df_nondisaster_bigrams = _frequency_table(nondisaster_bigrams)

# Trigrams
disaster_trigrams = _count_ngrams(disaster_text, 3)
nondisaster_trigrams = _count_ngrams(nondisaster_text, 3)
df_disaster_trigrams = _frequency_table(disaster_trigrams)
df_nondisaster_trigrams = _frequency_table(nondisaster_trigrams)

In [None]:
# Top-N unigrams for each class, drawn side by side as horizontal bars.
fig, axes = plt.subplots(ncols=2, figsize=(18, 30), dpi=100)
plt.tight_layout()

panels = [
    (df_disaster_unigrams, 'lightcoral', f'Top {N} most common unigrams in Disaster Tweets'),
    (df_nondisaster_unigrams, 'lightskyblue', f'Top {N} most common unigrams in Non-disaster Tweets'),
]
for ax, (freq, colour, heading) in zip(axes, panels):
    sns.barplot(y=freq[0].values[:N], x=freq[1].values[:N], ax=ax, color=colour)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)
    ax.set_title(heading, fontsize=15)

plt.show()

In [None]:
# Top-N bigrams for each class, drawn side by side as horizontal bars.
fig, axes = plt.subplots(ncols=2, figsize=(18,30), dpi=100)
plt.tight_layout()

panels = [
    (df_disaster_bigrams, 'lightcoral', f'Top {N} most common bigrams in Disaster Tweets'),
    (df_nondisaster_bigrams, 'lightskyblue', f'Top {N} most common bigrams in Non-disaster Tweets'),
]
for ax, (freq, colour, heading) in zip(axes, panels):
    sns.barplot(y=freq[0].values[:N], x=freq[1].values[:N], ax=ax, color=colour)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)
    ax.set_title(heading, fontsize=15)

plt.show()

In [None]:
# Top-N trigrams for each class; y tick labels are smaller because trigrams are long.
fig, axes = plt.subplots(ncols=2, figsize=(20, 30), dpi=100)

panels = [
    (df_disaster_trigrams, 'lightcoral', f'Top {N} most common trigrams in Disaster Tweets'),
    (df_nondisaster_trigrams, 'lightskyblue', f'Top {N} most common trigrams in Non-disaster Tweets'),
]
for ax, (freq, colour, heading) in zip(axes, panels):
    sns.barplot(y=freq[0].values[:N], x=freq[1].values[:N], ax=ax, color=colour)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_title(heading, fontsize=15)

plt.show()

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium 
from folium import plugins 

# Ten most frequent user-supplied locations and how often each occurs.
top_locations = train['location'].value_counts()[:10]
new_df = pd.DataFrame({'location': top_locations.index, 'count': top_locations.values})

geolocator = Nominatim(user_agent = 'Rahil')
# Nominatim's usage policy allows at most one request per second.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 1)

lat = {}
long = {}
for place in new_df['location']:
    location = geocode(place)
    # Free-text locations may not resolve (and the rate limiter returns None
    # on swallowed errors); leave those rows without coordinates.
    if location is None:
        continue
    lat[place] = location.latitude
    long[place] = location.longitude
new_df['latitude'] = new_df['location'].map(lat)
new_df['longitude'] = new_df['location'].map(long)

# Named `tweet_map` rather than `map` so the builtin is not shadowed for later cells.
tweet_map = folium.Map(location = [10.0, 10.0], tiles = 'CartoDB dark_matter', zoom_start = 1.5)
title = '''<h1 align = "center" style = "font-size: 15px"><b>Top 10 Tweet Locations</b></h1>'''
for _, row in new_df.iterrows():
    # Skip locations that could not be geocoded or have nothing to draw.
    if pd.isna(row['latitude']) or pd.isna(row['longitude']) or not row['count'] > 0:
        continue
    # Marker radius is proportional to the number of tweets from that location.
    radius = float(row['count'] * 0.4)
    folium.CircleMarker([float(row['latitude']), float(row['longitude'])], radius = radius, color = 'lightcoral', fill = True).add_to(tweet_map)
tweet_map.get_root().html.add_child(folium.Element(title))
tweet_map

In [None]:
# Tweets whose identical text appears with more than one distinct target label.
distinct_per_text = train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
has_conflict = distinct_per_text['target'] > 1
df_mislabeled = distinct_per_text[has_conflict]['target']
df_mislabeled.index.tolist()

### Data cleaning

In [None]:
# Keep only the tweet text and its label: `id` is a row identifier, and
# `keyword` / `location` are not used by the models below.
# `columns=` is used because the positional axis argument was removed in pandas 2.0.
train = train.drop(columns=['id', 'keyword', 'location'])

In [None]:
train.head()

In [None]:
len(train)

In [None]:
from nltk import FreqDist
import nltk

# Tokenize every tweet and lowercase its tokens; `corpus` holds one token list per tweet.
# Iterating over the column directly covers all rows, including the last one,
# and does not depend on the frame's index labels.
corpus = [
    [token.lower() for token in nltk.wordpunct_tokenize(sentence)]
    for sentence in train['text']
]


In [None]:
# Flatten the per-tweet token lists into one flat list of tokens.
# itertools is used instead of the private pandas.core.common helper.
from itertools import chain
wc = list(chain.from_iterable(corpus))
wc[:10]

In [None]:
# Frequency of every token (punctuation tokens included) across all tweets.
dist = FreqDist(wc)
wordtotal = sum(dist.values())

print("total words with punctuations:",wordtotal)
print("total unique words           :",len(dist))

In [None]:
#removing puntuation marks. taking only the words.
import re
def alphaFreqDist (words):
    """Count only the tokens made up entirely of lowercase a-z characters.

    Any token containing another character (digit, punctuation, uppercase
    letter, ...) is skipped. Returns a ``FreqDist`` of the remaining tokens.
    """
    non_alpha = re.compile('.*[^a-z].*')
    return FreqDist(token for token in words if non_alpha.match(token) is None)

# Token statistics once everything that is not purely a-z has been discarded.
adist = alphaFreqDist(wc)
wordtot = sum(adist.values())

print("total words without punctuations:",wordtot)
print("total unique words              :",len(adist))

In [None]:
#removing stopwords from the corpus
from nltk.corpus import stopwords
nltk.download('stopwords')
# Load the stopword list once into a set: re-reading it for every token and
# scanning it as a list makes this filter needlessly slow.
english_stopwords = set(stopwords.words("english"))
without_stopwords_wc = [t for t in wc if t not in english_stopwords]

In [None]:
def withoutStopwordsDist (words):
    """Build a ``FreqDist`` over the tokens in ``words`` that contain only lowercase a-z.

    Tokens with any other character are ignored.
    """
    non_alpha = re.compile('.*[^a-z].*')
    alphabetic = [token for token in words if not non_alpha.match(token)]
    wdist = FreqDist()
    for token in alphabetic:
        wdist[token] += 1
    return wdist

# Token statistics for the stopword-free corpus (alphabetic tokens only).
wdist = withoutStopwordsDist(without_stopwords_wc)
without_sw = list(wdist.keys())
wordtotws = sum(wdist.values())

print("total words without stopwords   :",wordtotws)
print("total unique words              :",len(wdist))

In [None]:
#lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# Iterate over the tokens directly so that every word, including the final
# one, is lemmatized (an index-based range is easy to get off by one).
lemmawords = [lemmatizer.lemmatize(token) for token in without_stopwords_wc]

In [None]:
def lemmaDist (words):
    """Return a ``FreqDist`` counting the tokens in ``words`` that consist solely of a-z.

    Tokens containing any other character are left out of the count.
    """
    non_alpha = re.compile('.*[^a-z].*')
    ldist = FreqDist()
    for token in words:
        if non_alpha.match(token):
            continue
        ldist[token] += 1
    return ldist
# Token statistics after lemmatization (alphabetic tokens only).
ldist = lemmaDist(lemmawords)
wordtotle = sum(ldist.values())

print("total words lemmatized:",wordtotle)
print("total unique words    :",len(ldist))

In [None]:
# Add each tweet's word count to the dataset (a possible extra feature for
# later models), then lowercase the text.
train['wordscount'] = train['text'].map(lambda text: len(str(text).split()))
train['text'] = [tweet.lower() for tweet in train['text']]
train.head()

In [None]:
# Longest and shortest tweet, measured in whitespace-separated words.
most_words, fewest_words = max(train['wordscount']), min(train['wordscount'])
print('maximum number of words in a sentence :',most_words)
print('minimum number of words in a sentence :',fewest_words)

In [None]:
#this function removes stopwords from the reviews
import string
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stopwordremover(review, stop_words=None):
    """Remove stopwords and punctuation from a tweet.

    Parameters
    ----------
    review : str
        Text to clean (a tweet in this notebook).
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    str
        The remaining whitespace-separated words joined by single spaces, with
        every ``string.punctuation`` character deleted.

    Notes
    -----
    Stopwords are matched before punctuation is stripped, so a stopword with
    punctuation attached (e.g. ``"the,"``) is kept.
    """
    # Set membership is O(1); the cached default avoids re-reading the corpus
    # file for every tweet.
    stop_set = _english_stopwords() if stop_words is None else set(stop_words)
    kept = [word for word in review.split() if word not in stop_set]
    return " ".join(kept).translate(str.maketrans('', '', string.punctuation))

In [None]:
# Strip stopwords and punctuation from every tweet, lowercase the result, and
# record the word count of the cleaned text in a second column.
cleaned_text = train['text'].apply(stopwordremover)
train['text'] = [tweet.lower() for tweet in cleaned_text]
train['wordscount1'] = train['text'].map(lambda text: len(str(text).split()))
train.head()

In [None]:
# Longest and shortest tweet after cleaning, measured in words.
most_words, fewest_words = max(train['wordscount1']), min(train['wordscount1'])
print('maximum number of words in a sentence :',most_words)
print('minimum number of words in a sentence :',fewest_words)

In [None]:
#first try to model those data only using words in the reviews. so to do that we can use vectorizor and tfidf vectorizor functions.
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train['text'])

In [None]:
y=train['target']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC

#libraries for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [None]:
from sklearn.model_selection import train_test_split

#divide the dataset into train set and test set 
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [None]:
print(x_train.shape)
print(x_test.shape)

In [None]:
# Ridge classifier on the bag-of-words (count) features.
rc = RidgeClassifier()
model0 = rc.fit(x_train, y_train)
rcpred = rc.predict(x_test)

train_accuracy = model0.score(x_train, y_train)
test_accuracy = model0.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)
print("\n")
print("classification report for ridge classifier")
print(classification_report(y_test, rcpred))
print("\n")
print("confusion matrix for ridge classifier")
ConfusionMatrixDisplay.from_estimator(rc, x_test, y_test, cmap="Blues")

In [None]:
# Logistic regression (L2 penalty) on the bag-of-words (count) features.
lr = LogisticRegression(penalty='l2', max_iter=2000)
model1 = lr.fit(x_train, y_train)
lrpred = lr.predict(x_test)

train_accuracy = model1.score(x_train, y_train)
test_accuracy = model1.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)
print("\n")
print("classification report for logistic regression")
print(classification_report(y_test, lrpred))
print("\n")
print("confusion matrix for logistic regression")
ConfusionMatrixDisplay.from_estimator(lr, x_test, y_test, cmap="Blues")

In [None]:
# Linear support vector classifier on the bag-of-words (count) features.
svm = LinearSVC()
model2 = svm.fit(x_train, y_train)
train_accuracy = model2.score(x_train, y_train)
test_accuracy = model2.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)

svmpred = svm.predict(x_test)
print("\n")
print("classification report for support vector machines ")
print(classification_report(y_test, svmpred))
print("\n")
print("confusion matrix for support vector machines")
ConfusionMatrixDisplay.from_estimator(svm, x_test, y_test, cmap="Blues")

In [None]:
# Decision tree on the bag-of-words (count) features.
# Seeded like the random forest and XGBoost cells so reruns give the same scores.
dt=DecisionTreeClassifier(random_state=1234)
model3=dt.fit(x_train, y_train)
print("train accuracy:",model3.score(x_train, y_train),"\n","test accuracy:",model3.score(x_test,y_test))

dtpred = dt.predict(x_test)
print("\n")
print("classification report for decision tree classifier")
print(classification_report(y_test,dtpred))
print("\n")
print("confusion matrix for decision tree classifier")
ConfusionMatrixDisplay.from_estimator(dt, x_test, y_test,cmap="Blues")

In [None]:
# Random forest (fixed seed) on the bag-of-words (count) features.
rf = RandomForestClassifier(random_state=1234)
model4 = rf.fit(x_train, y_train)
rfpred = rf.predict(x_test)

train_accuracy = model4.score(x_train, y_train)
test_accuracy = model4.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)
print("\n")
print("classification report for random forest classifier")
print(classification_report(y_test, rfpred))
print("\n")
print("confusion matrix for random forest classifier")
ConfusionMatrixDisplay.from_estimator(rf, x_test, y_test, cmap="Blues")

In [None]:
# Gradient boosting on the bag-of-words (count) features.
# Seeded like the random forest and XGBoost cells so reruns give the same scores.
gbm=GradientBoostingClassifier(random_state=1234)
model5=gbm.fit(x_train, y_train)
print("train accuracy:",model5.score(x_train, y_train),"\n","test accuracy:",model5.score(x_test,y_test))

gbmpred = gbm.predict(x_test)
print("\n")
print("classification report for gradient boosting classifier")
print(classification_report(y_test,gbmpred))
print("\n")
print("confusion matrix for gradient boosting classifier")
ConfusionMatrixDisplay.from_estimator(gbm, x_test, y_test,cmap="Blues")

In [None]:
# AdaBoost on the bag-of-words (count) features.
# Seeded like the random forest and XGBoost cells so reruns give the same scores.
ada=AdaBoostClassifier(random_state=1234)
model6=ada.fit(x_train, y_train)
print("train accuracy:",model6.score(x_train, y_train),"\n","test accuracy:",model6.score(x_test,y_test))

adapred = ada.predict(x_test)
print("\n")
print("classification report for adaboost classifier")
print(classification_report(y_test,adapred))
print("\n")
print("confusion matrix for adaboost classifier")
ConfusionMatrixDisplay.from_estimator(ada, x_test, y_test,cmap="Blues")

In [None]:
# XGBoost (fixed seed) on the bag-of-words (count) features.
xgb = XGBClassifier(random_state=1234)
model7 = xgb.fit(x_train, y_train)
train_accuracy = model7.score(x_train, y_train)
test_accuracy = model7.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)

xgbpred = xgb.predict(x_test)
print("\n")
print("classification report for extreme gradient boosting classifier")
print(classification_report(y_test, xgbpred))
print("\n")
print("confusion matrix for extreme gradient boosting classifier")
ConfusionMatrixDisplay.from_estimator(xgb, x_test, y_test, cmap="Blues")

In [None]:
# Extra-trees ensemble on the bag-of-words (count) features.
# Seeded like the random forest and XGBoost cells so reruns give the same scores.
extree = ExtraTreesClassifier(random_state=1234)
model8=extree.fit(x_train, y_train)
print("train accuracy:",model8.score(x_train, y_train),"\n","test accuracy:",model8.score(x_test,y_test))

extpred = extree.predict(x_test)
print("\n")
print("classification report for extra tree classifier")
print(classification_report(y_test,extpred))
print("\n")
print("confusion matrix for extra tree classifier")
ConfusionMatrixDisplay.from_estimator(extree, x_test, y_test,cmap="Blues")

In [None]:
# Soft-voting ensemble of a random forest and logistic regression on the
# bag-of-words (count) features.
from sklearn.ensemble import VotingClassifier
# Seeded so reruns give the same scores.
clf1 = RandomForestClassifier(random_state=1234)
clf2 = LogisticRegression(max_iter=2000,penalty='l2')

# The estimator is labelled 'rf' because it is a random forest, not AdaBoost.
vc = VotingClassifier(estimators=[('rf', clf1),('lr', clf2)], voting='soft')
model9=vc.fit(x_train, y_train)
print("train accuracy:",model9.score(x_train, y_train),"\n","test accuracy:",model9.score(x_test,y_test))

vcpred = vc.predict(x_test)
print("\n")
print("classification report for voting classifier")
print(classification_report(y_test,vcpred))
print("\n")
print("confusion matrix for voting classifier")
ConfusionMatrixDisplay.from_estimator(vc, x_test, y_test,cmap="Blues")

In [None]:
# Stacking ensemble (random forest + logistic regression base learners) on the
# bag-of-words (count) features.
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Base learners are named after what they are; the logistic regression is
# passed directly because a one-step pipeline adds nothing. The random forest
# is seeded so reruns give the same scores.
estimators = [
    ('rf', RandomForestClassifier(random_state=1234)),
    ('lr', LogisticRegression(max_iter=2000,penalty='l2')),
]
sc= StackingClassifier( estimators=estimators)

model10=sc.fit(x_train, y_train)
print("train accuracy:",model10.score(x_train, y_train),"\n","test accuracy:",model10.score(x_test,y_test))

scpred = sc.predict(x_test)
print("\n")
print("classification report for stacking classifier")
print(classification_report(y_test,scpred))
print("\n")
print("confusion matrix for stacking classifier")
ConfusionMatrixDisplay.from_estimator(sc, x_test, y_test,cmap="Blues")

In [None]:
!pip3 install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost (training log silenced) on the bag-of-words (count) features.
cc = CatBoostClassifier(silent=True)
model11 = cc.fit(x_train, y_train)
ccpred = cc.predict(x_test)

train_accuracy = model11.score(x_train, y_train)
test_accuracy = model11.score(x_test, y_test)
print("train accuracy:",train_accuracy,"\n","test accuracy:",test_accuracy)
print("\n")
print("classification report for cat boost classifier")
print(classification_report(y_test, ccpred))
print("\n")
print("confusion matrix for cat boost classifier")
ConfusionMatrixDisplay.from_estimator(cc, x_test, y_test, cmap="Blues")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Compute TF-IDF values for the cleaned tweet text.
vectorizertf = TfidfVectorizer()
Xt = vectorizertf.fit_transform(train['text'])

In [None]:
#dividing the dataset to train and test 
xtrain, xtest, ytrain, ytest = train_test_split(Xt, y, test_size=0.2, random_state=1234)

In [None]:
# Ridge classifier on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
rc =RidgeClassifier()
model0=rc.fit(xtrain, ytrain)
print("train accuracy:",model0.score(xtrain, ytrain),"\n","test accuracy:",model0.score(xtest,ytest))

rcpred = rc.predict(xtest)
print("\n")
print("classification report for ridge classifier")
print(classification_report(ytest,rcpred))
print("\n")
print("confusion matrix for ridge classifier")
ConfusionMatrixDisplay.from_estimator(rc, xtest, ytest,cmap="Blues")

In [None]:
# Logistic regression on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
lr = LogisticRegression(max_iter=2000,penalty='l2')
model1=lr.fit(xtrain, ytrain)
print("train accuracy:",model1.score(xtrain, ytrain),"\n","test accuracy:",model1.score(xtest,ytest))
lrpred = lr.predict(xtest)
print("\n")
print("classification report for logistic regression")
print(classification_report(ytest,lrpred))
print("\n")
print("confusion matrix for logistic regression")
ConfusionMatrixDisplay.from_estimator(lr, xtest, ytest,cmap="Blues")

In [None]:
# Linear support vector classifier on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
svm =LinearSVC()
model2=svm.fit(xtrain, ytrain)
print("train accuracy:",model2.score(xtrain, ytrain),"\n","test accuracy:",model2.score(xtest,ytest))
svmpred = svm.predict(xtest)
print("\n")
print("classification report for support vector machines ")
print(classification_report(ytest,svmpred))
print("\n")
print("confusion matrix for support vector machines")
ConfusionMatrixDisplay.from_estimator(svm, xtest, ytest,cmap="Blues")

In [None]:
# Decision tree on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
# Seeded so reruns give the same scores.
dt=DecisionTreeClassifier(random_state=1234)
model3=dt.fit(xtrain, ytrain)
print("train accuracy:",model3.score(xtrain, ytrain),"\n","test accuracy:",model3.score(xtest,ytest))

dtpred = dt.predict(xtest)
print("\n")
print("classification report for decision tree classifier")
print(classification_report(ytest,dtpred))
print("\n")
print("confusion matrix for decision tree classifier")
ConfusionMatrixDisplay.from_estimator(dt, xtest, ytest,cmap="Blues")

In [None]:
# Random forest (fixed seed) on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
rf=RandomForestClassifier(random_state=1234)
model4=rf.fit(xtrain, ytrain)
print("train accuracy:",model4.score(xtrain, ytrain),"\n","test accuracy:",model4.score(xtest,ytest))

rfpred = rf.predict(xtest)
print("\n")
print("classification report for random forest classifier")
print(classification_report(ytest,rfpred))
print("\n")
print("confusion matrix for random forest classifier")
ConfusionMatrixDisplay.from_estimator(rf, xtest, ytest,cmap="Blues")

In [None]:
# Gradient boosting on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
# Seeded so reruns give the same scores.
gbm=GradientBoostingClassifier(random_state=1234)
model5=gbm.fit(xtrain, ytrain)
print("train accuracy:",model5.score(xtrain, ytrain),"\n","test accuracy:",model5.score(xtest,ytest))

gbmpred = gbm.predict(xtest)
print("\n")
print("classification report for gradient boosting classifier")
print(classification_report(ytest,gbmpred))
print("\n")
print("confusion matrix for gradient boosting classifier")
ConfusionMatrixDisplay.from_estimator(gbm, xtest, ytest,cmap="Blues")

In [None]:
# AdaBoost on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
# Seeded so reruns give the same scores.
ada=AdaBoostClassifier(random_state=1234)
model6=ada.fit(xtrain, ytrain)
print("train accuracy:",model6.score(xtrain, ytrain),"\n","test accuracy:",model6.score(xtest,ytest))

adapred = ada.predict(xtest)
print("\n")
print("classification report for adaboost classifier")
print(classification_report(ytest,adapred))
print("\n")
print("confusion matrix for adaboost classifier")
ConfusionMatrixDisplay.from_estimator(ada, xtest, ytest,cmap="Blues")

In [None]:
# XGBoost (fixed seed) on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
xgb = XGBClassifier(random_state=1234)
model7=xgb.fit(xtrain, ytrain)
print("train accuracy:",model7.score(xtrain, ytrain),"\n","test accuracy:",model7.score(xtest,ytest))
xgbpred = xgb.predict(xtest)
print("\n")
print("classification report for extreme gradient boosting classifier")
print(classification_report(ytest,xgbpred))
print("\n")
print("confusion matrix for extreme gradient boosting classifier")
ConfusionMatrixDisplay.from_estimator(xgb, xtest, ytest,cmap="Blues")

In [None]:
# Extra-trees ensemble on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
# Seeded so reruns give the same scores.
extree = ExtraTreesClassifier(random_state=1234)
model8=extree.fit(xtrain, ytrain)
print("train accuracy:",model8.score(xtrain, ytrain),"\n","test accuracy:",model8.score(xtest,ytest))

extpred = extree.predict(xtest)
print("\n")
print("classification report for extra tree classifier")
print(classification_report(ytest,extpred))
print("\n")
print("confusion matrix for extra tree classifier")
ConfusionMatrixDisplay.from_estimator(extree, xtest, ytest,cmap="Blues")

In [None]:
# Soft-voting ensemble of AdaBoost and logistic regression on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
from sklearn.ensemble import VotingClassifier
# Seeded so reruns give the same scores.
clf1 = AdaBoostClassifier(random_state=1234)
clf2 = LogisticRegression(max_iter=2000,penalty='l2')

vc = VotingClassifier(estimators=[('ada', clf1),('lr', clf2)], voting='soft')
model9=vc.fit(xtrain, ytrain)
print("train accuracy:",model9.score(xtrain, ytrain),"\n","test accuracy:",model9.score(xtest,ytest))

vcpred = vc.predict(xtest)
print("\n")
print("classification report for voting classifier")
print(classification_report(ytest,vcpred))
print("\n")
print("confusion matrix for voting classifier")
ConfusionMatrixDisplay.from_estimator(vc, xtest, ytest,cmap="Blues")

In [None]:
# Stacking ensemble (AdaBoost + logistic regression base learners) on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Base learners are named after what they are; the logistic regression is
# passed directly because a one-step pipeline adds nothing. AdaBoost is seeded
# so reruns give the same scores.
estimators = [
    ('ada', AdaBoostClassifier(random_state=1234)),
    ('lr', LogisticRegression(max_iter=2000,penalty='l2')),
]
sc= StackingClassifier( estimators=estimators)

model10=sc.fit(xtrain, ytrain)
print("train accuracy:",model10.score(xtrain, ytrain),"\n","test accuracy:",model10.score(xtest,ytest))

scpred = sc.predict(xtest)
print("\n")
print("classification report for stacking classifier")
print(classification_report(ytest,scpred))
print("\n")
print("confusion matrix for stacking classifier")
ConfusionMatrixDisplay.from_estimator(sc, xtest, ytest,cmap="Blues")

In [None]:
from catboost import CatBoostClassifier

# CatBoost (training log silenced) on the TF-IDF features.
# Fit and score on the TF-IDF split (xtrain/xtest), not the count-vector split.
cc = CatBoostClassifier(silent=True )
model11=cc.fit(xtrain, ytrain)
print("train accuracy:",model11.score(xtrain, ytrain),"\n","test accuracy:",model11.score(xtest,ytest))

ccpred = cc.predict(xtest)
print("\n")
print("classification report for cat boost classifier")
print(classification_report(ytest,ccpred))
print("\n")
print("confusion matrix for cat boost classifier")
ConfusionMatrixDisplay.from_estimator(cc, xtest, ytest,cmap="Blues")

The stacking classifier that combines a random forest classifier with a logistic regression model gives the best results for this problem: its precision, recall and classification accuracy are higher than those of all the other models.