In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Importing packages**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#---------------------------------------Text Processing------------------------------------------------------------#
import regex
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
#------------------------------------Metrics and Validation---------------------------------------------------------#
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score, f1_score
#-------------------------------------Models to be trained----------------------------------------------------------#
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
###############

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **Reading data files**

In [None]:
train_data = pd.read_csv('Data/train.csv')
dev_data = pd.read_csv('Data/dev.csv')
train_data.head()

Unnamed: 0,text,label
0,যে দেশে সন্ত্রাসরা দেশ চালায়সে দেশে শান্তি কিভ...,1
1,এই বিচার শেষ বিচার নয়।আসল বিচার হবে আল্লাহর আদ...,0
2,আরব দেশগুলোকে বলব ভারতের সাথে সব ব্যবসা বাণিজ্...,2
3,দেশটা সুস্থ নাই,0
4,আপনার কথা দুঃখ জনক আগে বিডিও থাকলে কেন ধরা হলন...,0


## **Removing stopwords**

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

stop = stopwords.words('bengali')

for punct in punctuation:
    stop.append(punct)

def filter_text(text, stop_words):
    word_tokens = text.split()
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [None]:
train_data["filtered_text"] = train_data.text.apply(lambda x : filter_text(x, stop))
dev_data["filtered_text"] = dev_data.text.apply(lambda x : filter_text(x, stop))

dev_data.head()

Unnamed: 0,text,label,filtered_text
0,পাডা পুতার মাঝখানে পরে সাধারণ ২ মানুষের জিবন শ...,0,পাডা পুতার মাঝখানে ২ মানুষের জিবন শেষ বিচার
1,করোনার চাপে অনেক কিছু বন্ধ ও অনেক বিধি নিষেধ ক...,0,করোনার চাপে বন্ধ বিধি নিষেধ নির্বাচন চলছেই কারন
2,সঠিক তদন্ত করতে হবে। বিচারের আওতায় আনতে হবে য...,0,সঠিক তদন্ত হবে। বিচারের আওতায় আনতে টা করেছে।
3,যে লোকটা মারা গেছে তার কি হবে তার দায়ভার কে ন...,0,লোকটা মারা দায়ভার নিবে ছাত্র ব্যবসায়ী
4,নিউ মার্কেট এবং গুলিস্থান মার্কেটের ব্যবসায়ীর...,1,নিউ মার্কেট গুলিস্থান মার্কেটের ব্যবসায়ীরা কু...


In [None]:
X_train = train_data['filtered_text'].tolist()
y_train = train_data['label'].tolist()

X_test = dev_data['filtered_text'].tolist()
y_test = dev_data['label'].tolist()

total_text = X_train + X_test

## **Converting word to vector using Tfidf vectorizer**

In [None]:
tfidf = TfidfVectorizer()
train_vec = tfidf.fit_transform(X_train)
dev_vec = tfidf.transform(X_test)
train_vec.shape, dev_vec.shape

((2700, 1661), (1330, 1661))

## **Logistic Regression**

In [None]:
model_lr = LogisticRegression(
    C=0.98,
    l1_ratio=0.23,
    max_iter=430,
    random_state=1,
    warm_start=True
).fit(train_vec, y_train)

model_lr.score(train_vec, y_train)



0.7255555555555555

In [None]:
predicted = model_lr.predict(dev_vec)

lr_macro_f1 = f1_score(y_test,predicted, average = 'macro')
print("Logistic regression ", lr_macro_f1)

Logistic regression  0.5297718379603433


## **Multinomial Naive Bayes**

In [None]:
model_mnb = MultinomialNB(alpha=1.9000000000000001, fit_prior=False).fit(train_vec, y_train)

model_mnb.score(train_vec, y_train)

0.6992592592592592

In [None]:
predicted = model_mnb.predict(dev_vec)

mnb_macro_f1 = f1_score(y_test, predicted, average='macro')
print("Naive Bayes : ", mnb_macro_f1)

Naive Bayes :  0.5213483467428038


## **SGD Classifier**

In [None]:
model_sgd_hinge = SGDClassifier(
    loss='squared_hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=1000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=-1,
    random_state=1,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False).fit(train_vec, y_train)

model_sgd_hinge.score(train_vec, y_train)



0.7437037037037038

In [None]:
predicted = model_sgd_hinge.predict(dev_vec)

sgd_hinge_acc = f1_score(y_test,predicted, average='macro')
print("SGDClassifier : ", sgd_hinge_acc)

SGDClassifier :  0.4480765225885041


In [None]:
estimators = [
    ('svm', model_sgd_hinge),
    ('mnb', model_mnb),
    ('lr', model_lr)
]

estimators

[('svm', SGDClassifier(loss='squared_hinge', n_jobs=-1, random_state=1)),
 ('mnb', MultinomialNB(alpha=1.9000000000000001, fit_prior=False)),
 ('lr',
  LogisticRegression(C=0.98, l1_ratio=0.23, max_iter=430, random_state=1,
                     warm_start=True))]

## **Majority Voting**

In [None]:
model_voting = VotingClassifier(
    estimators = estimators,
    voting='hard',
    n_jobs=-1,
    flatten_transform=True,
    verbose=1).fit(train_vec, y_train)

model_voting.score(train_vec, y_train)

0.7414814814814815

In [None]:
predicted = model_voting.predict(dev_vec)

voting_f1 = f1_score(y_test,predicted, average='macro')
print("Majority voting : ",voting_f1)

Majority voting :  0.5166813105601362


## **Stacking using XGBoost**

In [None]:
xgc = xgboost.XGBClassifier()

model_stack = StackingClassifier(
    estimators=estimators,
    final_estimator=xgc,
    n_jobs = -1,
    verbose = 1
)

model_stack.fit(train_vec, y_train)

model_stack.score(train_vec, y_train)

0.6725925925925926

In [None]:
predicted = model_stack.predict(dev_vec)

stack_f1 = f1_score(y_test,predicted, average='macro')
print("Stacking : ", stack_f1)

Stacking :  0.5099162647546832


In [None]:
df = pd.DataFrame({'LR' : [lr_macro_f1], 'NB': [mnb_macro_f1], 'SGDclassifier': [sgd_hinge_acc], 'Majority Voting': [voting_f1], 'Stacking': [stack_f1]})
df.head()

Unnamed: 0,LR,NB,SGDclassifier,Majority Voting,Stacking
0,0.529772,0.521348,0.448077,0.516681,0.509916
