In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Importing packages**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#---------------------------------------Text Processing------------------------------------------------------------#
import regex
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
#------------------------------------Metrics and Validation---------------------------------------------------------#
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score, f1_score
#-------------------------------------Models to be trained----------------------------------------------------------#
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
###############

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## **Reading data**

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/BLP2023/blp_task2/data/blp23_sentiment_train.tsv', sep='\t')
dev_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/BLP2023/blp_task2/data/blp23_sentiment_dev.tsv', sep='\t')
train_data.head()

Unnamed: 0,id,text,label
0,10856,এখানে আরো ভালো ভাবে দলীয় ও র এর অবস্থান পাকা হ...,Neutral
1,sentinob_1072,চুয়াডাঙ্গা বাড়ি কে বলেছে আপনার,Neutral
2,sentinob_10530,"ভাই সোনাই ঘোষ এর দই খেয়ে যাইতেন , খুব ই মজার",Positive
3,8001,সমার তালুকদার আপনার ছবিতে ফেসটা কেন জানি বন্য ...,Negative
4,sentinob_10144,ভাইয়া এই নুডলস টা কোথায় কিনতে পাওয়া যাবে প্লিজ...,Positive


## **Removing stopwords**

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

stop = stopwords.words('bengali')

for punct in punctuation:
    stop.append(punct)

def filter_text(text, stop_words):
    word_tokens = text.split()
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [None]:
train_data["filtered_text"] = train_data.text.apply(lambda x : filter_text(x, stop))
dev_data["filtered_text"] = dev_data.text.apply(lambda x : filter_text(x, stop))

dev_data.head()

Unnamed: 0,id,text,label,filtered_text
0,5300,নতুন মতলব আটছে । মানে সৌদি আরব ধ্বংস করার পায়ত...,Negative,মতলব আটছে । মানে সৌদি আরব ধ্বংস পায়তারা ।
1,15392,বিদেশে পড়ালেখা করছে বাংলাদেশের প্রচুর ছেলেময়ের...,Positive,বিদেশে পড়ালেখা বাংলাদেশের প্রচুর ছেলেময়েরা নিজ...
2,6904,মাননীয় আপনি নিজে না বলে সস্তায় কোনো মন্ত্রী কে...,Negative,মাননীয় সস্তায় মন্ত্রী দিয়ে মাইকিং করতেন শোভা প...
3,30790,* করোনার টিকা নিলেন বিএনপি চেয়ারপারসন বেগম খ...,Positive,করোনার টিকা নিলেন বিএনপি চেয়ারপারসন বেগম খালে...
4,2770,একজন প্রধানমন্ত্রীর এমন বক্তব্য জাতির জন্য লজ্...,Negative,একজন প্রধানমন্ত্রীর জাতির লজ্জাজনক


In [None]:
X_train = train_data['filtered_text'].tolist()
y_train = train_data['label'].tolist()

X_test = dev_data['filtered_text'].tolist()
y_test = dev_data['label'].tolist()

total_text = X_train + X_test

# **Transforming word to vector using Tdidf vectorizer**

In [None]:
tfidf = TfidfVectorizer()
train_vec = tfidf.fit_transform(X_train)
dev_vec = tfidf.transform(X_test)
train_vec.shape, dev_vec.shape

((35266, 11153), (3934, 11153))

## **Logistic Regression**

In [None]:
model_lr = LogisticRegression(
    C=0.98,
    l1_ratio=0.23,
    max_iter=430,
    random_state=1,
    warm_start=True
).fit(train_vec, y_train)

model_lr.score(train_vec, y_train)



0.6420915329212272

In [None]:
predicted = model_lr.predict(dev_vec)

lr_micro_f1 = f1_score(y_test,predicted, average = 'micro')
print("Logistic regression ", lr_micro_f1)

Logistic regression  0.5536349771225216


## **Multinomial Naive Bayes**

In [None]:
model_mnb = MultinomialNB(alpha=1.9000000000000001, fit_prior=False).fit(train_vec, y_train)

model_mnb.score(train_vec, y_train)

0.623291555605966

In [None]:
predicted = model_mnb.predict(dev_vec)

mnb_micro_f1 = f1_score(y_test, predicted, average='micro')
print("Naive Bayes : ", mnb_micro_f1)

Naive Bayes :  0.5564311133706151


## **SGD classifier**

In [None]:
model_sgd_hinge = SGDClassifier(
    loss='squared_hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=1000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=-1,
    random_state=1,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False).fit(train_vec, y_train)

model_sgd_hinge.score(train_vec, y_train)



0.5892928032666024

In [None]:
predicted = model_sgd_hinge.predict(dev_vec)

sgd_hinge_acc = f1_score(y_test,predicted, average='micro')
print("SGDClassifier : ", sgd_hinge_acc)

SGDClassifier :  0.47178444331469244


In [None]:
estimators = [
    ('svm', model_sgd_hinge),
    ('mnb', model_mnb),
    ('lr', model_lr)
]

estimators

[('svm', SGDClassifier(loss='squared_hinge', n_jobs=-1, random_state=1)),
 ('mnb', MultinomialNB(alpha=1.9000000000000001, fit_prior=False)),
 ('lr',
  LogisticRegression(C=0.98, l1_ratio=0.23, max_iter=430, random_state=1,
                     warm_start=True))]

## **Majority Voting classifier**

In [None]:
model_voting = VotingClassifier(
    estimators = estimators,
    voting='hard',
    n_jobs=-1,
    flatten_transform=True,
    verbose=1).fit(train_vec, y_train)

model_voting.score(train_vec, y_train)

0.6433675494810867

In [None]:
predicted = model_voting.predict(dev_vec)

voting_f1 = f1_score(y_test,predicted, average='micro')
print("Majority voting : ",voting_f1)

Majority voting :  0.5531265887137773


## **Stacking with XGBoost**

In [None]:
xgc = xgboost.XGBClassifier()

model_stack = StackingClassifier(
    estimators=estimators,
    final_estimator=xgc,
    n_jobs = -1,
    verbose = 1
)

model_stack.fit(train_vec, y_train)

model_stack.score(train_vec, y_train)

0.6265524868144956

In [None]:
predicted = model_stack.predict(dev_vec)

stack_f1 = f1_score(y_test,predicted, average='micro')
print("Stacking : ", stack_f1)

Stacking :  0.5439755973563802


In [None]:
df = pd.DataFrame({'LR' : [lr_micro_f1], 'NB': [mnb_micro_f1], 'SGDclassifier': [sgd_hinge_acc], 'Majority Voting': [voting_f1], 'Stacking': [stack_f1]})
df.head()

Unnamed: 0,LR,NB,SGDclassifier,Majority Voting,Stacking
0,0.553635,0.556431,0.471784,0.553127,0.543976
