In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier


import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# IMPORT DATA
orig_train_df = pd.read_csv('/content/sarcasm_mal_train.csv')
val_df = pd.read_csv('/content/sarcasm_mal_dev.csv')

test_df = pd.read_csv('/content/sarcasm_mal_test.csv')

train_df = pd.concat([orig_train_df, val_df], ignore_index=True)


In [None]:
train_df.head()

Unnamed: 0,Text,labels
0,ആദ്യം കേട്ടിട്ട് ഇഷ്ടായില്ല ഇപ്പൊ വീണ്ടും വീ...,Non-sarcastic
1,1:07 . ezhuthi kaanichathu kondu aaranennu man...,Non-sarcastic
2,"Uyyantaa moneee, ikkaa uyyyrrr",Sarcastic
3,K J Yesudas fans like here,Non-sarcastic
4,Bollywood moviyude oru feel തോന്നിയവർ ലൈക് അടി...,Non-sarcastic


In [None]:
test_df.head()

Unnamed: 0,Text,labels
0,ഈ സിനിമ ഇറങ്ങുന്നത് വരെ എനിക്ക് sleepless nigh...,Non-sarcastic
1,Lalettan fan anu Ikka poliyanne. Full support,Non-sarcastic
2,Surajetten orupadang valarnnu. Proud of him.,Non-sarcastic
3,Valsyam enna otta cinema kond ente hridhayathi...,Non-sarcastic
4,zayed masood aka prithvi @1.01 <3,Non-sarcastic


In [None]:
import pandas as pd
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


cleaned_data = train_df['Text'].apply(clean_text)

train_df['Clean_Text'] = cleaned_data

# cleaned_data = test_df['Text'].apply(clean_text)
# # Print the cleaned data
# test_df['Clean_Text'] = cleaned_data




train_texts = train_df['Clean_Text'].tolist()
test_texts = test_df['Text'].tolist()
train_labels = train_df['labels'].tolist()
test_labels = test_df['labels'].tolist()


# Encode class labels
label_encoder = LabelEncoder()
train_df['Encoded_Labels'] = label_encoder.fit_transform(train_labels)
test_df['Encoded_Labels'] = label_encoder.transform(test_labels)

In [None]:
train_df.head()

Unnamed: 0,Text,labels,Clean_Text,Encoded_Labels
0,ആദ്യം കേട്ടിട്ട് ഇഷ്ടായില്ല ഇപ്പൊ വീണ്ടും വീ...,Non-sarcastic,ആദയ കടടടട ഇഷടയലല ഇപപ വണട വണട repeat,0
1,1:07 . ezhuthi kaanichathu kondu aaranennu man...,Non-sarcastic,ezhuthi kaanichathu kondu aaranennu manasilaayi,0
2,"Uyyantaa moneee, ikkaa uyyyrrr",Sarcastic,uyyantaa moneee ikkaa uyyyrrr,1
3,K J Yesudas fans like here,Non-sarcastic,k j yesudas fans like here,0
4,Bollywood moviyude oru feel തോന്നിയവർ ലൈക് അടി...,Non-sarcastic,bollywood moviyude oru feel തനനയവർ ലക അടകക,0


In [None]:
label_counts = train_df['labels'].value_counts()


# Print label counts
print(label_counts)


Non-sarcastic    12225
Sarcastic         2847
Name: labels, dtype: int64


In [None]:
def modelling(tf_x_train, y_train, tf_x_test, y_test, **kwargs):

    # f1_scorer = make_scorer(f1_score, average='weighted')
    if 'mb' in kwargs.keys() and kwargs['mb']:
        mb = MultinomialNB()
        mb.fit(tf_x_train,y_train)
        y_test_pred=mb.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred, output_dict=True, zero_division=1)
        print("Multinomial Naive Bayes:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'rf' in kwargs.keys() and kwargs['rf']:
        # Params checked
        rf = RandomForestClassifier()
        rf.fit(tf_x_train,y_train)
        y_test_pred=rf.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Random Forest:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'lr' in kwargs.keys() and kwargs['lr']:
        lr = LogisticRegression(max_iter=1000)
        lr.fit(tf_x_train,y_train)
        y_test_pred=lr.predict(tf_x_test)

        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("LR:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'svm' in kwargs.keys() and kwargs['svm']:
        svm = LinearSVC(max_iter=100000)
        svm.fit(tf_x_train,y_train)
        y_test_pred=svm.predict(tf_x_test)

        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("SVM:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'dt' in kwargs.keys() and kwargs['dt']:
        # Params checked
        dt = DecisionTreeClassifier()
        dt.fit(tf_x_train,y_train)
        y_test_pred=dt.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Decision Tree:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'knn' in kwargs.keys() and kwargs['knn']:
        # Params checked
        knn = KNeighborsClassifier()
        knn.fit(tf_x_train,y_train)
        y_test_pred=knn.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("KNN:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    if 'mlp' in kwargs.keys() and kwargs['mlp']:
        mlp = MLPClassifier(max_iter=1000)
        mlp.fit(tf_x_train,y_train)
        y_test_pred=mlp.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Multi-Layer Perceptron:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    return 1

In [None]:
def modelling_ensemble(X_train, y_train, X_test, y_test, **kwargs):

    if 'ada' in kwargs.keys() and kwargs['ada']:
        ada = AdaBoostClassifier()
        ada.fit(X_train,y_train)
        y_test_pred=ada.predict(X_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Adaboost:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()

    if 'ovr' in kwargs.keys() and kwargs['ovr']:
        lr_classifier = LogisticRegression(max_iter=1000)
        ovr_classifier = OneVsRestClassifier(lr_classifier)
        ovr_classifier.fit(X_train, y_train)
        y_test_pred = ovr_classifier.predict(X_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("OneVsRest:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()

    if 'xgb' in kwargs.keys() and kwargs['xgb']:
        xbg = xgb.XGBClassifier()
        xbg.fit(X_train,y_train)
        y_test_pred=xbg.predict(X_test)
        report=classification_report(y_test, y_test_pred, output_dict=True, zero_division=1)
        print("XGBoost: ")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)


        print()
        print()

    if 'gb' in kwargs.keys() and kwargs['gb']:
        bg = GradientBoostingClassifier()
        bg.fit(X_train,y_train)
        y_test_pred=bg.predict(X_test)
        report=classification_report(y_test, y_test_pred, output_dict=True,zero_division=1)
        print("Gradient Boost: ")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()


    return 1

In [None]:
def modeling_complex(X_train, y_train, X_test, y_test):
    # Stacking Classifier

    clf1 = LinearSVC(max_iter=100000)
    clf2 = RandomForestClassifier()
    clf3 = KNeighborsClassifier()

    meta_clf = LogisticRegression(max_iter=1000)


    stacking_clf = StackingClassifier(estimators=[('knn', clf1), ('rf', clf2), ('mb', clf3)], final_estimator=meta_clf)
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)
    report = classification_report(y_test, y_pred,output_dict=True)

    print("Stacking Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)

    print()
    print()

    # Voting Classifier

    lr_clf = LogisticRegression(max_iter=1000)
    rf_clf = RandomForestClassifier()
    svm_clf = SVC()

    voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('rf', rf_clf), ('svm', svm_clf)], voting='hard')
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)

    report = classification_report(y_test, y_pred,output_dict=True)

    print("Voting Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)

    print()
    print()

    # Bagging Classifier

    base_classifier = KNeighborsClassifier()
    bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)
    bagging_classifier.fit(X_train, y_train)
    y_test_pred=bagging_classifier.predict(X_test)
    report=classification_report(y_test, y_test_pred,output_dict=True)
    print("Bagging Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)


In [None]:
def run_all_models(X_train, y_train, X_test, y_test):
  modelling(X_train, y_train, X_test, y_test, knn=True, svm=True, lr=True, dt=True, rf=True)
  modelling_ensemble(X_train, y_train, X_test, y_test, gb=True, ada=True, ovr = True)
  modeling_complex(X_train, y_train, X_test, y_test)


In [None]:
# Separate the training and test data
train_text = train_df['Clean_Text']
y_train = train_df['Encoded_Labels']
test_text = test_df['Text']
y_test = test_df['Encoded_Labels']

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
run_all_models(X_train, y_train, X_test, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.759153402980405
{'0': {'precision': 0.8267929634641408, 'recall': 0.9909179370742783, 'f1-score': 0.9014458542342875, 'support': 3083}, '1': {'precision': 0.6164383561643836, 'recall': 0.06569343065693431, 'f1-score': 0.11873350923482849, 'support': 685}, 'accuracy': 0.8227176220806794, 'macro avg': {'precision': 0.7216156598142622, 'recall': 0.5283056838656063, 'f1-score': 0.5100896817345579, 'support': 3768}, 'weighted avg': {'precision': 0.788551746372757, 'recall': 0.8227176220806794, 'f1-score': 0.759153402980405, 'support': 3768}}


LR:
------------------
F1 Score in weighted avg:  0.7535821617232031
{'0': {'precision': 0.8244952893674293, 'recall': 0.993512812195913, 'f1-score': 0.9011473962930273, 'support': 3083}, '1': {'precision': 0.6226415094339622, 'recall': 0.04817518248175182, 'f1-score': 0.08943089430894309, 'support': 685}, 'accuracy': 0.821656050955414, 'macro avg': {'precision': 0.7235683994006958, 'recal

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Separate the training and test data
train_text = train_df['Clean_Text']
y_train = train_df['Encoded_Labels']
test_text = test_df['Text']
y_test = test_df['Encoded_Labels']

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
run_all_models(X_train, y_train, X_test, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.7597846704546912
{'0': {'precision': 0.82710027100271, 'recall': 0.9899448589036652, 'f1-score': 0.9012254540085634, 'support': 3083}, '1': {'precision': 0.6025641025641025, 'recall': 0.06861313868613139, 'f1-score': 0.12319790301441677, 'support': 685}, 'accuracy': 0.822452229299363, 'macro avg': {'precision': 0.7148321867834062, 'recall': 0.5292789987948983, 'f1-score': 0.5122116785114901, 'support': 3768}, 'weighted avg': {'precision': 0.7862809304028039, 'recall': 0.822452229299363, 'f1-score': 0.7597846704546912, 'support': 3768}}


LR:
------------------
F1 Score in weighted avg:  0.7723376863232999
{'0': {'precision': 0.8346853146853147, 'recall': 0.9678884203697697, 'f1-score': 0.8963652748573145, 'support': 3083}, '1': {'precision': 0.48704663212435234, 'recall': 0.13722627737226278, 'f1-score': 0.21412300683371302, 'support': 685}, 'accuracy': 0.8168789808917197, 'macro avg': {'precision': 0.6608659734048336, 'rec

In [None]:
#WORD2VEC
from gensim.models import Word2Vec

def train_word2vec(train_df, vector_size=100, window=5, min_count=1, workers=4):
    train_df['tokenized_words'] = train_df['Clean_Text'].apply(lambda x: [x.lower()])

    word2vec_model = Word2Vec(sentences=train_df['tokenized_words'], vector_size=vector_size,
                              window=window, min_count=min_count, workers=workers)

    return word2vec_model


def get_word_vector(model, word):
    if word in model.wv:
        return model.wv[word]
    else:
        return np.zeros(model.vector_size)


# Train Word2Vec model on training data
word2vec_model = train_word2vec(train_df)
word_vectors = word2vec_model.wv
word_vectors.save_word2vec_format("/content/drive/MyDrive/Research/fire drav_sarcasm/word_vectors.txt", binary=False)
test_df['word_vectors'] = [get_word_vector(word2vec_model, word) for word in test_df['Text']]

X_train = [get_word_vector(word2vec_model, word) for word in train_df['Text']]
y_train = train_df['Encoded_Labels']

X_test = np.vstack(test_df['word_vectors'])
y_test = test_df['Encoded_Labels']
run_all_models(X_train, y_train, X_test, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.7363972931873196
{'0': {'precision': 0.8182059447983014, 'recall': 1.0, 'f1-score': 0.9000145964092833, 'support': 3083}, '1': {'precision': 1.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 685}, 'accuracy': 0.8182059447983014, 'macro avg': {'precision': 0.9091029723991507, 'recall': 0.5, 'f1-score': 0.45000729820464164, 'support': 3768}, 'weighted avg': {'precision': 0.8512550233049796, 'recall': 0.8182059447983014, 'f1-score': 0.7363972931873196, 'support': 3768}}


LR:
------------------
F1 Score in weighted avg:  0.7363972931873196
{'0': {'precision': 0.8182059447983014, 'recall': 1.0, 'f1-score': 0.9000145964092833, 'support': 3083}, '1': {'precision': 1.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 685}, 'accuracy': 0.8182059447983014, 'macro avg': {'precision': 0.9091029723991507, 'recall': 0.5, 'f1-score': 0.45000729820464164, 'support': 3768}, 'weighted avg': {'precision': 0.8512550233049796, 'recall': 0.81820594479

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Stacking Classifier: 
------------------
F1 Score in weighted avg:  0.7363972931873196
{'0': {'precision': 0.8182059447983014, 'recall': 1.0, 'f1-score': 0.9000145964092833, 'support': 3083}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 685}, 'accuracy': 0.8182059447983014, 'macro avg': {'precision': 0.4091029723991507, 'recall': 0.5, 'f1-score': 0.45000729820464164, 'support': 3768}, 'weighted avg': {'precision': 0.669460968103281, 'recall': 0.8182059447983014, 'f1-score': 0.7363972931873196, 'support': 3768}}




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Voting Classifier: 
------------------
F1 Score in weighted avg:  0.7363972931873196
{'0': {'precision': 0.8182059447983014, 'recall': 1.0, 'f1-score': 0.9000145964092833, 'support': 3083}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 685}, 'accuracy': 0.8182059447983014, 'macro avg': {'precision': 0.4091029723991507, 'recall': 0.5, 'f1-score': 0.45000729820464164, 'support': 3768}, 'weighted avg': {'precision': 0.669460968103281, 'recall': 0.8182059447983014, 'f1-score': 0.7363972931873196, 'support': 3768}}


Bagging Classifier: 
------------------
F1 Score in weighted avg:  0.7363972931873196
{'0': {'precision': 0.8182059447983014, 'recall': 1.0, 'f1-score': 0.9000145964092833, 'support': 3083}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 685}, 'accuracy': 0.8182059447983014, 'macro avg': {'precision': 0.4091029723991507, 'recall': 0.5, 'f1-score': 0.45000729820464164, 'support': 3768}, 'weighted avg': {'precision': 0.669460968103281, 're

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=821e01c55cb8924bd345112c669abf977a7fb32ccb5b80cb1dbb0be707e8a5e6
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import fasttext
import pandas as pd
from sklearn.metrics import classification_report


X_train = train_df['Clean_Text'].values
y_train = train_df['Encoded_Labels'].values

X_test = test_df['Text'].values
y_test = test_df['Encoded_Labels'].values

with open('train.txt', 'w') as f:
    for text in X_train:
        f.write(text + '\n')

with open('test.txt', 'w') as f:
    for text in X_test:
        f.write(text + '\n')

model = fasttext.train_unsupervised(input="train.txt", dim=100, epoch=10, lr=0.1)

X_train_embeddings = [model.get_sentence_vector(sent) for sent in X_train]
X_test_embeddings = [model.get_sentence_vector(sent) for sent in X_test]

run_all_models(X_train_embeddings, y_train, X_test_embeddings, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.7592351960958464
{'0': {'precision': 0.8267567567567567, 'recall': 0.9922153746350957, 'f1-score': 0.9019607843137254, 'support': 3083}, '1': {'precision': 0.6470588235294118, 'recall': 0.06423357664233577, 'f1-score': 0.11686586985391767, 'support': 685}, 'accuracy': 0.8235138004246284, 'macro avg': {'precision': 0.7369077901430843, 'recall': 0.5282244756387158, 'f1-score': 0.5094133270838215, 'support': 3768}, 'weighted avg': {'precision': 0.7940887407639936, 'recall': 0.8235138004246284, 'f1-score': 0.7592351960958464, 'support': 3768}}


LR:
------------------
F1 Score in weighted avg:  0.7652603732586175
{'0': {'precision': 0.8296478296478297, 'recall': 0.9857281868310087, 'f1-score': 0.9009783575452119, 'support': 3083}, '1': {'precision': 0.580952380952381, 'recall': 0.08905109489051095, 'f1-score': 0.15443037974683543, 'support': 685}, 'accuracy': 0.8227176220806794, 'macro avg': {'precision': 0.7053001053001053, 'r