In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier


import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# IMPORT DATA
orig_train_df = pd.read_csv('/content/sarcasm_tam_train.csv')
val_df = pd.read_csv('/content/sarcasm_tam_dev.csv')

test_df = pd.read_csv('/content/sarcasm_tam_test.csv')

train_df = pd.concat([orig_train_df, val_df], ignore_index=True)


In [None]:
train_df.head()

Unnamed: 0,Text,labels
0,1st like Vijay Fanss Hit Likes,Non-sarcastic
1,Vijay sethupathi is always mass.... Avaru kaha...,Non-sarcastic
2,My jayam ravi anna i am waiting,Non-sarcastic
3,Muga moodi part 2 va irukka koodathu kadavulee...,Sarcastic
4,thala mass padam pakkanum pola irukae 7 days i...,Non-sarcastic


In [None]:
test_df.head()

Unnamed: 0,Text,labels
0,100 % Best Movie in 2020,Non-sarcastic
1,Yuvan haters kallu uppu eduthuttu varisayil ...,Non-sarcastic
2,Mohanlal intha padathile romba mass kaata poraaar,Non-sarcastic
3,Thala Thala tha tamil in beggast flim,Non-sarcastic
4,Pakkathane pooreee indhe kaali odeee athathe.....,Sarcastic


In [None]:
import pandas as pd
import re

def clean_text(text):
    # Remove non-Roman characters (assuming Tamil script is non-Roman)
    #text = re.sub(r'[^\x00-\x7F]', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


cleaned_data = train_df['Text'].apply(clean_text)
train_df['Clean_Text'] = cleaned_data

# cleaned_data = test_df['Text'].apply(clean_text)
# # Print the cleaned data
# test_df['Clean_Text'] = cleaned_data




train_texts = train_df['Clean_Text'].tolist()
test_texts = test_df['Text'].tolist()
train_labels = train_df['labels'].tolist()
test_labels = test_df['labels'].tolist()


# Encode class labels
label_encoder = LabelEncoder()
train_df['Encoded_Labels'] = label_encoder.fit_transform(train_labels)
test_df['Encoded_Labels'] = label_encoder.transform(test_labels)

In [None]:
label_counts = train_df['labels'].value_counts()


# Print label counts
print(label_counts)


Non-sarcastic    24805
Sarcastic         8990
Name: labels, dtype: int64


In [None]:
train_df.head()

Unnamed: 0,Text,labels,Clean_Text,Encoded_Labels
0,1st like Vijay Fanss Hit Likes,Non-sarcastic,st like vijay fanss hit likes,0
1,Vijay sethupathi is always mass.... Avaru kaha...,Non-sarcastic,vijay sethupathi is always mass avaru kaha ven...,0
2,My jayam ravi anna i am waiting,Non-sarcastic,my jayam ravi anna i am waiting,0
3,Muga moodi part 2 va irukka koodathu kadavulee...,Sarcastic,muga moodi part va irukka koodathu kadavulee,1
4,thala mass padam pakkanum pola irukae 7 days i...,Non-sarcastic,thala mass padam pakkanum pola irukae days irukae,0


In [None]:
def modelling(tf_x_train, y_train, tf_x_test, y_test, **kwargs):

    if 'mb' in kwargs.keys() and kwargs['mb']:
        mb = MultinomialNB()
        mb.fit(tf_x_train,y_train)
        y_test_pred=mb.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred, output_dict=True, zero_division=1)
        print("Multinomial Naive Bayes:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)
        # scores = cross_val_score(mb, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'rf' in kwargs.keys() and kwargs['rf']:
        # Params checked
        rf = RandomForestClassifier()
        rf.fit(tf_x_train,y_train)
        y_test_pred=rf.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Random Forest:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(rf, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'lr' in kwargs.keys() and kwargs['lr']:
        lr = LogisticRegression(max_iter=1000)
        lr.fit(tf_x_train,y_train)
        y_test_pred=lr.predict(tf_x_test)

        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("LR:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(lr, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'svm' in kwargs.keys() and kwargs['svm']:
        svm = LinearSVC(max_iter=100000)
        svm.fit(tf_x_train,y_train)
        y_test_pred=svm.predict(tf_x_test)

        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("SVM:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(svm, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'dt' in kwargs.keys() and kwargs['dt']:
        # Params checked
        dt = DecisionTreeClassifier()
        dt.fit(tf_x_train,y_train)
        y_test_pred=dt.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Decision Tree:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(dt, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'knn' in kwargs.keys() and kwargs['knn']:
        # Params checked
        knn = KNeighborsClassifier()
        knn.fit(tf_x_train,y_train)
        y_test_pred=knn.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("KNN:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(knn, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    if 'mlp' in kwargs.keys() and kwargs['mlp']:
        mlp = MLPClassifier(max_iter=1000)
        mlp.fit(tf_x_train,y_train)
        y_test_pred=mlp.predict(tf_x_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Multi-Layer Perceptron:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        # scores = cross_val_score(mlp, tf_x_test, y_test, cv=5, scoring=f1_scorer)
        # print("Cross-validation scores:", scores)
        # print("Average score:", scores.mean())

        print()
        print()

    return 1

In [None]:
def modelling_ensemble(X_train, y_train, X_test, y_test, **kwargs):

    if 'ada' in kwargs.keys() and kwargs['ada']:
        ada = AdaBoostClassifier()
        ada.fit(X_train,y_train)
        y_test_pred=ada.predict(X_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("Adaboost:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()

    if 'ovr' in kwargs.keys() and kwargs['ovr']:
        lr_classifier = LogisticRegression(max_iter=1000)
        ovr_classifier = OneVsRestClassifier(lr_classifier)
        ovr_classifier.fit(X_train, y_train)
        y_test_pred = ovr_classifier.predict(X_test)
        report=classification_report(y_test, y_test_pred,output_dict=True, zero_division=1)
        print("OneVsRest:")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()

    if 'xgb' in kwargs.keys() and kwargs['xgb']:
        xbg = xgb.XGBClassifier()
        xbg.fit(X_train,y_train)
        y_test_pred=xbg.predict(X_test)
        report=classification_report(y_test, y_test_pred, output_dict=True, zero_division=1)
        print("XGBoost: ")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)


        print()
        print()

    if 'gb' in kwargs.keys() and kwargs['gb']:
        bg = GradientBoostingClassifier()
        bg.fit(X_train,y_train)
        y_test_pred=bg.predict(X_test)
        report=classification_report(y_test, y_test_pred, output_dict=True,zero_division=1)
        print("Gradient Boost: ")
        print("------------------")
        print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
        print(report)

        print()
        print()


    return 1

In [None]:
def modeling_complex(X_train, y_train, X_test, y_test):
    # Stacking Classifier

    clf1 = LinearSVC(max_iter=100000)
    clf2 = RandomForestClassifier()
    clf3 = KNeighborsClassifier()

    meta_clf = LogisticRegression(max_iter=1000)


    stacking_clf = StackingClassifier(estimators=[('knn', clf1), ('rf', clf2), ('mb', clf3)], final_estimator=meta_clf)
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)
    report = classification_report(y_test, y_pred,output_dict=True)

    print("Stacking Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)

    print()
    print()

    # Voting Classifier

    lr_clf = LogisticRegression(max_iter=1000)
    rf_clf = RandomForestClassifier()
    svm_clf = SVC()

    voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('rf', rf_clf), ('svm', svm_clf)], voting='hard')
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)

    report = classification_report(y_test, y_pred,output_dict=True)

    print("Voting Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)

    print()
    print()

    # Bagging Classifier

    base_classifier = KNeighborsClassifier()
    bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)
    bagging_classifier.fit(X_train, y_train)
    y_test_pred=bagging_classifier.predict(X_test)
    report=classification_report(y_test, y_test_pred,output_dict=True)
    print("Bagging Classifier: ")
    print("------------------")
    print('F1 Score in weighted avg: ', report['weighted avg']['f1-score'])
    print(report)


In [None]:
def run_all_models(X_train, y_train, X_test, y_test):
  modelling(X_train, y_train, X_test, y_test, knn=True, svm=True, lr=True, dt=True, rf=True)
  modelling_ensemble(X_train, y_train, X_test, y_test, gb=True, ada=True, ovr = True)
  modeling_complex(X_train, y_train, X_test, y_test)


In [None]:
# Separate the training and test data
train_text = train_df['Clean_Text']
y_train = train_df['Encoded_Labels']
test_text = test_df['Text']
y_test = test_df['Encoded_Labels']

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
run_all_models(X_train, y_train, X_test, y_test)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Separate the training and test data
train_text = train_df['Clean_Text']
y_train = train_df['Encoded_Labels']
test_text = test_df['Text']
y_test = test_df['Encoded_Labels']

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
run_all_models(X_train, y_train, X_test, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.7401812852951598
{'0': {'precision': 0.784957627118644, 'recall': 0.9582929194956353, 'f1-score': 0.8630077158247197, 'support': 6186}, '1': {'precision': 0.7123745819397993, 'recall': 0.28236853733981443, 'f1-score': 0.4044303797468355, 'support': 2263}, 'accuracy': 0.77725174576873, 'macro avg': {'precision': 0.7486661045292217, 'recall': 0.6203307284177249, 'f1-score': 0.6337190477857776, 'support': 8449}, 'weighted avg': {'precision': 0.7655168138579356, 'recall': 0.77725174576873, 'f1-score': 0.7401812852951598, 'support': 8449}}


LR:
------------------
F1 Score in weighted avg:  0.7826241910669196
{'0': {'precision': 0.8246176256372906, 'recall': 0.9151309408341416, 'f1-score': 0.8675197302888669, 'support': 6186}, '1': {'precision': 0.6685606060606061, 'recall': 0.46796288113124174, 'f1-score': 0.5505588770470498, 'support': 2263}, 'accuracy': 0.7953603976801988, 'macro avg': {'precision': 0.7465891158489484, 'recal

In [None]:
#WORD2VEC
from gensim.models import Word2Vec

def train_word2vec(train_df, vector_size=100, window=5, min_count=1, workers=4):
    train_df['tokenized_words'] = train_df['Clean_Text'].apply(lambda x: [x.lower()])

    word2vec_model = Word2Vec(sentences=train_df['tokenized_words'], vector_size=vector_size,
                              window=window, min_count=min_count, workers=workers)

    return word2vec_model


def get_word_vector(model, word):
    if word in model.wv:
        return model.wv[word]
    else:
        return np.zeros(model.vector_size)


# Train Word2Vec model on training data
word2vec_model = train_word2vec(train_df)
word_vectors = word2vec_model.wv
word_vectors.save_word2vec_format("/content/drive/MyDrive/Research/fire drav_sarcasm/word_vectors.txt", binary=False)
test_df['word_vectors'] = [get_word_vector(word2vec_model, word) for word in test_df['Text']]

X_train = [get_word_vector(word2vec_model, word) for word in train_df['Text']]
y_train = train_df['Encoded_Labels']

X_test = np.vstack(test_df['word_vectors'])
y_test = test_df['Encoded_Labels']
run_all_models(X_train, y_train, X_test, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.6192235216962901
{'0': {'precision': 0.7322443181818182, 'recall': 1.0, 'f1-score': 0.8454284542845428, 'support': 6186}, '1': {'precision': 1.0, 'recall': 0.0004418912947414936, 'f1-score': 0.00088339222614841, 'support': 2263}, 'accuracy': 0.7322760089951473, 'macro avg': {'precision': 0.8661221590909092, 'recall': 0.5002209456473707, 'f1-score': 0.4231559232553456, 'support': 8449}, 'weighted avg': {'precision': 0.8039606287457365, 'recall': 0.7322760089951473, 'f1-score': 0.6192235216962901, 'support': 8449}}


LR:
------------------
F1 Score in weighted avg:  0.6189446168762813
{'0': {'precision': 0.7321576517931117, 'recall': 1.0, 'f1-score': 0.845370686709942, 'support': 6186}, '1': {'precision': 1.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2263}, 'accuracy': 0.7321576517931117, 'macro avg': {'precision': 0.8660788258965558, 'recall': 0.5, 'f1-score': 0.422685343354971, 'support': 8449}, 'weighted avg': {'precisio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199769 sha256=d907ad8835b4e7cf91bebe2d88c2aac68984b8b78d119ec58e267a669135dedc
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import fasttext
import pandas as pd
from sklearn.metrics import classification_report

X_train = train_df['Clean_Text'].values
y_train = train_df['Encoded_Labels'].values

X_test = test_df['Text'].values
y_test = test_df['Encoded_Labels'].values

with open('train.txt', 'w') as f:
    for text in X_train:
        f.write(text + '\n')

with open('test.txt', 'w') as f:
    for text in X_test:
        f.write(text + '\n')

model = fasttext.train_unsupervised(input="train.txt", dim=100, epoch=10, lr=0.1)

X_train_embeddings = [model.get_sentence_vector(sent) for sent in X_train]
X_test_embeddings = [model.get_sentence_vector(sent) for sent in X_test]

run_all_models(X_train_embeddings, y_train, X_test_embeddings, y_test)


Random Forest:
------------------
F1 Score in weighted avg:  0.7356680327804733
{'0': {'precision': 0.7843925985518906, 'recall': 0.9456838021338506, 'f1-score': 0.8575197889182058, 'support': 6186}, '1': {'precision': 0.6609485368314834, 'recall': 0.2894387980556783, 'f1-score': 0.4025814382298709, 'support': 2263}, 'accuracy': 0.7699135992425139, 'macro avg': {'precision': 0.7226705676916869, 'recall': 0.6175613000947644, 'f1-score': 0.6300506135740384, 'support': 8449}, 'weighted avg': {'precision': 0.7513290511885007, 'recall': 0.7699135992425139, 'f1-score': 0.7356680327804733, 'support': 8449}}


LR:
------------------
F1 Score in weighted avg:  0.7377259384410005
{'0': {'precision': 0.7888446215139442, 'recall': 0.9282250242483027, 'f1-score': 0.8528778314147791, 'support': 6186}, '1': {'precision': 0.6205128205128205, 'recall': 0.32081307998232433, 'f1-score': 0.4229536848237693, 'support': 2263}, 'accuracy': 0.7655343827671914, 'macro avg': {'precision': 0.7046787210133824, 'r