### Import Libraries

In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

### Fetch and Load Data

In [2]:
!wget https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/train/train.En.csv
!wget https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/test/task_A_En_test.csv

--2023-04-29 23:49:32--  https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/train/train.En.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 491395 (480K) [text/plain]
Saving to: ‘train.En.csv’


2023-04-29 23:49:33 (12.7 MB/s) - ‘train.En.csv’ saved [491395/491395]

--2023-04-29 23:49:33--  https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/test/task_A_En_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 130890 (128K) [text/plain]
Saving to: ‘task_A_En_test.csv’


2023-04-29 23:49:33 (4.95 MB/s) - ‘

In [3]:
df = pd.read_csv('train.En.csv')
df_test_taskA_en = pd.read_csv('task_A_En_test.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Removing rows where tweets are missing
df=df.dropna(subset=['tweet'])
df_test_taskA_en=df_test_taskA_en.dropna(subset=['text'])

print(df.shape)
print(df_test_taskA_en.shape)

(3467, 10)
(1400, 2)


In [6]:
df['sarcastic'].value_counts()

0    2600
1     867
Name: sarcastic, dtype: int64

In [7]:
df_train, df_val = train_test_split(df, test_size=0.2)

## Perform Mutation for Data Augmentation
Here, the setting order refers to the presence/absence of (remove, replace, shuffle) words in that order.

Ex: the setting (0,1,0) means only the replace words operation is performed.

In [8]:
# import from mutant.py
import mutant as mt
aug=mt.TextMutant()

In [15]:
data_aug = aug.create_new_dataset(df_train, ["0","1","0"])
data_aug.to_csv("mutation_010.csv")
data_aug = aug.create_new_dataset(df_train, ["0","0","1"])
data_aug.to_csv("mutation_001.csv")
data_aug = aug.create_new_dataset(df_train, ["1","0","0"])
data_aug.to_csv("mutation_100.csv")
data_aug = aug.create_new_dataset(df_train, ["0","1","1"])
data_aug.to_csv("mutation_011.csv")
data_aug = aug.create_new_dataset(df_train, ["1","1","0"])
data_aug.to_csv("mutation_110.csv")
data_aug = aug.create_new_dataset(df_train, ["1","0","1"])
data_aug.to_csv("mutation_101.csv")
data_aug = aug.create_new_dataset(df_train, ["1","1","1"])
data_aug.to_csv("mutation_111.csv")

In [16]:
mut001=pd.read_csv("mutation_001.csv")
mut010=pd.read_csv("mutation_010.csv")
mut011=pd.read_csv("mutation_011.csv")
mut100=pd.read_csv("mutation_100.csv")
mut101=pd.read_csv("mutation_101.csv")
mut110=pd.read_csv("mutation_110.csv")
mut111=pd.read_csv("mutation_111.csv")

df_train_aug_001 = df_train[["tweet","sarcastic"]].append(mut001[["tweet","sarcastic"]])
df_train_aug_001.head()

  df_train_aug_001 = df_train[["tweet","sarcastic"]].append(mut001[["tweet","sarcastic"]])


Unnamed: 0,tweet,sarcastic
1518,my mom says i need to interact w actual ppl an...,0
3183,if he’s:\n- smooth talkin\n- so rockin\n- got ...,0
1581,Is only 12 hours away from my first #raceforli...,0
2922,My investments investing in me! Sit back and w...,0
2833,Does anyone else get really freaked out when s...,0


In [18]:
df_train_aug_001 = pd.concat((df_train[["tweet","sarcastic"]], mut001[["tweet","sarcastic"]]))
df_train_aug_010 = pd.concat((df_train[["tweet","sarcastic"]], mut010[["tweet","sarcastic"]]))
df_train_aug_011 = pd.concat((df_train[["tweet","sarcastic"]], mut011[["tweet","sarcastic"]]))
df_train_aug_100 = pd.concat((df_train[["tweet","sarcastic"]], mut100[["tweet","sarcastic"]]))
df_train_aug_101 = pd.concat((df_train[["tweet","sarcastic"]], mut101[["tweet","sarcastic"]]))
df_train_aug_110 = pd.concat((df_train[["tweet","sarcastic"]], mut110[["tweet","sarcastic"]]))
df_train_aug_111 = pd.concat((df_train[["tweet","sarcastic"]], mut111[["tweet","sarcastic"]]))

In [19]:
df_train_aug_111.sarcastic.value_counts()

0    4210
1    1336
Name: sarcastic, dtype: int64

In [20]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics, utils
import sklearn
from xgboost import XGBClassifier
import xgboost as xgb 
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tokens_val = df_val['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values
tokens_test = df_test_taskA_en['text'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values

y_val = df_val["sarcastic"].values
y_test = df_test_taskA_en['sarcastic'].values
 
dataaug_list = [df_train_aug_110, df_train_aug_111, df_train_aug_101,
                df_train_aug_100, df_train_aug_011, df_train_aug_010, 
                df_train_aug_001]

for idx, df_curr in enumerate(dataaug_list):
  print(f'Mutation: {idx}')
  
  tokens_train = df_curr['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values

  tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
  xtrain_tfidf = tfidf_vectorizer.fit_transform(tokens_train)  
  xval_tfidf = tfidf_vectorizer.transform(tokens_val)
  xtest_tfidf= tfidf_vectorizer.transform(tokens_test)

  print(xtrain_tfidf.shape, xval_tfidf.shape, xtest_tfidf.shape)

  y_train = df_curr['sarcastic'].values
  
  rf = RandomForestClassifier(class_weight='balanced_subsample')
  grid_rf = GridSearchCV(rf, {'n_estimators':[300,700], 'max_depth':[7,11]})

  xgb_classifier = XGBClassifier()
  grid_xgb = GridSearchCV(xgb_classifier, {'n_estimators':[500,700], 
                                          'max_depth':[7,11]})

  lr = LogisticRegressionCV(max_iter=1000)

  lr_model = lr.fit(xtrain_tfidf, y_train)
  rf_model = grid_rf.fit(xtrain_tfidf, y_train)
  xgb_model = grid_xgb.fit(xtrain_tfidf, y_train)

  y_pred_lr = lr_model.predict(xval_tfidf)
  y_pred_rf = rf_model.predict(xval_tfidf)
  y_pred_xgb = xgb_model.predict(xval_tfidf)

  print(f'\tVal Accuracy RF: {round(np.mean(y_pred_rf==y_val), 3)}')
  print(f'\tVal Accuracy LR: {round(np.mean(y_pred_lr==y_val), 3)}')
  print(f'\tVal Accuracy XGB: {round(np.mean(y_pred_xgb==y_val), 3)}')

  print(f'\tVal F1 score RF: {round(f1_score(y_val, y_pred_rf), 3)}')
  print(f'\tVal F1 score LR: {round(f1_score(y_val, y_pred_lr), 3)}')
  print(f'\tVal F1 score XGB: {round(f1_score(y_val, y_pred_xgb), 3)}')

  y_pred_lr = lr_model.predict(xtest_tfidf)
  y_pred_rf = rf_model.predict(xtest_tfidf)
  y_pred_xgb = xgb_model.predict(xtest_tfidf)

  print(f'\tTest Accuracy RF: {round(np.mean(y_pred_rf==y_test), 3)}')
  print(f'\tTest Accuracy LR: {round(np.mean(y_pred_lr==y_test), 3)}')
  print(f'\tTest Accuracy XGB: {round(np.mean(y_pred_xgb==y_test), 3)}')

  print(f'\tTest F1 score RF: {round(f1_score(y_test, y_pred_rf), 3)}')
  print(f'\tTest F1 score LR: {round(f1_score(y_test, y_pred_lr), 3)}')
  print(f'\tTest F1 score XGB: {round(f1_score(y_test, y_pred_xgb), 3)}')


Mutation: 0
(5546, 1000) (694, 1000) (1400, 1000)
	Val Accuracy RF: 0.677
	Val Accuracy LR: 0.723
	Val Accuracy XGB: 0.696
	Val F1 score RF: 0.353
	Val F1 score LR: 0.25
	Val F1 score XGB: 0.265
	Test Accuracy RF: 0.774
	Test Accuracy LR: 0.795
	Test Accuracy XGB: 0.793
	Test F1 score RF: 0.368
	Test F1 score LR: 0.178
	Test F1 score XGB: 0.233
Mutation: 1
(5546, 1000) (694, 1000) (1400, 1000)
	Val Accuracy RF: 0.656
	Val Accuracy LR: 0.705
	Val Accuracy XGB: 0.7
	Val F1 score RF: 0.356
	Val F1 score LR: 0.244
	Val F1 score XGB: 0.297
	Test Accuracy RF: 0.75
	Test Accuracy LR: 0.811
	Test Accuracy XGB: 0.792
	Test F1 score RF: 0.357
	Test F1 score LR: 0.205
	Test F1 score XGB: 0.24
Mutation: 2
(5546, 1000) (694, 1000) (1400, 1000)
	Val Accuracy RF: 0.657
	Val Accuracy LR: 0.702
	Val Accuracy XGB: 0.68
	Val F1 score RF: 0.32
	Val F1 score LR: 0.247
	Val F1 score XGB: 0.26
	Test Accuracy RF: 0.794
	Test Accuracy LR: 0.799
	Test Accuracy XGB: 0.785
	Test F1 score RF: 0.385
	Test F1 score 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.682
	Val Accuracy LR: 0.653
	Val Accuracy XGB: 0.676
	Val F1 score RF: 0.363
	Val F1 score LR: 0.387
	Val F1 score XGB: 0.286
	Test Accuracy RF: 0.783
	Test Accuracy LR: 0.669
	Test Accuracy XGB: 0.775
	Test F1 score RF: 0.385
	Test F1 score LR: 0.269
	Test F1 score XGB: 0.222


### Augmentation Balanced

In [None]:
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["0","1","0"])
data_aug.to_csv("mutation_010_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["0","0","1"])
data_aug.to_csv("mutation_001_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["1","0","0"])
data_aug.to_csv("mutation_100_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["0","1","1"])
data_aug.to_csv("mutation_011_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["1","1","0"])
data_aug.to_csv("mutation_110_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["1","0","1"])
data_aug.to_csv("mutation_101_sarcastic.csv")
data_aug = aug.create_new_dataset(df_train[df_train['sarcastic']==1], ["1","1","1"])
data_aug.to_csv("mutation_111_sarcastic.csv")

In [26]:
mut001=pd.read_csv("mutation_001_sarcastic.csv")
mut010=pd.read_csv("mutation_010_sarcastic.csv")
mut011=pd.read_csv("mutation_011_sarcastic.csv")
mut100=pd.read_csv("mutation_100_sarcastic.csv")
mut101=pd.read_csv("mutation_101_sarcastic.csv")
mut110=pd.read_csv("mutation_110_sarcastic.csv")
mut111=pd.read_csv("mutation_111_sarcastic.csv")

In [27]:
df_train_aug_001 = pd.concat((df_train[["tweet","sarcastic"]], mut001[["tweet","sarcastic"]]))
df_train_aug_010 = pd.concat((df_train[["tweet","sarcastic"]], mut010[["tweet","sarcastic"]]))
df_train_aug_011 = pd.concat((df_train[["tweet","sarcastic"]], mut011[["tweet","sarcastic"]]))
df_train_aug_100 = pd.concat((df_train[["tweet","sarcastic"]], mut100[["tweet","sarcastic"]]))
df_train_aug_101 = pd.concat((df_train[["tweet","sarcastic"]], mut101[["tweet","sarcastic"]]))
df_train_aug_110 = pd.concat((df_train[["tweet","sarcastic"]], mut110[["tweet","sarcastic"]]))
df_train_aug_111 = pd.concat((df_train[["tweet","sarcastic"]], mut111[["tweet","sarcastic"]]))

In [29]:
df_train_aug_111.sarcastic.value_counts()

0    2105
1    1336
Name: sarcastic, dtype: int64

In [30]:
tokens_val = df_val['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values
tokens_test = df_test_taskA_en['text'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values

y_val = df_val["sarcastic"].values
y_test = df_test_taskA_en['sarcastic'].values
 
dataaug_list = [df_train_aug_110, df_train_aug_111, df_train_aug_101,
                df_train_aug_100, df_train_aug_011, df_train_aug_010, 
                df_train_aug_001]

for idx, df_curr in enumerate(dataaug_list):
  print(f'Mutation: {idx}')
  
  tokens_train = df_curr['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values

  tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
  xtrain_tfidf = tfidf_vectorizer.fit_transform(tokens_train)  
  xval_tfidf = tfidf_vectorizer.transform(tokens_val)
  xtest_tfidf= tfidf_vectorizer.transform(tokens_test)

  print(xtrain_tfidf.shape, xval_tfidf.shape, xtest_tfidf.shape)

  y_train = df_curr['sarcastic'].values
  
  rf = RandomForestClassifier(class_weight='balanced_subsample')
  grid_rf = GridSearchCV(rf, {'n_estimators':[300,700], 'max_depth':[7,11]})

  xgb_classifier = XGBClassifier()
  grid_xgb = GridSearchCV(xgb_classifier, {'n_estimators':[500,700], 
                                          'max_depth':[7,11]})

  lr = LogisticRegressionCV(max_iter=1000)

  lr_model = lr.fit(xtrain_tfidf, y_train)
  rf_model = grid_rf.fit(xtrain_tfidf, y_train)
  xgb_model = grid_xgb.fit(xtrain_tfidf, y_train)

  y_pred_lr = lr_model.predict(xval_tfidf)
  y_pred_rf = rf_model.predict(xval_tfidf)
  y_pred_xgb = xgb_model.predict(xval_tfidf)

  print(f'\tVal Accuracy RF: {round(np.mean(y_pred_rf==y_val), 3)}')
  print(f'\tVal Accuracy LR: {round(np.mean(y_pred_lr==y_val), 3)}')
  print(f'\tVal Accuracy XGB: {round(np.mean(y_pred_xgb==y_val), 3)}')

  print(f'\tVal F1 score RF: {round(f1_score(y_val, y_pred_rf), 3)}')
  print(f'\tVal F1 score LR: {round(f1_score(y_val, y_pred_lr), 3)}')
  print(f'\tVal F1 score XGB: {round(f1_score(y_val, y_pred_xgb), 3)}')

  y_pred_lr = lr_model.predict(xtest_tfidf)
  y_pred_rf = rf_model.predict(xtest_tfidf)
  y_pred_xgb = xgb_model.predict(xtest_tfidf)

  print(f'\tTest Accuracy RF: {round(np.mean(y_pred_rf==y_test), 3)}')
  print(f'\tTest Accuracy LR: {round(np.mean(y_pred_lr==y_test), 3)}')
  print(f'\tTest Accuracy XGB: {round(np.mean(y_pred_xgb==y_test), 3)}')

  print(f'\tTest F1 score RF: {round(f1_score(y_test, y_pred_rf), 3)}')
  print(f'\tTest F1 score LR: {round(f1_score(y_test, y_pred_lr), 3)}')
  print(f'\tTest F1 score XGB: {round(f1_score(y_test, y_pred_xgb), 3)}')

Mutation: 0
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.692
	Val Accuracy LR: 0.715
	Val Accuracy XGB: 0.663
	Val F1 score RF: 0.078
	Val F1 score LR: 0.124
	Val F1 score XGB: 0.32
	Test Accuracy RF: 0.844
	Test Accuracy LR: 0.832
	Test Accuracy XGB: 0.747
	Test F1 score RF: 0.167
	Test F1 score LR: 0.106
	Test F1 score XGB: 0.259
Mutation: 1
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.699
	Val Accuracy LR: 0.712
	Val Accuracy XGB: 0.663
	Val F1 score RF: 0.14
	Val F1 score LR: 0.107
	Val F1 score XGB: 0.328
	Test Accuracy RF: 0.829
	Test Accuracy LR: 0.84
	Test Accuracy XGB: 0.736
	Test F1 score RF: 0.184
	Test F1 score LR: 0.138
	Test F1 score XGB: 0.242
Mutation: 2
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.589
	Val Accuracy LR: 0.635
	Val Accuracy XGB: 0.643
	Val F1 score RF: 0.424
	Val F1 score LR: 0.363
	Val F1 score XGB: 0.371
	Test Accuracy RF: 0.538
	Test Accuracy LR: 0.677
	Test Accuracy XGB: 0.708
	Test F1 score RF: 0.259
	Test F1 score LR: 0.289
	Test F1 score XGB: 0.263
Mutation: 3
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Val Accuracy RF: 0.581
	Val Accuracy LR: 0.634
	Val Accuracy XGB: 0.651
	Val F1 score RF: 0.421
	Val F1 score LR: 0.38
	Val F1 score XGB: 0.37
	Test Accuracy RF: 0.523
	Test Accuracy LR: 0.649
	Test Accuracy XGB: 0.708
	Test F1 score RF: 0.256
	Test F1 score LR: 0.27
	Test F1 score XGB: 0.274
Mutation: 4
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.7
	Val Accuracy LR: 0.702
	Val Accuracy XGB: 0.669
	Val F1 score RF: 0.103
	Val F1 score LR: 0.321
	Val F1 score XGB: 0.299
	Test Accuracy RF: 0.844
	Test Accuracy LR: 0.762
	Test Accuracy XGB: 0.73
	Test F1 score RF: 0.167
	Test F1 score LR: 0.227
	Test F1 score XGB: 0.244
Mutation: 5
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.706
	Val Accuracy LR: 0.686
	Val Accuracy XGB: 0.682
	Val F1 score RF: 0.038
	Val F1 score LR: 0.278
	Val F1 score XGB: 0.336
	Test Accuracy RF: 0.843
	Test Accuracy LR: 0.763
	Test Accuracy XGB: 0.749
	Test F1 score RF: 0.06
	Test F1 score LR: 0.213
	Test F1 score XGB: 0.248
Mutation: 6
(3441, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	Val Accuracy RF: 0.651
	Val Accuracy LR: 0.618
	Val Accuracy XGB: 0.671
	Val F1 score RF: 0.363
	Val F1 score LR: 0.355
	Val F1 score XGB: 0.325
	Test Accuracy RF: 0.744
	Test Accuracy LR: 0.65
	Test Accuracy XGB: 0.762
	Test F1 score RF: 0.37
	Test F1 score LR: 0.26
	Test F1 score XGB: 0.281


### Without Augmentation

In [24]:
df_train_without_aug = df_train[["tweet","sarcastic"]]
tokens_train = df_train_without_aug['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values
tokens_val = df_val['tweet'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values
tokens_test = df_test_taskA_en['text'].apply(lambda x: ' '.join([val for val in word_tokenize(x.lower()) if val not in stop_words])).values

tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
xtrain_tfidf = tfidf_vectorizer.fit_transform(tokens_train)
xval_tfidf = tfidf_vectorizer.transform(tokens_val)
xtest_tfidf= tfidf_vectorizer.transform(tokens_test)

print(xtrain_tfidf.shape, xval_tfidf.shape, xtest_tfidf.shape)

y_train = df_train_without_aug['sarcastic'].values
y_val = df_val["sarcastic"].values
y_test = df_test_taskA_en['sarcastic'].values
y_train.shape, y_val.shape, y_test.shape

rf = RandomForestClassifier(class_weight='balanced_subsample')
grid_rf = GridSearchCV(rf, {'n_estimators':[300,700], 'max_depth':[7,11]})

xgb_classifier = XGBClassifier()
grid_xgb = GridSearchCV(xgb_classifier, {'n_estimators':[500,700], 
                                         'max_depth':[7,11]})

lr = LogisticRegressionCV(max_iter=1000)

lr_model = lr.fit(xtrain_tfidf, y_train)
rf_model = grid_rf.fit(xtrain_tfidf, y_train)
xgb_model = grid_xgb.fit(xtrain_tfidf, y_train)

y_pred_lr = lr_model.predict(xval_tfidf)
y_pred_rf = rf_model.predict(xval_tfidf)
y_pred_xgb = xgb_model.predict(xval_tfidf)

print(f'Accuracy RF: {round(np.mean(y_pred_rf==y_val), 3)}')
print(f'Accuracy LR: {round(np.mean(y_pred_lr==y_val), 3)}')
print(f'Accuracy XGB: {round(np.mean(y_pred_xgb==y_val), 3)}')

print(f'F1 score RF: {round(f1_score(y_val, y_pred_rf), 3)}')
print(f'F1 score LR: {round(f1_score(y_val, y_pred_lr), 3)}')
print(f'F1 score XGB: {round(f1_score(y_val, y_pred_xgb), 3)}')

y_pred_lr = lr_model.predict(xtest_tfidf)
y_pred_rf = rf_model.predict(xtest_tfidf)
y_pred_xgb = xgb_model.predict(xtest_tfidf)

print(f'Accuracy RF: {round(np.mean(y_pred_rf==y_test), 3)}')
print(f'Accuracy LR: {round(np.mean(y_pred_lr==y_test), 3)}')
print(f'Accuracy XGB: {round(np.mean(y_pred_xgb==y_test), 3)}')

print(f'F1 score RF: {round(f1_score(y_test, y_pred_rf), 3)}')
print(f'F1 score LR: {round(f1_score(y_test, y_pred_lr), 3)}')
print(f'F1 score XGB: {round(f1_score(y_test, y_pred_xgb), 3)}')


(2773, 1000) (694, 1000) (1400, 1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy RF: 0.677
Accuracy LR: 0.713
Accuracy XGB: 0.686
F1 score RF: 0.345
F1 score LR: 0.0
F1 score XGB: 0.273
Accuracy RF: 0.781
Accuracy LR: 0.857
Accuracy XGB: 0.8
F1 score RF: 0.376
F1 score LR: 0.0
F1 score XGB: 0.271
