# Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install xgboost



In [2]:
import pandas as pd
import torch
from simpletransformers.language_representation import RepresentationModel
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Loading the datasets

In [3]:
df = pd.read_csv('../input/abusive-comment-detection/Abusive_Comment_Tamil_train.csv',sep='\t',header=None)
df_test = pd.read_csv('../input/abusive-comment-detection/Abusive_Comment_Tamil_test.csv',sep='\t',header=None)
df_eval = pd.read_csv('../input/abusive-comment-detection/Abusive_Comment_Tamil_dev.csv',sep='\t',header=None)

In [4]:
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]

In [5]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])

In [6]:
df_test = df_test.dropna()
df_test = df_test.reset_index().drop(['index'],axis=1)

# Getting Sentence Encodings with Context using Bert

In [7]:
model = RepresentationModel(
        model_type="bert",
        model_name="bert-base-multilingual-cased",
        use_cuda=True
    )

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTextRepresentation: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [8]:
train_sentence_vectors = model.encode_sentences(df['Text'].to_list(), combine_strategy="mean")

In [9]:
eval_sentences = df_eval['Text'].to_list()
eval_sentence_vectors = model.encode_sentences(eval_sentences, combine_strategy="mean")

In [10]:
test_sentences = df_test['Text'].to_list()
test_sentence_vectors = model.encode_sentences(test_sentences, combine_strategy="mean")

In [11]:
train_sentence_vectors.shape

(2240, 768)

# Model Training

## Logistic Regression

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Grid Search CV

In [13]:
combined_df = pd.concat([df,df_eval],ignore_index=True)

In [14]:
combined_encodings = np.concatenate((train_sentence_vectors, eval_sentence_vectors))
combined_encodings.shape

(2800, 768)

In [15]:
# warnings.filterwarnings('ignore')
# parameters = {
#     'penalty' : ['l1','l2'], 
#     'C'       : np.logspace(-3,3,7),
#     'solver'  : ['newton-cg', 'lbfgs', 'liblinear']
# }

# logreg = LogisticRegression()
# clf = GridSearchCV(logreg,                    
#                    param_grid = parameters,   
#                    scoring='f1_weighted',        
#                    cv=10)
# clf.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", clf.best_params_)
# print("Weighted f1_score :",clf.best_score_)

#### Model Training

In [16]:
lm = LogisticRegression(C=1.0,multi_class='ovr', solver='liblinear',penalty='l1')
lm.fit(train_sentence_vectors, df['Labels'].to_list())

LogisticRegression(multi_class='ovr', penalty='l1', solver='liblinear')

In [17]:
log_test_preds = lm.predict(test_sentence_vectors)

In [18]:
report_log = classification_report(df_test['Labels'].to_list(),log_test_preds,output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
df_log = pd.DataFrame(report_log).transpose()
df_log

Unnamed: 0,precision,recall,f1-score,support
0,0.272727,0.115385,0.162162,26.0
1,0.698,0.838942,0.762009,416.0
2,1.0,0.125,0.222222,8.0
3,0.569106,0.551181,0.56,127.0
4,0.25,0.191489,0.216867,47.0
5,0.631579,0.25,0.358209,48.0
6,0.0,0.0,0.0,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.635193,0.635193,0.635193,0.635193
macro avg,0.427676,0.259,0.285184,699.0


## Decision Trees

#### Grid Search CV

In [20]:
# warnings.filterwarnings('ignore')
# params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
# grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, scoring='f1_weighted',verbose=1, cv=5)
# grid_search_cv.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_search_cv.best_params_)
# print("Weighted f1_score :",grid_search_cv.best_score_)

Tuned Hyperparameters : {'max_leaf_nodes': 42, 'min_samples_split': 2}
Weighted f1_score : 0.5224775860387323

#### Model Training

In [21]:
dtree_model = DecisionTreeClassifier(max_leaf_nodes=42,min_samples_split=2).fit(train_sentence_vectors, df['Labels'].to_list())

In [22]:
tree_preds = dtree_model.predict(test_sentence_vectors)
report = classification_report(df_test['Labels'].to_list(),tree_preds,output_dict=True)
final_report = pd.DataFrame(report).transpose()
final_report

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,26.0
1,0.624549,0.831731,0.713402,416.0
2,0.2,0.125,0.153846,8.0
3,0.371901,0.354331,0.362903,127.0
4,0.266667,0.085106,0.129032,47.0
5,0.0,0.0,0.0,48.0
6,0.0,0.0,0.0,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.566524,0.566524,0.566524,0.566524
macro avg,0.18289,0.174521,0.169898,699.0


## SVC

#### Grid Search CV

In [23]:
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','linear','poly','sigmoid']}
# grid = GridSearchCV(SVC(), param_grid, scoring='f1_weighted',cv=5)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

Tuned Hyperparameters : {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Weighted f1_score : 0.615556790854965

#### Model Training

In [24]:
svm = SVC(C=10,gamma=0.01,kernel='rbf').fit(train_sentence_vectors,df['Labels'].to_list())

In [25]:
svm_preds = svm.predict(test_sentence_vectors)
svm_report = classification_report(df_test['Labels'].to_list(),svm_preds,output_dict=True)
svm_report = pd.DataFrame(svm_report).transpose()
svm_report

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.214286,0.115385,0.15,26.0
1,0.696078,0.853365,0.766739,416.0
2,0.6,0.375,0.461538,8.0
3,0.564356,0.448819,0.5,127.0
4,0.305556,0.234043,0.26506,47.0
5,0.6,0.3125,0.410959,48.0
6,0.25,0.08,0.121212,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.638054,0.638054,0.638054,0.638054
macro avg,0.403785,0.302389,0.334439,699.0


## Random Forest

####  Grid Search CV

In [26]:
# # Number of trees in random forest
# n_estimators = [10,100,200,500,1000]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# bootstrap = [True, False]

# param_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'bootstrap': bootstrap}
# grid = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1_weighted',cv=2)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

Tuned Hyperparameters obtained from GridSearchCV: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}

#### Model Training

In [27]:
rf_params = {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
random_clf = RandomForestClassifier(n_estimators=rf_params['n_estimators'],max_features=rf_params['max_features'],bootstrap=rf_params['bootstrap']).fit(train_sentence_vectors, df['Labels'].to_list())

In [28]:
random_pred = random_clf.predict(test_sentence_vectors)
report_random = classification_report(df_test['Labels'].to_list(),random_pred,output_dict=True)
df_random = pd.DataFrame(report_random).transpose()
df_random

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,26.0
1,0.646965,0.973558,0.777351,416.0
2,0.0,0.0,0.0,8.0
3,0.571429,0.283465,0.378947,127.0
4,0.375,0.06383,0.109091,47.0
5,1.0,0.041667,0.08,48.0
6,0.0,0.0,0.0,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.638054,0.638054,0.638054,0.638054
macro avg,0.324174,0.170315,0.168174,699.0


## XG Boost


#### Grid SearchCV

In [29]:
# xg_grid_params = {
#     'gamma': range(0,100,20),
#     'max_depth': [1,5,10],
#     'lambda':[0.01,0.1,1,10,100],
#     'alpha':[0.001,0.01,0.1,1,10],
#     'learning_rate':[0.01,0.1,1,10]
# }
# warnings.filterwarnings('ignore')
# grid_xg = grid = GridSearchCV(XGBClassifier(), xg_grid_params, scoring='f1_weighted')
# grid_xg.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_xg.best_params_)
# print("Weighted f1_score :",grid_xg.best_score_)

#### Model Training

In [30]:
model_xg = XGBClassifier().fit(train_sentence_vectors, df['Labels'].to_list())





In [31]:
y_preds_xg = model_xg.predict(test_sentence_vectors)
report_xg = classification_report(df_test['Labels'].to_list(),y_preds_xg,output_dict=True)
df_xg = pd.DataFrame(report_xg).transpose()
df_xg

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,26.0
1,0.676208,0.908654,0.775385,416.0
2,0.5,0.125,0.2,8.0
3,0.554455,0.440945,0.491228,127.0
4,0.333333,0.170213,0.225352,47.0
5,0.7,0.145833,0.241379,48.0
6,0.5,0.04,0.074074,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.645207,0.645207,0.645207,0.645207
macro avg,0.408,0.228831,0.250927,699.0


## Multi Layer Perceptron

#### Grid SearchCV

In [32]:
# mlp_params = {
#     'activation':['identity', 'logistic', 'tanh', 'relu'],
#     'solver':['lbfgs', 'sgd', 'adam'],
#     'alpha':[0.0001,0.001,0.01,0.1],
# }
# warnings.filterwarnings('ignore')
# mlp_grid = GridSearchCV(MLPClassifier(), mlp_params, scoring='f1_weighted')
# mlp_grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", mlp_grid.best_params_)
# print("Weighted f1_score :",mlp_grid.best_score_)

Tuned Hyperparameters obtained from grid search : {'activation': 'logistic', 'alpha': 0.1, 'solver': 'adam'}

#### Model Training

In [33]:
clf = MLPClassifier(activation='logistic',alpha=0.1,solver='adam').fit(train_sentence_vectors, df['Labels'].to_list())
mlp_predictions = clf.predict(test_sentence_vectors)



In [34]:
report_mlp = classification_report(df_test['Labels'].to_list(),mlp_predictions,output_dict=True)
df_mlp = pd.DataFrame(report_mlp).transpose()
df_mlp

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.315789,0.230769,0.266667,26.0
1,0.742788,0.742788,0.742788,416.0
2,0.5,0.25,0.333333,8.0
3,0.510638,0.566929,0.537313,127.0
4,0.269841,0.361702,0.309091,47.0
5,0.461538,0.375,0.413793,48.0
6,0.176471,0.12,0.142857,25.0
7,0.0,0.0,0.0,2.0
accuracy,0.610873,0.610873,0.610873,0.610873
macro avg,0.372133,0.330899,0.34323,699.0
