# Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install xgboost



In [2]:
import warnings

import numpy as np
import pandas as pd
import torch
from simpletransformers.language_representation import RepresentationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

# Loading the datasets

In [3]:
# Read the three splits (train / test / dev). Each file is tab-separated and
# has no header row, so pandas assigns the integer column names 0 and 1.
df = pd.read_csv(
    '../input/abusive-comment-detection/Abusive_Comment_Codemixed_train.csv',
    header=None,
    sep='\t',
)
df_test = pd.read_csv(
    '../input/abusive-comment-detection/Abusive_Comment_Codemixed_test.csv',
    header=None,
    sep='\t',
)
df_eval = pd.read_csv(
    '../input/abusive-comment-detection/Abusive_Comment_Codemixed_dev.csv',
    header=None,
    sep='\t',
)

In [4]:
# Give the two positional columns readable names (0 -> Labels, 1 -> Text).
# The test and dev frames are additionally reordered to Text, Labels.
df = df.rename(columns={0: 'Labels', 1: 'Text'})
df_test = df_test.rename(columns={0: 'Labels', 1: 'Text'})[['Text', 'Labels']]
df_eval = df_eval.rename(columns={0: 'Labels', 1: 'Text'})[['Text', 'Labels']]

In [5]:
# Build the label -> integer id mapping from the training split only; ids follow
# the order in which each label first appears in the training data.
keys = list(df['Labels'].unique())
num_labels = len(keys)
values = list(range(num_labels))
label_dict = dict(zip(keys, values))

# Apply the same mapping to every split. A label that never occurs in the
# training split raises KeyError here.
for _frame in (df, df_test, df_eval):
    _frame['Labels'] = _frame['Labels'].apply(lambda label: label_dict[label])

num_labels

8

In [6]:
# Discard test rows with any missing value and renumber the rows from 0.
df_test = df_test.dropna().reset_index(drop=True)

In [7]:
# Discard training rows with any missing value and renumber the rows from 0.
df = df.dropna().reset_index(drop=True)

In [8]:
# Discard dev rows with any missing value and renumber the rows from 0.
df_eval = df_eval.dropna().reset_index(drop=True)

# Getting Contextual Sentence Encodings with BERT

In [9]:
# Load multilingual BERT as a sentence encoder; this notebook only calls
# encode_sentences on it and never fine-tunes it.
# Use the GPU when one is present and fall back to CPU otherwise: a hard-coded
# use_cuda=True makes the notebook fail on CPU-only machines.
model = RepresentationModel(
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    use_cuda=torch.cuda.is_available(),
)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTextRepresentation: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [10]:
# Encode every training comment into one fixed-size vector using the "mean"
# combine strategy.
train_sentence_vectors = model.encode_sentences(
    df['Text'].tolist(),
    combine_strategy="mean",
)

In [11]:
# Encode the dev comments with the same model and combine strategy as the
# training split.
eval_sentences = df_eval['Text'].tolist()
eval_sentence_vectors = model.encode_sentences(
    eval_sentences,
    combine_strategy="mean",
)

In [12]:
# Encode the test comments with the same model and combine strategy as the
# training split.
test_sentences = df_test['Text'].tolist()
test_sentence_vectors = model.encode_sentences(
    test_sentences,
    combine_strategy="mean",
)

In [13]:
# Display the shape of the training encodings (rows = comments, columns = embedding size).
train_sentence_vectors.shape

(5943, 768)

# Model Training

## Logistic Regression

In [14]:
# Hide FutureWarning messages so they do not clutter the training output.
# `warnings` is imported together with the other modules in the import cell at the top.
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Grid Search CV

In [15]:
# Stack train and dev rows into one frame with a fresh 0..n-1 index; this is
# the data the cross-validated grid searches in this notebook are fitted on.
combined_df = pd.concat(objs=[df, df_eval], ignore_index=True)

In [16]:
# Stack the train and dev encodings row-wise, in the same order as combined_df,
# so that row i of the matrix belongs to row i of the frame.
combined_encodings = np.concatenate(
    [train_sentence_vectors, eval_sentence_vectors],
    axis=0,
)
combined_encodings.shape

(7429, 768)

In [17]:
# Hyperparameter search for logistic regression, kept for reference but disabled.
# It scores every candidate with 10-fold cross-validated weighted F1 on the
# combined train+dev encodings.
# NOTE(review): in scikit-learn the 'newton-cg' and 'lbfgs' solvers do not
# support the 'l1' penalty, so those grid combinations fail to fit; with all
# warnings silenced the failures go unnoticed. Fix the grid before re-enabling.
# warnings.filterwarnings('ignore')
# parameters = {
#     'penalty' : ['l1','l2'], 
#     'C'       : [0.01,0.1,1,10,100],
#     'solver'  : ['newton-cg', 'lbfgs', 'liblinear']
# }

# logreg = LogisticRegression()
# clf = GridSearchCV(logreg,                    
#                    param_grid = parameters,   
#                    scoring='f1_weighted',        
#                    cv=10)
# clf.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", clf.best_params_)
# print("Weighted f1_score :",clf.best_score_)

#### Model Training

In [18]:
# Fit a one-vs-rest, L1-regularised logistic regression on the training encodings.
# The liblinear solver shuffles the data internally, so random_state is pinned
# to make the reported scores reproducible across runs.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
lm = LogisticRegression(
    C=1.0,
    multi_class='ovr',
    solver='liblinear',
    penalty='l1',
    random_state=42,
)
lm.fit(train_sentence_vectors, df['Labels'].to_list())

LogisticRegression(multi_class='ovr', penalty='l1', solver='liblinear')

In [19]:
# Predict a class id for every test comment from its sentence encoding.
log_test_preds = lm.predict(test_sentence_vectors)

In [20]:
# Per-class precision / recall / F1 on the test split, returned as a nested
# dict so it can be turned into a table.
report_log = classification_report(
    y_true=df_test['Labels'].to_list(),
    y_pred=log_test_preds,
    output_dict=True,
)

In [21]:
# One row per class (plus the summary rows), one column per metric.
df_log = pd.DataFrame(report_log).T
df_log

Unnamed: 0,precision,recall,f1-score,support
0,0.743972,0.919369,0.822423,1141.0
1,0.222222,0.034483,0.059701,58.0
2,0.322581,0.340909,0.331492,88.0
3,0.566116,0.469178,0.513109,292.0
4,0.357143,0.089286,0.142857,56.0
5,0.333333,0.085714,0.136364,70.0
6,0.728814,0.452632,0.558442,95.0
7,0.25,0.052632,0.086957,57.0
accuracy,0.686591,0.686591,0.686591,0.686591
macro avg,0.440522,0.305525,0.331418,1857.0


## Decision Trees

#### Grid Search CV

In [22]:
# Hyperparameter search for the decision tree, kept for reference but disabled.
# It tries max_leaf_nodes in 2..99 and min_samples_split in {2, 3, 4} with
# 5-fold cross-validated weighted F1 on the combined train+dev encodings.
# The best parameters from an earlier run are recorded right below this cell.
# warnings.filterwarnings('ignore')
# params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
# grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, scoring='f1_weighted',verbose=1, cv=5)
# grid_search_cv.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_search_cv.best_params_)
# print("Weighted f1_score :",grid_search_cv.best_score_)

Tuned Hyperparameters : {'max_leaf_nodes': 42, 'min_samples_split': 2}
Weighted f1_score : 0.5224775860387323

#### Model Training

In [23]:
# Train the decision tree with the tuned hyperparameters.
# random_state is fixed (same seed as in the grid search) because the tree
# permutes features at every split; without it repeated runs can produce
# different trees and different scores.
dtree_model = DecisionTreeClassifier(
    max_leaf_nodes=42,
    min_samples_split=2,
    random_state=42,
).fit(train_sentence_vectors, df['Labels'].to_list())

In [24]:
# Score the decision tree on the test split.
tree_preds = dtree_model.predict(test_sentence_vectors)
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those metrics as 0 (the same value the default
# produces) without emitting an UndefinedMetricWarning for every average.
report = classification_report(
    df_test['Labels'].to_list(),
    tree_preds,
    output_dict=True,
    zero_division=0,
)
final_report = pd.DataFrame(report).transpose()
final_report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.650179,0.956179,0.774033,1141.0
1,0.0,0.0,0.0,58.0
2,0.242424,0.181818,0.207792,88.0
3,0.469027,0.181507,0.261728,292.0
4,0.0,0.0,0.0,56.0
5,0.0,0.0,0.0,70.0
6,0.0,0.0,0.0,95.0
7,0.0,0.0,0.0,57.0
accuracy,0.624663,0.624663,0.624663,0.624663
macro avg,0.170204,0.164938,0.155444,1857.0


## SVC

#### Grid Search CV

In [25]:
# Hyperparameter search for the SVM, kept for reference but disabled.
# It explores C, gamma and the kernel type with 5-fold cross-validated weighted
# F1 on the combined train+dev encodings. The best parameters from an earlier
# run are recorded right below this cell.
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','linear','poly','sigmoid']}
# grid = GridSearchCV(SVC(), param_grid, scoring='f1_weighted',cv=5)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

Tuned Hyperparameters : {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Weighted f1_score : 0.615556790854965

#### Model Training

In [26]:
# Train an RBF-kernel SVM with the tuned C and gamma on the training encodings.
svm = SVC(
    kernel='rbf',
    C=10,
    gamma=0.01,
).fit(train_sentence_vectors, df['Labels'].to_list())

In [27]:
# Score the SVM on the test split and show the per-class metrics as a table.
svm_preds = svm.predict(test_sentence_vectors)
svm_report = pd.DataFrame(
    classification_report(
        y_true=df_test['Labels'].to_list(),
        y_pred=svm_preds,
        output_dict=True,
    )
).T
svm_report

Unnamed: 0,precision,recall,f1-score,support
0,0.739649,0.923751,0.821512,1141.0
1,0.4,0.034483,0.063492,58.0
2,0.3125,0.340909,0.326087,88.0
3,0.598291,0.479452,0.532319,292.0
4,0.461538,0.107143,0.173913,56.0
5,0.166667,0.042857,0.068182,70.0
6,0.754717,0.421053,0.540541,95.0
7,0.230769,0.052632,0.085714,57.0
accuracy,0.688207,0.688207,0.688207,0.688207
macro avg,0.458016,0.300285,0.32647,1857.0


## Random Forest

#### Grid Search CV

In [28]:
# Hyperparameter search for the random forest, kept for reference but disabled.
# It uses 2-fold cross-validated weighted F1 on the combined train+dev encodings.
# NOTE(review): for RandomForestClassifier 'auto' was an alias of 'sqrt' and has
# been removed in recent scikit-learn releases, so the max_features axis is
# redundant. Drop 'auto' before re-enabling this cell.
# # Number of trees in random forest
# n_estimators = [10,100,200,500,1000]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# bootstrap = [True, False]

# param_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'bootstrap': bootstrap}
# grid = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1_weighted',cv=2)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

Tuned Hyperparameters obtained from GridSearchCV: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}

#### Model Training

In [29]:
# Tuned hyperparameters recorded from the grid search.
rf_params = {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
# Unpack the dict directly instead of repeating every key, and fix random_state:
# the forest samples features at random for each split, so without a seed every
# run yields a different model and different scores.
random_clf = RandomForestClassifier(
    random_state=42,
    **rf_params,
).fit(train_sentence_vectors, df['Labels'].to_list())

In [30]:
# Score the random forest on the test split.
random_pred = random_clf.predict(test_sentence_vectors)
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those metrics as 0 (the same value the default
# produces) without emitting an UndefinedMetricWarning for every average.
report_random = classification_report(
    df_test['Labels'].to_list(),
    random_pred,
    output_dict=True,
    zero_division=0,
)
df_random = pd.DataFrame(report_random).transpose()
df_random

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.645533,0.981595,0.77886,1141.0
1,0.0,0.0,0.0,58.0
2,0.333333,0.238636,0.278146,88.0
3,0.677966,0.136986,0.22792,292.0
4,0.0,0.0,0.0,56.0
5,0.0,0.0,0.0,70.0
6,0.0,0.0,0.0,95.0
7,0.0,0.0,0.0,57.0
accuracy,0.635972,0.635972,0.635972,0.635972
macro avg,0.207104,0.169652,0.160616,1857.0


## XGBoost


#### Grid Search CV

In [31]:
# Hyperparameter search for XGBoost, kept for reference but disabled.
# It scores candidates by weighted F1 with GridSearchCV's default
# cross-validation on the combined train+dev encodings.
# NOTE(review): no tuned result is recorded for this search, and the model
# trained in the next section uses XGBClassifier defaults — confirm whether the
# search was ever completed.
# xg_grid_params = {
#     'gamma': range(0,100,20),
#     'max_depth': [1,5,10],
#     'lambda':[0.01,0.1,1,10,100],
#     'alpha':[0.001,0.01,0.1,1,10],
#     'learning_rate':[0.01,0.1,1,10]
# }
# warnings.filterwarnings('ignore')
# grid_xg = grid = GridSearchCV(XGBClassifier(), xg_grid_params, scoring='f1_weighted')
# grid_xg.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_xg.best_params_)
# print("Weighted f1_score :",grid_xg.best_score_)

#### Model Training

In [32]:
# Train a gradient-boosted tree classifier with library-default hyperparameters
# on the training encodings.
model_xg = XGBClassifier().fit(
    train_sentence_vectors,
    df['Labels'].to_list(),
)





In [33]:
# Score XGBoost on the test split and show the per-class metrics as a table.
y_preds_xg = model_xg.predict(test_sentence_vectors)
report_xg = classification_report(
    y_true=df_test['Labels'].to_list(),
    y_pred=y_preds_xg,
    output_dict=True,
)
df_xg = pd.DataFrame(report_xg).T
df_xg

Unnamed: 0,precision,recall,f1-score,support
0,0.693088,0.957932,0.804268,1141.0
1,0.666667,0.034483,0.065574,58.0
2,0.321429,0.306818,0.313953,88.0
3,0.5875,0.321918,0.415929,292.0
4,0.5,0.035714,0.066667,56.0
5,0.333333,0.014286,0.027397,70.0
6,0.73913,0.178947,0.288136,95.0
7,0.666667,0.035088,0.066667,57.0
accuracy,0.666667,0.666667,0.666667,0.666667
macro avg,0.563477,0.235648,0.256074,1857.0


## Multi-Layer Perceptron

#### Grid Search CV

In [34]:
# Hyperparameter search for the multi-layer perceptron, kept for reference but
# disabled. It explores the activation function, the solver and the L2 penalty
# (alpha), scored by weighted F1 with GridSearchCV's default cross-validation on
# the combined train+dev encodings. The best parameters from an earlier run are
# recorded right below this cell.
# mlp_params = {
#     'activation':['identity', 'logistic', 'tanh', 'relu'],
#     'solver':['lbfgs', 'sgd', 'adam'],
#     'alpha':[0.0001,0.001,0.01,0.1],
# }
# warnings.filterwarnings('ignore')
# mlp_grid = GridSearchCV(MLPClassifier(), mlp_params, scoring='f1_weighted')
# mlp_grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", mlp_grid.best_params_)
# print("Weighted f1_score :",mlp_grid.best_score_)

Tuned Hyperparameters obtained from grid search : {'activation': 'logistic', 'alpha': 0.1, 'solver': 'adam'}

#### Model Training

In [35]:
# Train the MLP with the tuned hyperparameters.
# random_state is fixed because weight initialisation and mini-batch shuffling
# are random; without a seed the scores change from run to run.
clf = MLPClassifier(
    activation='logistic',
    alpha=0.1,
    solver='adam',
    random_state=42,
).fit(train_sentence_vectors, df['Labels'].to_list())
mlp_predictions = clf.predict(test_sentence_vectors)

In [36]:
# Per-class metrics of the MLP on the test split, shown as a table with one row
# per class and one column per metric.
report_mlp = classification_report(
    y_true=df_test['Labels'].to_list(),
    y_pred=mlp_predictions,
    output_dict=True,
)
df_mlp = pd.DataFrame(report_mlp).T
df_mlp

Unnamed: 0,precision,recall,f1-score,support
0,0.731405,0.930762,0.819128,1141.0
1,0.25,0.017241,0.032258,58.0
2,0.333333,0.284091,0.306748,88.0
3,0.611399,0.40411,0.486598,292.0
4,0.411765,0.125,0.191781,56.0
5,0.3,0.085714,0.133333,70.0
6,0.6875,0.463158,0.553459,95.0
7,0.15625,0.087719,0.11236,57.0
accuracy,0.682822,0.682822,0.682822,0.682822
macro avg,0.435206,0.299724,0.329458,1857.0
