## Importing Libraries


In [1]:
!pip -q install simpletransformers



In [2]:
!pip -q install bs4



In [3]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Loading Files From Dataset

In [4]:
df = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Tamil_train.csv",header=None,sep='\t')
df_eval = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Tamil_dev.csv",header=None,sep='\t')
df_test = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Tamil_test.csv",header=None,sep='\t')

In [5]:
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df = df[['Text','Labels']]
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]

In [6]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])
num_labels

9

## Balancing the imbalanced dataset

In [7]:
def oversample(df):
        classes = df['Labels'].value_counts().to_dict()
        most = max(classes.values())
        classes_list = []
        for key in classes:
            classes_list.append(df[df['Labels'] == key]) 
        classes_sample = []
        for i in range(1,len(classes_list)):
            classes_sample.append(classes_list[i].sample(most, replace=True))
        df_maybe = pd.concat(classes_sample)
        final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
        final_df = final_df.reset_index(drop=True)
        return pd.DataFrame({'Text': final_df['Text'].tolist(), 'Labels': final_df['Labels'].tolist()})

In [8]:
def over_under_sample(df):
  unq_labels = list(set(df['Labels'].tolist()))
  texts = df['Text'].tolist()
  labels = df['Labels'].tolist()
  data_dict = dict()

  for l in unq_labels:
    data_dict[l] = []

  for i in range(len(texts)):
    data_dict[labels[i]].append(texts[i])

  req_len = len(labels)//len(unq_labels)

  for label in data_dict:
    if len(data_dict[label]) > req_len:
      data_dict[label] = data_dict[label][:req_len]

    new_texts = []

    new_labels = []      
    for l in data_dict:
      new_texts += data_dict[l]
      new_labels += [l]*len(data_dict[l])
    return oversample(pd.DataFrame({'Text': new_texts, 'Labels': new_labels}))

In [9]:
df = over_under_sample(df)

## Model Training

In [10]:
model_args = ClassificationArgs()

In [11]:
model_args.overwrite_output_dir=True
model_args.eval_batch_size=8
model_args.train_batch_size=8
model_args.learning_rate=4e-5



In [12]:
model = ClassificationModel(
    'bert',
    'bert-base-multilingual-cased',
    num_labels=9,
    args=model_args,
    tokenizer_type="bert",
    tokenizer_name='bert-base-multilingual-cased'
) 

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [13]:
for i in range(0,5):
    !rm -rf /kaggle/working/outputs
    model.train_model(df,eval_data=df_eval,acc=sklearn.metrics.classification_report)
    result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
    for j in result.values():
        print(j)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11664 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.41328591143403637
              precision    recall  f1-score   support

         0.0       0.17      0.27      0.21        26
         1.0       0.82      0.66      0.73       417
         2.0       0.83      0.62      0.71         8
         3.0       0.53      0.63      0.58       127
         4.0       0.30      0.49      0.37        47
         5.0       0.50      0.50      0.50        48
         6.0       0.27      0.48      0.34        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.61       700
   macro avg       0.43      0.46      0.43       700
weighted avg       0.67      0.61      0.63       700

1.5737346898425708


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11664 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.4531774914943212
              precision    recall  f1-score   support

         0.0       0.20      0.23      0.21        26
         1.0       0.80      0.77      0.78       417
         2.0       0.71      0.62      0.67         8
         3.0       0.63      0.65      0.64       127
         4.0       0.30      0.47      0.37        47
         5.0       0.61      0.42      0.49        48
         6.0       0.43      0.36      0.39        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.67       700
   macro avg       0.46      0.44      0.44       700
weighted avg       0.68      0.67      0.67       700

1.9094213890758427


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11664 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.4532032295419347
              precision    recall  f1-score   support

         0.0       0.33      0.23      0.27        26
         1.0       0.79      0.79      0.79       417
         2.0       0.57      0.50      0.53         8
         3.0       0.66      0.65      0.65       127
         4.0       0.29      0.45      0.35        47
         5.0       0.58      0.44      0.50        48
         6.0       0.33      0.28      0.30        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.67       700
   macro avg       0.44      0.42      0.42       700
weighted avg       0.68      0.67      0.67       700

2.124491932547905


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11664 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

0.45201062482925597
              precision    recall  f1-score   support

         0.0       0.29      0.23      0.26        26
         1.0       0.77      0.81      0.79       417
         2.0       0.71      0.62      0.67         8
         3.0       0.63      0.61      0.62       127
         4.0       0.37      0.40      0.38        47
         5.0       0.54      0.44      0.48        48
         6.0       0.43      0.36      0.39        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.68       700
   macro avg       0.47      0.43      0.45       700
weighted avg       0.67      0.68      0.67       700

2.264978091147813


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11664 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

0.4092632262453872
              precision    recall  f1-score   support

         0.0       0.25      0.27      0.26        26
         1.0       0.75      0.80      0.77       417
         2.0       0.71      0.62      0.67         8
         3.0       0.63      0.61      0.62       127
         4.0       0.27      0.26      0.26        47
         5.0       0.53      0.42      0.47        48
         6.0       0.29      0.20      0.24        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.66       700
   macro avg       0.43      0.40      0.41       700
weighted avg       0.64      0.66      0.65       700

2.3778684640472587


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Results

In [14]:
df_test = df_test.dropna()
df_test = df_test.reset_index().drop(['index'],axis=1)

In [15]:
predictions, raw_outputs = model.predict(df_test['Text'].to_list())

  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [16]:
df_final = df_test.copy()
reverse_label_dict = {v:u for u,v in label_dict.items()}
reverse_label_dict
df_final['Predicted_Labels'] = predictions
df_final['Predicted_Labels'] = df_final['Predicted_Labels'].apply(lambda x:reverse_label_dict[x])
df_final['Labels'] = df_final['Labels'].apply(lambda x:reverse_label_dict[x])
df_final['pid'] = df_final.index
df_final = df_final[['pid','Predicted_Labels','Labels']]

In [17]:
df_final

Unnamed: 0,pid,Predicted_Labels,Labels
0,0,None-of-the-above,Xenophobia
1,1,Xenophobia,Xenophobia
2,2,Misandry,Misandry
3,3,None-of-the-above,Counter-speech
4,4,None-of-the-above,None-of-the-above
...,...,...,...
694,694,None-of-the-above,None-of-the-above
695,695,None-of-the-above,None-of-the-above
696,696,Counter-speech,None-of-the-above
697,697,Xenophobia,Misogyny


In [18]:
score = f1_score(df_final['Labels'],df_final['Predicted_Labels'],average='weighted')
print("The weighted average is:" + str(score))

The weighted average is:0.6484475008177113
