## Importing Libraries


In [1]:
!pip -q install simpletransformers



In [2]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Loading Files From Dataset

In [3]:
df = pd.read_csv("../input/abusive-comment-detection/Abusive_Comment_Tamil_train.csv",header=None,sep='\t')
df_eval = pd.read_csv("../input/abusive-comment-detection/Abusive_Comment_Tamil_dev.csv",header=None,sep='\t')
df_test = pd.read_csv("../input/abusive-comment-detection/Abusive_Comment_Tamil_test.csv",header=None,sep='\t')

In [4]:
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df = df[['Text','Labels']]
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]

In [5]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])

## Balancing the imbalanced dataset

In [6]:
def oversample(df):
        classes = df['Labels'].value_counts().to_dict()
        most = max(classes.values())
        classes_list = []
        for key in classes:
            classes_list.append(df[df['Labels'] == key]) 
        classes_sample = []
        for i in range(1,len(classes_list)):
            classes_sample.append(classes_list[i].sample(most, replace=True))
        df_maybe = pd.concat(classes_sample)
        final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
        final_df = final_df.reset_index(drop=True)
        return pd.DataFrame({'Text': final_df['Text'].tolist(), 'Labels': final_df['Labels'].tolist()})

In [7]:
def over_under_sample(df):
  unq_labels = list(set(df['Labels'].tolist()))
  texts = df['Text'].tolist()
  labels = df['Labels'].tolist()
  data_dict = dict()

  for l in unq_labels:
    data_dict[l] = []

  for i in range(len(texts)):
    data_dict[labels[i]].append(texts[i])

  req_len = len(labels)//len(unq_labels)

  for label in data_dict:
    if len(data_dict[label]) > req_len:
      data_dict[label] = data_dict[label][:req_len]

    new_texts = []

    new_labels = []      
    for l in data_dict:
      new_texts += data_dict[l]
      new_labels += [l]*len(data_dict[l])
    return oversample(pd.DataFrame({'Text': new_texts, 'Labels': new_labels}))

In [8]:
df = over_under_sample(df)

## Model Training

In [9]:
model_args = ClassificationArgs()

In [10]:
model_args.overwrite_output_dir=True
model_args.eval_batch_size=8
model_args.train_batch_size=8
model_args.learning_rate=4e-5



In [11]:
model = ClassificationModel(
    'xlmroberta',
    'xlm-roberta-base',
    num_labels=9,
    args=model_args,
    tokenizer_type="xlmroberta",
    tokenizer_name='xlm-roberta-base'
) 

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

  f"use_multiprocessing automatically disabled as {model_type}"


In [12]:
for i in range(0,4):
    !rm -rf /kaggle/working/outputs
    model.train_model(df,eval_data=df_eval,acc=sklearn.metrics.classification_report)
    result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
    for j in result.values():
        print(j)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


0.40742206310288437
              precision    recall  f1-score   support

         0.0       0.26      0.77      0.39        26
         1.0       0.89      0.48      0.63       417
         2.0       0.45      0.62      0.53         8
         3.0       0.51      0.68      0.58       127
         4.0       0.26      0.70      0.38        47
         5.0       0.41      0.40      0.40        48
         6.0       0.31      0.52      0.39        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.54       700
   macro avg       0.39      0.52      0.41       700
weighted avg       0.69      0.54      0.57       700

1.5430842692201787
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

0.42357718967879804
              precision    recall  f1-score   support

         0.0       0.25      0.54      0.34        26
         1.0       0.83      0.65      0.73       417
         2.0       0.44      0.50      0.47         8
         3.0       0.55      0.68      0.61       127
         4.0       0.29      0.47      0.35        47
         5.0       0.46      0.40      0.43        48
         6.0       0.34      0.48      0.40        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.61       700
   macro avg       0.40      0.46      0.42       700
weighted avg       0.67      0.61      0.63       700

2.1226364123550328
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

0.42887757446891567
              precision    recall  f1-score   support

         0.0       0.24      0.38      0.30        26
         1.0       0.83      0.69      0.75       417
         2.0       0.45      0.62      0.53         8
         3.0       0.58      0.69      0.63       127
         4.0       0.26      0.47      0.34        47
         5.0       0.44      0.31      0.37        48
         6.0       0.34      0.44      0.39        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.63       700
   macro avg       0.39      0.45      0.41       700
weighted avg       0.67      0.63      0.64       700

2.298449583690275
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1458 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/700 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/88 [00:00<?, ?it/s]

0.44302570295395155
              precision    recall  f1-score   support

         0.0       0.30      0.38      0.34        26
         1.0       0.81      0.75      0.77       417
         2.0       0.50      0.75      0.60         8
         3.0       0.59      0.68      0.63       127
         4.0       0.31      0.40      0.35        47
         5.0       0.54      0.31      0.39        48
         6.0       0.30      0.40      0.34        25
         7.0       0.00      0.00      0.00         2

    accuracy                           0.65       700
   macro avg       0.42      0.46      0.43       700
weighted avg       0.67      0.65      0.66       700

2.2630368284881115


## Predicting Results

In [13]:
df_test = df_test.dropna()
df_test = df_test.reset_index()
df_test = df_test.drop(['index'],axis=1)

In [14]:
df_test.isnull().sum()

Text      0
Labels    0
dtype: int64

In [15]:
predictions, raw_outputs = model.predict(df_test['Text'].to_list())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [16]:
df_final = df_test.copy()
reverse_label_dict = {v:u for u,v in label_dict.items()}
reverse_label_dict
df_final['Predicted_Labels'] = predictions
df_final['Predicted_Labels'] = df_final['Predicted_Labels'].apply(lambda x:reverse_label_dict[x])
df_final['Labels'] = df_final['Labels'].apply(lambda x:reverse_label_dict[x])
df_final['pid'] = df_final.index
df_final = df_final[['pid','Predicted_Labels','Labels']]

In [17]:
df_final

Unnamed: 0,pid,Predicted_Labels,Labels
0,0,None-of-the-above,Xenophobia
1,1,Xenophobia,Xenophobia
2,2,Misandry,Misandry
3,3,Counter-speech,Counter-speech
4,4,Hope-Speech,None-of-the-above
...,...,...,...
694,694,None-of-the-above,None-of-the-above
695,695,None-of-the-above,None-of-the-above
696,696,Counter-speech,None-of-the-above
697,697,Misandry,Misogyny


In [18]:
score = f1_score(df_final['Labels'],df_final['Predicted_Labels'],average='weighted')
print("The weighted average is:" + str(score))

The weighted average is:0.6574808206508256
