## Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install demoji
!pip -q install bs4



In [2]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import f1_score
import pickle
import sentencepiece as spm
import re
import pdb
from nltk.corpus import words
from bs4 import BeautifulSoup
import nltk
nltk.download('words')
import nltk, string, re, spacy,unicodedata, random
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import ToktokTokenizer
import nltk, string, re, spacy,unicodedata, random

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading the dataset

In [3]:
# The train, dev and test splits are all read as headerless, tab-separated files,
# so the parser options are defined once and shared.
tsv_options = {"header": None, "sep": "\t"}
df = pd.read_csv(
    "../input/abusive-comment-detection/Abusive_Comment_Codemixed_train.csv",
    **tsv_options,
)
df_eval = pd.read_csv(
    "../input/abusive-comment-detection/Abusive_Comment_Codemixed_dev.csv",
    **tsv_options,
)
df_test = pd.read_csv(
    "../input/abusive-comment-detection/Abusive_Comment_Codemixed_test.csv",
    **tsv_options,
)

In [4]:
# Positional columns of the raw files: 0 holds the label, 1 holds the comment text.
column_names = {0: 'Labels', 1: 'Text'}
df = df.rename(columns=column_names)
# The dev and test frames are additionally reordered to (Text, Labels).
df_test = df_test.rename(columns=column_names)[['Text', 'Labels']]
df_eval = df_eval.rename(columns=column_names)[['Text', 'Labels']]

In [5]:
# Discard dev rows that contain any missing value, then renumber the index from 0.
df_eval = df_eval.dropna().reset_index(drop=True)

In [6]:
# Build the label -> integer id mapping from the training split only, numbering
# labels in the order they first appear there.
keys = list(df['Labels'].unique())
num_labels = len(keys)
values = list(range(num_labels))
label_dict = {label: label_id for label, label_id in zip(keys, values)}

# Encode every split with the same mapping. A label that never occurs in the
# training split raises KeyError here.
for frame in (df, df_eval, df_test):
    frame['Labels'] = frame['Labels'].apply(label_dict.__getitem__)

## Preprocessing Data

In [7]:
# Force every Text entry to a plain string so the regex-based cleaning below
# never receives a non-string value (missing values become the string 'nan').
for frame in (df, df_eval, df_test):
    frame['Text'] = frame['Text'].apply(str)

In [8]:
# Character class of code points treated as emoji. It is compiled once here
# rather than on every call. Note that the ranges U+24C2-U+1F251 and
# U+10000-U+10FFFF are very wide, so characters in them that are not emoji
# (for example CJK ideographs) are removed as well.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def deEmojify(string):
    """Return ``string`` with every run of characters in the emoji class deleted.

    Matched characters are removed outright (replaced by the empty string), so
    the text on either side of an emoji is joined without an extra space.
    """
    stripped = _EMOJI_PATTERN.sub(r'', string)
    return str(stripped)

def preprocess(text):
    """Normalise one raw comment into a whitespace-separated string of tokens.

    Steps, in order:
      1. strip HTML markup, keeping only the text content;
      2. delete http/https URLs;
      3. delete emoji (see ``deEmojify``);
      4. turn ``. ' " / - _`` into spaces;
      5. drop runs of digits that follow a space, and turn commas into spaces;
      6. drop isolated single word-characters and trim the ends;
      7. delete the remaining punctuation characters;
      8. tokenize with ``ToktokTokenizer`` and re-join tokens with single spaces.

    Steps 1 and 2 run first on purpose: they can only recognise markup and
    URLs while characters such as ``<``, ``>``, ``/`` and ``.`` are still
    present. Running them after the punctuation passes (as was done before)
    made both steps no-ops.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        The cleaned comment as a ``str``.
    """
    # 1. HTML first, while angle brackets and entities are still intact.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # 2. URLs next, while '://', '.' and '/' are still intact.
    to_remove_url = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                     r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = re.sub(to_remove_url, '', text)

    # 3. Emoji are removed, not translated into words.
    text = deEmojify(text)

    # 4-6. Separator-like punctuation becomes a space so neighbouring words
    # are not glued together; then digits, commas and one-letter words go.
    text = re.sub(r'([\.\'\"\/\-\_\--])', ' ', text)
    text = re.sub(r" \d+", " ", text)
    text = text.replace(",", " ")
    text = re.sub(r'(?:^| )\w(?:$| )', ' ', text).strip()

    # 7. Remaining punctuation is deleted; the lookup set is built once per
    # call instead of once per character.
    punctuation = set('!!"$%&()*+-/:;<=>?[\\]^_{|}~.')
    text = ''.join(ch for ch in text if ch not in punctuation)

    # 8. Tokenize and re-join, which also collapses repeated whitespace.
    tokens = ToktokTokenizer().tokenize(text)
    return ' '.join(token.strip() for token in tokens)

def clean(df):
    df['Text'] = df['Text'].apply(lambda x: preprocess(x))



In [9]:
# Clean the Text column of every split in place with the same pipeline.
for frame in (df, df_eval, df_test):
    clean(frame)

## Balancing the imbalanced dataset

In [10]:
def oversample(df, random_state=None):
    """Balance ``df`` by randomly oversampling every non-majority class.

    Each class other than the most frequent one is resampled with replacement
    to exactly the size of the most frequent class. The majority class rows
    are kept unchanged and placed after the resampled rows.

    Args:
        df: frame with a ``Text`` and a ``Labels`` column.
        random_state: optional seed (or random state object) forwarded to
            ``DataFrame.sample`` for reproducible draws. The same value is
            passed for every class. ``None`` keeps the sampling unseeded.

    Returns:
        A new DataFrame with exactly the columns ``Text`` and ``Labels`` and a
        fresh 0..n-1 index. An empty input yields an empty frame; a
        single-class input is returned with its rows unchanged.
    """
    # value_counts sorts by descending frequency, so position 0 is the majority.
    class_counts = df['Labels'].value_counts()
    if class_counts.empty:
        # Nothing to balance; max() on no classes would otherwise raise.
        return pd.DataFrame({'Text': [], 'Labels': []})

    most = int(class_counts.iloc[0])
    majority_label = class_counts.index[0]

    parts = [
        df[df['Labels'] == label].sample(most, replace=True, random_state=random_state)
        for label in class_counts.index[1:]
    ]
    # Appending the majority last keeps the original row order, and guarantees
    # concat always receives at least one frame (single-class input).
    parts.append(df[df['Labels'] == majority_label])

    final_df = pd.concat(parts, axis=0).reset_index(drop=True)
    return pd.DataFrame({'Text': final_df['Text'].tolist(), 'Labels': final_df['Labels'].tolist()})

In [11]:
def over_under_sample(df):
  """Undersample large classes, then oversample the rest to a balanced frame.

  The target size is ``len(df) // number_of_classes``. Every class with more
  rows than that is cut down to its first ``req_len`` texts; the result is
  then handed to ``oversample`` so the smaller classes are brought up to the
  size of the largest remaining class.

  Previously the rebuild-and-return step sat inside the per-label loop, so the
  function returned after looking at the first label only and every other
  oversized class was left untrimmed.

  Args:
    df: frame with a ``Text`` and a ``Labels`` column.

  Returns:
    A new DataFrame with the columns ``Text`` and ``Labels``. An empty input
    yields an empty frame.
  """
  texts = df['Text'].tolist()
  labels = df['Labels'].tolist()
  unq_labels = list(set(labels))

  if not unq_labels:
    # No rows: avoid dividing by zero classes below.
    return pd.DataFrame({'Text': [], 'Labels': []})

  # Group the texts by label, preserving their original order.
  data_dict = {l: [] for l in unq_labels}
  for text, label in zip(texts, labels):
    data_dict[label].append(text)

  req_len = len(labels) // len(unq_labels)

  # Trim every class first; only then rebuild the frame.
  new_texts = []
  new_labels = []
  for label, label_texts in data_dict.items():
    kept = label_texts[:req_len]  # no-op for classes at or below req_len
    new_texts += kept
    new_labels += [label] * len(kept)

  return oversample(pd.DataFrame({'Text': new_texts, 'Labels': new_labels}))

In [12]:
# Rebalance the training split only; df_eval and df_test keep their original
# class distribution. This rebinds df, so re-running the cell resamples the
# already balanced frame rather than the raw training data.
df = over_under_sample(df)

## Model Training

In [13]:
# Start from the library's default classification arguments; the values that
# differ from the defaults are set in the next cell.
model_args = ClassificationArgs()

In [14]:
# Optimisation settings: the same batch size is used for training and evaluation.
model_args.learning_rate = 4e-5
model_args.train_batch_size = 8
model_args.eval_batch_size = 8

# The training cell calls train_model repeatedly on the same model, so the
# output directory has to be overwritable.
model_args.overwrite_output_dir = True

In [15]:
# Sequence classifier on top of the pretrained xlm-roberta-base checkpoint,
# with the matching tokenizer. The size of the classification head follows
# num_labels, which is computed from the training labels, instead of a
# hard-coded 8 that would silently go stale if the label set changes.
model = ClassificationModel(
    'xlmroberta',
    'xlm-roberta-base',
    num_labels=num_labels,
    args=model_args,
    tokenizer_type="xlmroberta",
    tokenizer_name='xlm-roberta-base'
)

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

  f"use_multiprocessing automatically disabled as {model_type}"


In [16]:
# Ten rounds of train-then-evaluate on the same model object. The progress
# output below shows one epoch per train_model call, so the weights receive
# ten passes over the balanced training frame in total.
# NOTE(review): each round is scored on df_test, so the reports printed below
# are test-set numbers; df_eval is only handed to train_model as eval_data.
for i in range(0,10):
  # Removes /content/outputs before each round. NOTE(review): the data is read
  # from ../input/..., so confirm this path exists in the environment actually
  # used; overwrite_output_dir is already enabled on model_args.
  !rm -rf /content/outputs
  model.train_model(df,eval_data=df_eval,acc=sklearn.metrics.classification_report)
  # Passing classification_report under the keyword `acc` makes the full
  # per-class report one of the entries of `result`.
  result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
  # Print every metric value in `result` (the keys are not printed).
  for j in result.values():
    print(j)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


0.22637984108769924
              precision    recall  f1-score   support

         0.0       0.89      0.41      0.56      1143
         1.0       0.06      0.28      0.10        58
         2.0       0.23      0.58      0.32        88
         3.0       0.34      0.25      0.29       292
         4.0       0.23      0.32      0.27        56
         5.0       0.11      0.34      0.17        70
         6.0       0.14      0.47      0.22        95
         7.0       0.00      0.00      0.00        57

    accuracy                           0.37      1859
   macro avg       0.25      0.33      0.24      1859
weighted avg       0.63      0.37      0.43      1859

1.8418395299256616
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.34558955407967584
              precision    recall  f1-score   support

         0.0       0.93      0.43      0.58      1143
         1.0       0.13      0.36      0.19        58
         2.0       0.23      0.60      0.34        88
         3.0       0.56      0.49      0.52       292
         4.0       0.33      0.52      0.40        56
         5.0       0.12      0.43      0.19        70
         6.0       0.50      0.67      0.58        95
         7.0       0.12      0.49      0.20        57

    accuracy                           0.46      1859
   macro avg       0.37      0.50      0.37      1859
weighted avg       0.72      0.46      0.52      1859

1.5534504480627984
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.40256227433126957
              precision    recall  f1-score   support

         0.0       0.93      0.52      0.66      1143
         1.0       0.20      0.48      0.29        58
         2.0       0.23      0.62      0.34        88
         3.0       0.53      0.63      0.57       292
         4.0       0.30      0.48      0.37        56
         5.0       0.16      0.39      0.23        70
         6.0       0.57      0.69      0.63        95
         7.0       0.15      0.33      0.20        57

    accuracy                           0.54      1859
   macro avg       0.38      0.52      0.41      1859
weighted avg       0.72      0.54      0.58      1859

1.500432232674611
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4101938253228404
              precision    recall  f1-score   support

         0.0       0.93      0.53      0.68      1143
         1.0       0.26      0.41      0.32        58
         2.0       0.26      0.65      0.37        88
         3.0       0.53      0.64      0.58       292
         4.0       0.25      0.46      0.32        56
         5.0       0.16      0.37      0.22        70
         6.0       0.56      0.68      0.61        95
         7.0       0.16      0.39      0.22        57

    accuracy                           0.55      1859
   macro avg       0.39      0.52      0.41      1859
weighted avg       0.72      0.55      0.59      1859

1.715058618349067
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.41416258540991685
              precision    recall  f1-score   support

         0.0       0.94      0.53      0.68      1143
         1.0       0.23      0.40      0.29        58
         2.0       0.24      0.57      0.34        88
         3.0       0.52      0.71      0.60       292
         4.0       0.22      0.43      0.29        56
         5.0       0.17      0.37      0.23        70
         6.0       0.59      0.67      0.63        95
         7.0       0.14      0.33      0.20        57

    accuracy                           0.55      1859
   macro avg       0.38      0.50      0.41      1859
weighted avg       0.72      0.55      0.59      1859

2.113396842592264
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4354153286276742
              precision    recall  f1-score   support

         0.0       0.92      0.58      0.71      1143
         1.0       0.30      0.33      0.31        58
         2.0       0.24      0.62      0.34        88
         3.0       0.50      0.72      0.59       292
         4.0       0.30      0.45      0.36        56
         5.0       0.19      0.43      0.26        70
         6.0       0.68      0.68      0.68        95
         7.0       0.18      0.30      0.23        57

    accuracy                           0.58      1859
   macro avg       0.41      0.51      0.44      1859
weighted avg       0.72      0.58      0.62      1859

2.3318243872198425
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4239879277112292
              precision    recall  f1-score   support

         0.0       0.92      0.55      0.69      1143
         1.0       0.30      0.45      0.36        58
         2.0       0.21      0.57      0.31        88
         3.0       0.54      0.71      0.61       292
         4.0       0.31      0.43      0.36        56
         5.0       0.18      0.46      0.25        70
         6.0       0.52      0.69      0.60        95
         7.0       0.17      0.26      0.21        57

    accuracy                           0.56      1859
   macro avg       0.39      0.51      0.42      1859
weighted avg       0.72      0.56      0.60      1859

2.636651688411256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.43905304594539185
              precision    recall  f1-score   support

         0.0       0.93      0.59      0.72      1143
         1.0       0.32      0.40      0.36        58
         2.0       0.24      0.53      0.33        88
         3.0       0.49      0.75      0.59       292
         4.0       0.27      0.39      0.32        56
         5.0       0.18      0.39      0.25        70
         6.0       0.60      0.66      0.63        95
         7.0       0.22      0.32      0.26        57

    accuracy                           0.59      1859
   macro avg       0.41      0.50      0.43      1859
weighted avg       0.72      0.59      0.62      1859

2.7172461683530154
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.42683876589941555
              precision    recall  f1-score   support

         0.0       0.93      0.54      0.68      1143
         1.0       0.35      0.40      0.37        58
         2.0       0.24      0.55      0.33        88
         3.0       0.45      0.78      0.57       292
         4.0       0.31      0.39      0.35        56
         5.0       0.18      0.40      0.25        70
         6.0       0.59      0.66      0.63        95
         7.0       0.21      0.37      0.27        57

    accuracy                           0.56      1859
   macro avg       0.41      0.51      0.43      1859
weighted avg       0.72      0.56      0.60      1859

3.063638039924556
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.43248013475296093
              precision    recall  f1-score   support

         0.0       0.91      0.59      0.72      1143
         1.0       0.37      0.45      0.40        58
         2.0       0.23      0.44      0.30        88
         3.0       0.47      0.77      0.58       292
         4.0       0.30      0.41      0.35        56
         5.0       0.18      0.41      0.25        70
         6.0       0.73      0.59      0.65        95
         7.0       0.26      0.33      0.29        57

    accuracy                           0.59      1859
   macro avg       0.43      0.50      0.44      1859
weighted avg       0.71      0.59      0.62      1859

3.1699608659027985


## Predicting Test Data

In [17]:
# Predict with the model as it stands after the last training round, on the
# cleaned test texts. `predictions` is mapped back to label names in the next cell.
predictions, raw_outputs = model.predict(df_test['Text'].to_list())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1859 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

In [18]:
# Invert the label encoding so integer ids can be shown as label names again.
reverse_label_dict = {label_id: label for label, label_id in label_dict.items()}

# Work on a copy so df_test keeps its integer-encoded labels.
df_final = df_test.copy()
df_final['Predicted_Labels'] = [reverse_label_dict[label_id] for label_id in predictions]
df_final['Labels'] = [reverse_label_dict[label_id] for label_id in df_final['Labels']]

# pid is the row index of the test frame; keep only the id and the two label columns.
df_final['pid'] = df_final.index
df_final = df_final[['pid', 'Predicted_Labels', 'Labels']]

In [19]:
# Display predicted vs. true label names per test row.
df_final

Unnamed: 0,pid,Predicted_Labels,Labels
0,0,None-of-the-above,None-of-the-above
1,1,Transphobic,Misogyny
2,2,Misandry,Misandry
3,3,None-of-the-above,None-of-the-above
4,4,Misogyny,None-of-the-above
...,...,...,...
1854,1854,Hope-Speech,None-of-the-above
1855,1855,Hope-Speech,None-of-the-above
1856,1856,None-of-the-above,None-of-the-above
1857,1857,Counter-speech,Counter-speech


In [20]:
# Weighted F1 over the test rows, comparing true and predicted label names.
score = f1_score(
    y_true=df_final['Labels'],
    y_pred=df_final['Predicted_Labels'],
    average='weighted',
)
print("The weighted f1_score is:", score, sep='')

The weighted f1_score is:0.6204215433815952
