## Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install demoji
!pip -q install bs4



In [2]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import f1_score
import pickle
import sentencepiece as spm
import re
import pdb
from nltk.corpus import words
from bs4 import BeautifulSoup
import nltk
nltk.download('words')
import nltk, string, re, spacy,unicodedata, random
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import ToktokTokenizer
import nltk, string, re, spacy,unicodedata, random

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading the dataset

In [3]:
df = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Codemixed_train.csv",header=None,sep='\t')
df_eval = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Codemixed_dev.csv",header=None,sep='\t')
df_test = pd.read_csv("../input/abusive-commentdetection/Abusive_Comment_Codemixed_test.csv",header=None,sep='\t')

In [4]:
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]

In [5]:
df_eval = df_eval.dropna()
df_eval = df_eval.reset_index().drop(['index'],axis=1)

In [6]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])

## Preprocessing Data

In [7]:
df['Text'] = df['Text'].apply(str)
df_eval['Text'] = df_eval['Text'].apply(str)
df_test['Text'] = df_test['Text'].apply(str)

In [8]:
def deEmojify(string):
    emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  # emoticons
                              u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                              u"\U0001F680-\U0001F6FF"  # transport & map symbols
                              u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                              u"\U00002702-\U000027B0"
                              u"\U00002702-\U000027B0"
                              u"\U000024C2-\U0001F251"
                              u"\U0001f926-\U0001f937"
                              u"\U00010000-\U0010ffff"
                              u"\u2640-\u2642"
                              u"\u2600-\u2B55"
                              u"\u200d"
                              u"\u23cf"
                              u"\u23e9"
                              u"\u231a"
                              u"\ufe0f"  # dingbats
                              u"\u3030"
                              "]+", flags=re.UNICODE)
    return str(emoji_pattern.sub(r'', string))

def preprocess(text):
    text = deEmojify(text) #convert emojis to their defns in words, they might be useful
    text = re.sub(r'([\.\'\"\/\-\_\--])',' ', text) # remove punctuations , removes @USER / some abbreviatins
    to_remove_url = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
      '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = re.sub(to_remove_url,'',text)  # remove url patterns
    text = re.sub(" \d+", " ", text)
    text = text.replace(","," ")
    text = re.sub(r'(?:^| )\w(?:$| )', ' ', text).strip()
    punctuation='!!"$%&()*+-/:;<=>?[\\]^_{|}~.'
    text = ''.join(ch for ch in text if ch not in set(punctuation))
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Stopword Removing
    tokenizer = ToktokTokenizer()
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    text = ' '.join(ch for ch in tokens)
    return text 

def clean(df):
    df['Text'] = df['Text'].apply(lambda x: preprocess(x))



In [9]:
clean(df)
clean(df_eval)
clean(df_test)

## Balancing the imbalanced dataset

In [10]:
def oversample(df):
        classes = df['Labels'].value_counts().to_dict()
        most = max(classes.values())
        classes_list = []
        for key in classes:
            classes_list.append(df[df['Labels'] == key]) 
        classes_sample = []
        for i in range(1,len(classes_list)):
            classes_sample.append(classes_list[i].sample(most, replace=True))
        df_maybe = pd.concat(classes_sample)
        final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
        final_df = final_df.reset_index(drop=True)
        return pd.DataFrame({'Text': final_df['Text'].tolist(), 'Labels': final_df['Labels'].tolist()})

In [11]:
def over_under_sample(df):
  unq_labels = list(set(df['Labels'].tolist()))
  texts = df['Text'].tolist()
  labels = df['Labels'].tolist()
  data_dict = dict()

  for l in unq_labels:
    data_dict[l] = []

  for i in range(len(texts)):
    #print(labels[i])
    data_dict[labels[i]].append(texts[i])

  req_len = len(labels)//len(unq_labels)

  for label in data_dict:
    if len(data_dict[label]) > req_len:
      data_dict[label] = data_dict[label][:req_len]

    new_texts = []

    new_labels = []      
    for l in data_dict:
      new_texts += data_dict[l]
      new_labels += [l]*len(data_dict[l])
    return oversample(pd.DataFrame({'Text': new_texts, 'Labels': new_labels}))

In [12]:
df = over_under_sample(df)

## Model Training

In [13]:
model_args = ClassificationArgs()

In [14]:
model_args.overwrite_output_dir=True
model_args.learning_rate=4e-5
model_args.eval_batch_size=8
model_args.train_batch_size=8

In [15]:
model = ClassificationModel(
    'bert',
    'bert-base-multilingual-cased',
    num_labels=8,
    args=model_args,
    tokenizer_type="bert",
    tokenizer_name='bert-base-multilingual-cased'
) 

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [16]:
for i in range(0,10):
  !rm -rf /content/outputs
  model.train_model(df,eval_data=df_eval,acc=sklearn.metrics.classification_report)
  result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
  for j in result.values():
    print(j)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.3828322215813574
              precision    recall  f1-score   support

         0.0       0.93      0.52      0.67      1143
         1.0       0.11      0.36      0.16        58
         2.0       0.29      0.52      0.38        88
         3.0       0.54      0.56      0.55       292
         4.0       0.30      0.52      0.38        56
         5.0       0.16      0.41      0.23        70
         6.0       0.48      0.71      0.57        95
         7.0       0.12      0.32      0.18        57

    accuracy                           0.52      1859
   macro avg       0.37      0.49      0.39      1859
weighted avg       0.72      0.52      0.57      1859

1.4980980321573085


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.403122815356512
              precision    recall  f1-score   support

         0.0       0.89      0.57      0.69      1143
         1.0       0.24      0.41      0.31        58
         2.0       0.28      0.50      0.36        88
         3.0       0.47      0.68      0.55       292
         4.0       0.32      0.52      0.40        56
         5.0       0.17      0.37      0.23        70
         6.0       0.51      0.64      0.57        95
         7.0       0.17      0.28      0.21        57

    accuracy                           0.56      1859
   macro avg       0.38      0.50      0.42      1859
weighted avg       0.69      0.56      0.60      1859

1.7266943071021543


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.41253720783645237
              precision    recall  f1-score   support

         0.0       0.90      0.55      0.68      1143
         1.0       0.25      0.45      0.32        58
         2.0       0.28      0.55      0.37        88
         3.0       0.44      0.72      0.55       292
         4.0       0.35      0.45      0.39        56
         5.0       0.20      0.40      0.26        70
         6.0       0.67      0.63      0.65        95
         7.0       0.18      0.33      0.23        57

    accuracy                           0.56      1859
   macro avg       0.41      0.51      0.43      1859
weighted avg       0.70      0.56      0.60      1859

2.370775968048226


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4289622224456651
              precision    recall  f1-score   support

         0.0       0.89      0.58      0.70      1143
         1.0       0.27      0.40      0.32        58
         2.0       0.26      0.47      0.33        88
         3.0       0.48      0.74      0.58       292
         4.0       0.31      0.50      0.38        56
         5.0       0.25      0.40      0.30        70
         6.0       0.60      0.68      0.64        95
         7.0       0.23      0.44      0.30        57

    accuracy                           0.58      1859
   macro avg       0.41      0.53      0.45      1859
weighted avg       0.70      0.58      0.61      1859

2.463702194040937


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4382409241485106
              precision    recall  f1-score   support

         0.0       0.90      0.58      0.71      1143
         1.0       0.31      0.38      0.34        58
         2.0       0.30      0.57      0.40        88
         3.0       0.43      0.77      0.55       292
         4.0       0.37      0.46      0.41        56
         5.0       0.23      0.37      0.28        70
         6.0       0.59      0.72      0.65        95
         7.0       0.31      0.32      0.31        57

    accuracy                           0.59      1859
   macro avg       0.43      0.52      0.46      1859
weighted avg       0.70      0.59      0.62      1859

2.645342553809861


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4534160349787628
              precision    recall  f1-score   support

         0.0       0.90      0.60      0.72      1143
         1.0       0.33      0.41      0.37        58
         2.0       0.29      0.53      0.38        88
         3.0       0.51      0.74      0.60       292
         4.0       0.31      0.41      0.35        56
         5.0       0.28      0.49      0.35        70
         6.0       0.56      0.75      0.64        95
         7.0       0.23      0.49      0.31        57

    accuracy                           0.60      1859
   macro avg       0.43      0.55      0.47      1859
weighted avg       0.71      0.60      0.63      1859

2.802220762415505


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.44688442168292647
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72      1143
         1.0       0.29      0.34      0.31        58
         2.0       0.26      0.44      0.33        88
         3.0       0.49      0.78      0.60       292
         4.0       0.33      0.45      0.38        56
         5.0       0.21      0.47      0.29        70
         6.0       0.74      0.66      0.70        95
         7.0       0.31      0.42      0.36        57

    accuracy                           0.61      1859
   macro avg       0.44      0.52      0.46      1859
weighted avg       0.71      0.61      0.63      1859

2.7893874438023873


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.44450365126657937
              precision    recall  f1-score   support

         0.0       0.91      0.58      0.71      1143
         1.0       0.34      0.31      0.32        58
         2.0       0.27      0.47      0.34        88
         3.0       0.44      0.83      0.58       292
         4.0       0.39      0.52      0.44        56
         5.0       0.22      0.36      0.27        70
         6.0       0.58      0.66      0.62        95
         7.0       0.28      0.40      0.33        57

    accuracy                           0.59      1859
   macro avg       0.43      0.52      0.45      1859
weighted avg       0.71      0.59      0.62      1859

3.0492173847082857


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.4542623905193891
              precision    recall  f1-score   support

         0.0       0.87      0.63      0.73      1143
         1.0       0.44      0.41      0.42        58
         2.0       0.30      0.41      0.35        88
         3.0       0.48      0.76      0.59       292
         4.0       0.41      0.54      0.47        56
         5.0       0.20      0.43      0.27        70
         6.0       0.62      0.67      0.65        95
         7.0       0.38      0.46      0.41        57

    accuracy                           0.62      1859
   macro avg       0.46      0.54      0.49      1859
weighted avg       0.70      0.62      0.64      1859

2.7968448778462514


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6640 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/830 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1859 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/233 [00:00<?, ?it/s]

0.43037061380601177
              precision    recall  f1-score   support

         0.0       0.90      0.57      0.70      1143
         1.0       0.35      0.47      0.40        58
         2.0       0.25      0.55      0.35        88
         3.0       0.45      0.74      0.56       292
         4.0       0.41      0.46      0.44        56
         5.0       0.19      0.34      0.25        70
         6.0       0.54      0.73      0.62        95
         7.0       0.26      0.35      0.30        57

    accuracy                           0.58      1859
   macro avg       0.42      0.53      0.45      1859
weighted avg       0.70      0.58      0.61      1859

3.331251514039068


## Predicting Test Data

In [17]:
predictions, raw_outputs = model.predict(df_test['Text'].to_list())

  0%|          | 0/1859 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

In [18]:
df_final = df_test.copy()
reverse_label_dict = {v:u for u,v in label_dict.items()}
df_final['Predicted_Labels'] = predictions
df_final['Predicted_Labels'] = df_final['Predicted_Labels'].apply(lambda x:reverse_label_dict[x])
df_final['Labels'] = df_final['Labels'].apply(lambda x:reverse_label_dict[x])
df_final['pid'] = df_final.index
df_final = df_final[['pid','Predicted_Labels','Labels']]

In [19]:
df_final

Unnamed: 0,pid,Predicted_Labels,Labels
0,0,Misogyny,None-of-the-above
1,1,Misandry,Misogyny
2,2,Misandry,Misandry
3,3,None-of-the-above,None-of-the-above
4,4,None-of-the-above,None-of-the-above
...,...,...,...
1854,1854,None-of-the-above,None-of-the-above
1855,1855,Hope-Speech,None-of-the-above
1856,1856,None-of-the-above,None-of-the-above
1857,1857,Counter-speech,Counter-speech


In [20]:
score = f1_score(df_final['Labels'],df_final['Predicted_Labels'],average='weighted')
print("The weighted f1_score is:" + str(score))

The weighted f1_score is:0.6099557889352221
