## Importing Libraries


In [1]:
!pip -q install simpletransformers



In [2]:
!pip -q install bs4



In [3]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Loading Files From Dataset

In [4]:
df = pd.read_csv("../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_train.csv",header=None,sep='\t')
df_eval = pd.read_csv("../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_dev.csv",header=None,sep='\t')
df_test = pd.read_csv("../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_test.csv",header=None,sep='\t')

In [5]:
df

Unnamed: 0,0,1
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...
...,...,...
14203,Trust,பெ மணியரசன் கூறுவதைஉணர்ந்து. செயலாற்றுவதேஇன்ற...
14204,Ambiguous,இன்னும் எத்தன நாள் வச்சி செய்வீங்க.
14205,Anticipation,அடுத்த ஏதோ தயார்பன்னிட்டான்
14206,Ambiguous,தமிழ் மற்றும் சமஸ்கிருதம்


In [6]:
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df = df[['Text','Labels']]
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]

In [7]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])
num_labels

11

## Balancing the imbalanced dataset

In [8]:
def oversample(df):
        classes = df['Labels'].value_counts().to_dict()
        most = max(classes.values())
        classes_list = []
        for key in classes:
            classes_list.append(df[df['Labels'] == key]) 
        classes_sample = []
        for i in range(1,len(classes_list)):
            classes_sample.append(classes_list[i].sample(most, replace=True))
        df_maybe = pd.concat(classes_sample)
        final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
        final_df = final_df.reset_index(drop=True)
        return pd.DataFrame({'Text': final_df['Text'].tolist(), 'Labels': final_df['Labels'].tolist()})

In [9]:
def over_under_sample(df):
  unq_labels = list(set(df['Labels'].tolist()))
  texts = df['Text'].tolist()
  labels = df['Labels'].tolist()
  data_dict = dict()

  for l in unq_labels:
    data_dict[l] = []

  for i in range(len(texts)):
    data_dict[labels[i]].append(texts[i])

  req_len = len(labels)//len(unq_labels)

  for label in data_dict:
    if len(data_dict[label]) > req_len:
      data_dict[label] = data_dict[label][:req_len]

    new_texts = []

    new_labels = []      
    for l in data_dict:
      new_texts += data_dict[l]
      new_labels += [l]*len(data_dict[l])
    return oversample(pd.DataFrame({'Text': new_texts, 'Labels': new_labels}))

In [10]:
df = over_under_sample(df)

## Model Training

In [11]:
model_args = ClassificationArgs()

In [12]:
model_args.overwrite_output_dir=True
model_args.eval_batch_size=8
model_args.train_batch_size=8
model_args.learning_rate=4e-5

In [13]:
model = ClassificationModel(
    'bert',
    'bert-base-multilingual-cased',
    num_labels=11,
    args=model_args,
    tokenizer_type="bert",
    tokenizer_name='bert-base-multilingual-cased'
) 

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [14]:
for i in range(0,3):
    !rm -rf /kaggle/working/outputs
    model.train_model(df,eval_data=df_eval,acc=sklearn.metrics.classification_report)
    result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
    for j in result.values():
        print(j)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/23474 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2935 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/4440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/555 [00:00<?, ?it/s]

0.22021278721250007
              precision    recall  f1-score   support

         0.0       0.52      0.08      0.13      1538
         1.0       0.13      0.18      0.15       244
         2.0       0.59      0.52      0.55       702
         3.0       0.16      0.19      0.17       277
         4.0       0.21      0.26      0.23       377
         5.0       0.21      0.52      0.30       271
         6.0       0.50      0.53      0.52       500
         7.0       0.15      0.30      0.20       196
         8.0       0.04      0.18      0.06        61
         9.0       0.16      0.33      0.21       241
        10.0       0.14      0.30      0.19        33

    accuracy                           0.28      4440
   macro avg       0.25      0.31      0.25      4440
weighted avg       0.39      0.28      0.27      4440

1.9983677322800095


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/23474 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2935 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/4440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/555 [00:00<?, ?it/s]

0.25098483174263875
              precision    recall  f1-score   support

         0.0       0.54      0.21      0.30      1538
         1.0       0.18      0.26      0.21       244
         2.0       0.56      0.52      0.54       702
         3.0       0.16      0.19      0.18       277
         4.0       0.25      0.30      0.27       377
         5.0       0.23      0.48      0.31       271
         6.0       0.50      0.55      0.53       500
         7.0       0.15      0.27      0.20       196
         8.0       0.06      0.10      0.07        61
         9.0       0.20      0.39      0.26       241
        10.0       0.25      0.18      0.21        33

    accuracy                           0.33      4440
   macro avg       0.28      0.31      0.28      4440
weighted avg       0.41      0.33      0.34      4440

2.058903288626456


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/23474 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2935 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/4440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/555 [00:00<?, ?it/s]

0.2549884977734863
              precision    recall  f1-score   support

         0.0       0.53      0.19      0.28      1538
         1.0       0.18      0.27      0.22       244
         2.0       0.54      0.59      0.56       702
         3.0       0.15      0.20      0.17       277
         4.0       0.21      0.35      0.26       377
         5.0       0.24      0.41      0.30       271
         6.0       0.50      0.56      0.53       500
         7.0       0.19      0.25      0.22       196
         8.0       0.04      0.05      0.05        61
         9.0       0.23      0.34      0.28       241
        10.0       0.25      0.27      0.26        33

    accuracy                           0.34      4440
   macro avg       0.28      0.32      0.28      4440
weighted avg       0.40      0.34      0.34      4440

2.3593938445185754


In [15]:
predictions, raw_outputs = model.predict(df_test['Text'].to_list())

  0%|          | 0/4440 [00:00<?, ?it/s]

  0%|          | 0/555 [00:00<?, ?it/s]

In [16]:
df_final = df_test.copy()
reverse_label_dict = {v:u for u,v in label_dict.items()}
reverse_label_dict
df_final['Predicted_Labels'] = predictions
df_final['Predicted_Labels'] = df_final['Predicted_Labels'].apply(lambda x:reverse_label_dict[x])
df_final['Labels'] = df_final['Labels'].apply(lambda x:reverse_label_dict[x])
df_final['pid'] = df_final.index
df_final = df_final[['pid','Predicted_Labels','Labels']]

In [17]:
df_final

Unnamed: 0,pid,Predicted_Labels,Labels
0,0,Ambiguous,Ambiguous
1,1,Anger,Disguist
2,2,Ambiguous,Disguist
3,3,Ambiguous,Ambiguous
4,4,Joy,Joy
...,...,...,...
4435,4435,Neutral,Neutral
4436,4436,Anticipation,Trust
4437,4437,Trust,Anticipation
4438,4438,Joy,Joy


In [18]:
score = f1_score(df_final['Labels'],df_final['Predicted_Labels'],average='macro')
print("The macro average f1 score is:" + str(score))

The macro average f1 score is:0.2842712661973753
