# Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install xgboost



In [2]:
import pandas as pd
import torch
from simpletransformers.language_representation import RepresentationModel
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Loading the datasets

In [3]:
df = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_B_train.tsv',sep='\t')
df_test = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_B_test.csv',sep='\t')
df_eval = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_B_dev.tsv',sep='\t')

In [4]:
df_test

Unnamed: 0,text,category
0,ஹாஹா ஹாஹா ....வந்துடுச்சு 😂😂😂👍👍👍👍👍😉😉😉🙏🙏🙏,மகிழ்ச்சி
1,"உண்மைகள் வெளிவரும் தருணம் இது , தங்களுடைய தேவை...",மகிழ்ச்சி
2,இதற்கு ஒரே தீர்வு...; டிஷ் ஷுக்கு பணம் கட்டுறத...,எதிர்பார்ப்பு
3,மோடி ஆதரவாளர்கள் செய்யும் அட்டூழியம் தாங்க ம...,உண்மையை உணர்தல்
4,முழுசா படிச்சிருக்கேன் அதில் எனக்கு மிகவும் பி...,உண்மையை உணர்தல்
...,...,...
4264,என் நாடு...தமிழ் நாடு....,போற்றுதல்
4265,இல்வாழ்க்கையில் கணவன் மனைவி இருவரும் அன்பு செல...,உண்மையை உணர்தல்
4266,மொழியே தெய்வம் ....,போற்றுதல்
4267,எல்லாம் சரிதான். ஆனால் தலை முடியை பின்னிண்டு ப...,எதிர்பார்ப்பு


In [5]:
df.rename(columns={'category':'Labels','text':'Text'},inplace=True)
df = df[['Text','Labels']]
df_test.rename(columns={'category':'Labels','text':'Text'},inplace=True)
df_test = df_test[['Text','Labels']]
df_eval.rename(columns={'category':'Labels','text':'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]

In [6]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
num_labels

31

In [7]:
df_test = df_test.dropna()
df_test = df_test.reset_index().drop(['index'],axis=1)

# Getting Sentence Encodings with Context using Bert

In [8]:
model = RepresentationModel(
        model_type="bert",
        model_name="bert-base-multilingual-cased",
        use_cuda=True
    )

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTextRepresentation: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [9]:
train_sentence_vectors = model.encode_sentences(df['Text'].to_list(), combine_strategy="mean")

In [10]:
eval_sentences = df_eval['Text'].to_list()
eval_sentence_vectors = model.encode_sentences(eval_sentences, combine_strategy="mean")

In [11]:
test_sentences = df_test['Text'].to_list()
test_sentence_vectors = model.encode_sentences(test_sentences, combine_strategy="mean")

In [12]:
train_sentence_vectors.shape

(30179, 768)

# Model Training

## Logistic Regression

In [13]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
combined_df = pd.concat([df,df_eval],ignore_index=True)

In [15]:
combined_encodings = np.concatenate((train_sentence_vectors, eval_sentence_vectors))
combined_encodings.shape

(34448, 768)

In [16]:
class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = list(df['Labels'].unique()),
                                        y = df['Labels'].to_list()                                                    
                                    )
keys = range(11)
values = class_weights
weights = dict(zip(keys,values))

In [17]:
# warnings.filterwarnings('ignore')
# parameters = {
#     'penalty' : ['l1','l2'], 
#     'C'       : [0.01,0.1,1,10],
#     'solver'  : ['newton-cg', 'lbfgs', 'liblinear']
# }

# logreg = LogisticRegression(class_weight=weights)
# clf = GridSearchCV(logreg,                    
#                    param_grid = parameters,   
#                    scoring='f1_macro'
#                   )
# clf.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", clf.best_params_)
# print("Weighted f1_score :",clf.best_score_)

#### Model Training

In [18]:
lm = LogisticRegression(class_weight=weights,max_iter=2000)
lm.fit(train_sentence_vectors, df['Labels'].to_list())

LogisticRegression(class_weight={0: 0.5253729784307922, 1: 0.4443250246610032,
                                 2: 0.7901916631755341, 3: 0.27822695885460363,
                                 4: 6.622558700899715, 5: 1.1062683284457477,
                                 6: 1.5576258064516129, 7: 2.149042227444278,
                                 8: 1.9587849678717466, 9: 0.20452019517484413,
                                 10: 0.7629436747901709},
                   max_iter=2000)

In [19]:
log_test_preds = lm.predict(test_sentence_vectors)

In [20]:
report_log = classification_report(df_test['Labels'].to_list(),log_test_preds,output_dict=True)

In [21]:
df_log = pd.DataFrame(report_log).transpose()
df_log

Unnamed: 0,precision,recall,f1-score,support
0,0.140845,0.074627,0.097561,268.0
1,0.325714,0.175385,0.228,325.0
2,0.164286,0.130682,0.14557,176.0
3,0.28934,0.119497,0.169139,477.0
4,0.014184,0.142857,0.025806,14.0
5,0.282407,0.535088,0.369697,114.0
6,0.054878,0.113924,0.074074,79.0
7,0.05102,0.178571,0.079365,56.0
8,0.020942,0.051282,0.02974,78.0
9,0.484177,0.232523,0.314168,658.0


## Decision Trees

#### Grid Search CV

In [22]:
# warnings.filterwarnings('ignore')
# params = {'max_leaf_nodes': list(range(2, 100,5)), 'min_samples_split': [2, 3, 4]}
# grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, scoring='f1_macro',verbose=1, cv=5)
# grid_search_cv.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_search_cv.best_params_)
# print("Weighted f1_score :",grid_search_cv.best_score_)

#### Model Training

In [23]:
dtree_model = DecisionTreeClassifier().fit(train_sentence_vectors, df['Labels'].to_list())

In [24]:
tree_preds = dtree_model.predict(test_sentence_vectors)
report = classification_report(df_test['Labels'].to_list(),tree_preds,output_dict=True)
final_report = pd.DataFrame(report).transpose()
final_report

Unnamed: 0,precision,recall,f1-score,support
0,0.068376,0.059701,0.063745,268.0
1,0.102236,0.098462,0.100313,325.0
2,0.04023,0.039773,0.04,176.0
3,0.168675,0.176101,0.172308,477.0
4,0.0,0.0,0.0,14.0
5,0.046053,0.061404,0.052632,114.0
6,0.027027,0.025316,0.026144,79.0
7,0.0,0.0,0.0,56.0
8,0.0,0.0,0.0,78.0
9,0.261654,0.264438,0.263039,658.0


## SVC

#### Grid Search CV

In [25]:
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','linear','poly','sigmoid']}
# grid = GridSearchCV(SVC(), param_grid, scoring='f1_macro',cv=5)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

#### Model Training

In [26]:
svm = SVC(kernel='linear').fit(train_sentence_vectors,df['Labels'].to_list())

In [27]:
svm_preds = svm.predict(test_sentence_vectors)
svm_report = classification_report(df_test['Labels'].to_list(),svm_preds,output_dict=True)
svm_report = pd.DataFrame(svm_report).transpose()
svm_report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.162037,0.130597,0.144628,268.0
1,0.230068,0.310769,0.264398,325.0
2,0.117021,0.0625,0.081481,176.0
3,0.203237,0.473795,0.284456,477.0
4,0.0,0.0,0.0,14.0
5,0.421053,0.421053,0.421053,114.0
6,0.052632,0.012658,0.020408,79.0
7,0.071429,0.035714,0.047619,56.0
8,0.0,0.0,0.0,78.0
9,0.374026,0.656535,0.476558,658.0


## XG Boost


#### Grid SearchCV

In [28]:
# xg_grid_params = {
#     'gamma': range(0,100,20),
#     'max_depth': [1,5,10],
#     'lambda':[0.01,0.1,1,10,100],
#     'alpha':[0.001,0.01,0.1,1,10],
#     'learning_rate':[0.01,0.1,1,10]
# }
# warnings.filterwarnings('ignore')
# grid_xg = grid = GridSearchCV(XGBClassifier(), xg_grid_params, scoring='f1_macro')
# grid_xg.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_xg.best_params_)
# print("Weighted f1_score :",grid_xg.best_score_)

#### Model Training

In [29]:
model_xg = XGBClassifier().fit(train_sentence_vectors, df['Labels'].to_list())





In [30]:
y_preds_xg = model_xg.predict(test_sentence_vectors)
report_xg = classification_report(df_test['Labels'].to_list(),y_preds_xg,output_dict=True)
df_xg = pd.DataFrame(report_xg).transpose()
df_xg

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.121693,0.085821,0.100656,268.0
1,0.222222,0.252308,0.236311,325.0
2,0.148148,0.045455,0.069565,176.0
3,0.194355,0.505241,0.280722,477.0
4,0.0,0.0,0.0,14.0
5,0.492063,0.27193,0.350282,114.0
6,0.166667,0.012658,0.023529,79.0
7,0.0,0.0,0.0,56.0
8,0.0,0.0,0.0,78.0
9,0.322698,0.676292,0.436917,658.0


## Multi Layer Perceptron

#### Grid SearchCV

In [31]:
# mlp_params = {
#     'activation':['identity', 'logistic', 'tanh', 'relu'],
#     'solver':['lbfgs', 'sgd', 'adam'],
#     'alpha':[0.0001,0.001,0.01,0.1],
# }
# warnings.filterwarnings('ignore')
# mlp_grid = GridSearchCV(MLPClassifier(), mlp_params, scoring='f1_macro')
# mlp_grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", mlp_grid.best_params_)
# print("Weighted f1_score :",mlp_grid.best_score_)

#### Model Training

In [32]:
clf = MLPClassifier().fit(train_sentence_vectors, df['Labels'].to_list())
mlp_predictions = clf.predict(test_sentence_vectors)



In [33]:
report_mlp = classification_report(df_test['Labels'].to_list(),mlp_predictions,output_dict=True)
df_mlp = pd.DataFrame(report_mlp).transpose()
df_mlp

Unnamed: 0,precision,recall,f1-score,support
0,0.097902,0.104478,0.101083,268.0
1,0.206897,0.24,0.222222,325.0
2,0.103175,0.073864,0.086093,176.0
3,0.192012,0.262055,0.221631,477.0
4,0.0,0.0,0.0,14.0
5,0.262195,0.377193,0.309353,114.0
6,0.037037,0.025316,0.030075,79.0
7,0.1,0.053571,0.069767,56.0
8,0.025641,0.012821,0.017094,78.0
9,0.37751,0.428571,0.401423,658.0
