# Importing Libraries

In [1]:
!pip -q install simpletransformers
!pip -q install xgboost



In [2]:
import pandas as pd
import torch
from simpletransformers.language_representation import RepresentationModel
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Loading the datasets

In [3]:
df = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_train.csv',sep='\t',header=None)
df_test = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_test.csv',sep='\t',header=None)
df_eval = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_A_dev.csv',sep='\t',header=None)
df_b = pd.read_csv('../input/emotion-analysis-in-tamil/Emotion_Analysis_Task_B_train.tsv',sep='\t',header=None)

In [4]:
df.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_test = df_test[['Text','Labels']]
df_eval.rename(columns={0:'Labels',1:'Text'},inplace=True)
df_eval = df_eval[['Text','Labels']]

In [5]:
num_labels = len(df['Labels'].unique())
keys = list(df['Labels'].unique())
values = list(range(0, num_labels))
label_dict = dict(zip(keys,values))
df['Labels'] = df['Labels'].apply(lambda x:label_dict[x])
df_test['Labels'] = df_test['Labels'].apply(lambda x:label_dict[x])
df_eval['Labels'] = df_eval['Labels'].apply(lambda x:label_dict[x])
num_labels

11

In [6]:
df_test = df_test.dropna()
df_test = df_test.reset_index().drop(['index'],axis=1)

# Getting Sentence Encodings with Context using Bert

In [7]:
model = RepresentationModel(
        model_type="bert",
        model_name="bert-base-multilingual-cased",
        use_cuda=True
    )

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTextRepresentation: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [8]:
train_sentence_vectors = model.encode_sentences(df['Text'].to_list(), combine_strategy="mean")

In [9]:
eval_sentences = df_eval['Text'].to_list()
eval_sentence_vectors = model.encode_sentences(eval_sentences, combine_strategy="mean")

In [10]:
test_sentences = df_test['Text'].to_list()
test_sentence_vectors = model.encode_sentences(test_sentences, combine_strategy="mean")

In [11]:
train_sentence_vectors.shape

(14208, 768)

# Model Training

## Logistic Regression

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Grid Search CV

In [13]:
combined_df = pd.concat([df,df_eval],ignore_index=True)

In [14]:
combined_encodings = np.concatenate((train_sentence_vectors, eval_sentence_vectors))
combined_encodings.shape

(17760, 768)

In [15]:
# warnings.filterwarnings('ignore')
# parameters = {
#     'penalty' : ['l1','l2'], 
#     'C'       : np.logspace(-3,3,7),
#     'solver'  : ['newton-cg', 'lbfgs', 'liblinear']
# }

# logreg = LogisticRegression()
# clf = GridSearchCV(logreg,                    
#                    param_grid = parameters,   
#                    scoring='f1_macro',        
#                    cv=10)
# clf.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", clf.best_params_)
# print("Weighted f1_score :",clf.best_score_)

#### Model Training

In [16]:
class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = list(df['Labels'].unique()),
                                        y = df['Labels'].to_list()                                                    
                                    )

In [17]:
keys = range(11)
values = class_weights
weights = dict(zip(keys,values))

In [18]:
lm = LogisticRegression(C=0.005,multi_class='ovr', solver='liblinear',class_weight=weights)
lm.fit(train_sentence_vectors, df['Labels'].to_list())

LogisticRegression(C=0.005,
                   class_weight={0: 0.26681189085650975, 1: 1.5487246566383257,
                                 2: 0.6052654000170401, 3: 1.4193806193806193,
                                 4: 1.0300130491518051, 5: 1.5599472990777339,
                                 6: 0.7647343775230099, 7: 1.9135353535353536,
                                 8: 5.2082111436950145, 9: 1.858469587965991,
                                 10: 12.916363636363636},
                   multi_class='ovr', solver='liblinear')

In [19]:
log_test_preds = lm.predict(test_sentence_vectors)

In [20]:
report_log = classification_report(df_test['Labels'].to_list(),log_test_preds,output_dict=True)

In [21]:
df_log = pd.DataFrame(report_log).transpose()
df_log

Unnamed: 0,precision,recall,f1-score,support
0,0.480119,0.314044,0.379717,1538.0
1,0.159383,0.254098,0.195893,244.0
2,0.457421,0.535613,0.493438,702.0
3,0.158879,0.122744,0.138493,277.0
4,0.231293,0.180371,0.202683,377.0
5,0.191617,0.354244,0.248705,271.0
6,0.41129,0.408,0.409639,500.0
7,0.137931,0.142857,0.140351,196.0
8,0.058824,0.114754,0.077778,61.0
9,0.155556,0.145228,0.150215,241.0


## Decision Trees

#### Grid Search CV

In [22]:
# warnings.filterwarnings('ignore')
# params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
# grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, scoring='f1_macro',verbose=1, cv=5)
# grid_search_cv.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_search_cv.best_params_)
# print("Weighted f1_score :",grid_search_cv.best_score_)

#### Model Training

In [23]:
dtree_model = DecisionTreeClassifier().fit(train_sentence_vectors, df['Labels'].to_list())

In [24]:
tree_preds = dtree_model.predict(test_sentence_vectors)
report = classification_report(df_test['Labels'].to_list(),tree_preds,output_dict=True)
final_report = pd.DataFrame(report).transpose()
final_report

Unnamed: 0,precision,recall,f1-score,support
0,0.362379,0.340702,0.351206,1538.0
1,0.0625,0.065574,0.064,244.0
2,0.281831,0.280627,0.281228,702.0
3,0.070707,0.075812,0.073171,277.0
4,0.091644,0.090186,0.090909,377.0
5,0.077441,0.084871,0.080986,271.0
6,0.183824,0.2,0.191571,500.0
7,0.070652,0.066327,0.068421,196.0
8,0.023256,0.032787,0.027211,61.0
9,0.04721,0.045643,0.046414,241.0


## SVC

#### Grid Search CV

In [25]:
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','linear','poly','sigmoid']}
# grid = GridSearchCV(SVC(), param_grid, scoring='f1_macro',cv=5)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

#### Model Training

In [26]:
svm = SVC(kernel='linear').fit(train_sentence_vectors,df['Labels'].to_list())

In [27]:
svm_preds = svm.predict(test_sentence_vectors)
svm_report = classification_report(df_test['Labels'].to_list(),svm_preds,output_dict=True)
svm_report = pd.DataFrame(svm_report).transpose()
svm_report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.396209,0.815345,0.533277,1538.0
1,0.277778,0.020492,0.038168,244.0
2,0.5088,0.452991,0.479277,702.0
3,0.0,0.0,0.0,277.0
4,0.291262,0.079576,0.125,377.0
5,0.287234,0.099631,0.147945,271.0
6,0.475783,0.334,0.392479,500.0
7,0.166667,0.035714,0.058824,196.0
8,0.0,0.0,0.0,61.0
9,0.294118,0.041494,0.072727,241.0


## Random Forest

####  Grid Search CV

In [28]:
# # Number of trees in random forest
# n_estimators = [10,100,200,500,1000]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# bootstrap = [True, False]

# param_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'bootstrap': bootstrap}
# grid = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1_macro',cv=2)
# grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid.best_params_)
# print("Weighted f1_score :",grid.best_score_)

Tuned Hyperparameters obtained from GridSearchCV: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}

#### Model Training

In [29]:
rf_params = {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
random_clf = RandomForestClassifier().fit(train_sentence_vectors, df['Labels'].to_list())

In [30]:
random_pred = random_clf.predict(test_sentence_vectors)
report_random = classification_report(df_test['Labels'].to_list(),random_pred,output_dict=True)
df_random = pd.DataFrame(report_random).transpose()
df_random

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.372939,0.926528,0.531816,1538.0
1,0.0,0.0,0.0,244.0
2,0.490196,0.320513,0.387597,702.0
3,0.0,0.0,0.0,277.0
4,0.4,0.01061,0.020672,377.0
5,0.0,0.0,0.0,271.0
6,0.604027,0.18,0.27735,500.0
7,0.0,0.0,0.0,196.0
8,0.0,0.0,0.0,61.0
9,0.0,0.0,0.0,241.0


## XG Boost


#### Grid SearchCV

In [31]:
# xg_grid_params = {
#     'gamma': range(0,100,20),
#     'max_depth': [1,5,10],
#     'lambda':[0.01,0.1,1,10,100],
#     'alpha':[0.001,0.01,0.1,1,10],
#     'learning_rate':[0.01,0.1,1,10]
# }
# warnings.filterwarnings('ignore')
# grid_xg = grid = GridSearchCV(XGBClassifier(), xg_grid_params, scoring='f1_macro')
# grid_xg.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", grid_xg.best_params_)
# print("Weighted f1_score :",grid_xg.best_score_)

#### Model Training

In [32]:
model_xg = XGBClassifier().fit(train_sentence_vectors, df['Labels'].to_list())





In [33]:
y_preds_xg = model_xg.predict(test_sentence_vectors)
report_xg = classification_report(df_test['Labels'].to_list(),y_preds_xg,output_dict=True)
df_xg = pd.DataFrame(report_xg).transpose()
df_xg

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.389324,0.839402,0.531932,1538.0
1,0.189189,0.028689,0.049822,244.0
2,0.518456,0.440171,0.476117,702.0
3,0.304348,0.025271,0.046667,277.0
4,0.255319,0.06366,0.101911,377.0
5,0.45,0.066421,0.115756,271.0
6,0.448387,0.278,0.34321,500.0
7,0.333333,0.020408,0.038462,196.0
8,0.0,0.0,0.0,61.0
9,0.166667,0.008299,0.01581,241.0


## Multi Layer Perceptron

#### Grid SearchCV

In [34]:
# mlp_params = {
#     'activation':['identity', 'logistic', 'tanh', 'relu'],
#     'solver':['lbfgs', 'sgd', 'adam'],
#     'alpha':[0.0001,0.001,0.01,0.1],
# }
# warnings.filterwarnings('ignore')
# mlp_grid = GridSearchCV(MLPClassifier(), mlp_params, scoring='f1_macro')
# mlp_grid.fit(combined_encodings,combined_df['Labels'].to_list())
# print("Tuned Hyperparameters :", mlp_grid.best_params_)
# print("Weighted f1_score :",mlp_grid.best_score_)

#### Model Training

In [35]:
clf = MLPClassifier().fit(train_sentence_vectors, df['Labels'].to_list())
mlp_predictions = clf.predict(test_sentence_vectors)



In [36]:
report_mlp = classification_report(df_test['Labels'].to_list(),mlp_predictions,output_dict=True)
df_mlp = pd.DataFrame(report_mlp).transpose()
df_mlp

Unnamed: 0,precision,recall,f1-score,support
0,0.418133,0.401821,0.409814,1538.0
1,0.121495,0.106557,0.113537,244.0
2,0.429664,0.400285,0.414454,702.0
3,0.138801,0.158845,0.148148,277.0
4,0.193955,0.204244,0.198966,377.0
5,0.165079,0.191882,0.177474,271.0
6,0.309033,0.39,0.344828,500.0
7,0.101695,0.091837,0.096515,196.0
8,0.034483,0.032787,0.033613,61.0
9,0.157895,0.112033,0.131068,241.0
