# This code is for HOS using multilingual Embeddings for three Dravidian CodeMix languages

## Packages to be installed

In [None]:
!pip install -U sentence-transformers

## Import the required packages

In [None]:
# packages
import pandas as pd
from collections import Counter
from sentence_transformers import SentenceTransformer
import numpy as np
import sklearn
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)

### Read the data

In [None]:
# Load both splits. The training file is read with its first row as the header;
# the test file is read without a header, so its columns are numbered 0, 1, ...
train_csv, test_csv = "malayalam_offensive_train.csv", "mal_full_offensive_test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv, header=None)

### Separate the train and test sentences and labels into lists

In [None]:
# Every row is a single string of the form "<sentence>\t<label>"; split it once
# and route the first two fields to the sentence and label lists.
train_sent_m, train_label_m = [], []
test_sent_m, test_label_m = [], []

for row in train['0']:
    fields = row.split('\t')
    train_sent_m.append(fields[0])
    train_label_m.append(fields[1])

for row in test[0]:
    fields = row.split('\t')
    test_sent_m.append(fields[0])
    test_label_m.append(fields[1])

### Label Encoding

In [None]:
from sklearn import preprocessing

# Fit the encoder on the training labels only, then reuse that exact mapping for
# the test labels. Re-fitting on the test labels could assign different integer
# ids whenever the two splits do not contain the same label set, which would
# silently corrupt every metric and every inverse_transform on predictions.
# transform() raises ValueError if the test split contains an unseen label.
le = preprocessing.LabelEncoder()
train_labels_encoded = le.fit_transform(train_label_m)
dev_labels_encoded = le.transform(test_label_m)

### Get Embeddings

In [None]:
# Sentence encoder used for both splits.
# here other multilingual embeddings can be loaded
embedding_checkpoint = 'bert-base-multilingual-cased'
trans_model = SentenceTransformer(embedding_checkpoint)

In [None]:
# Encode the training sentences first, then the test sentences, with the same model.
train_sentence_embeddings, dev_sentence_embeddings = (
    trans_model.encode(sentences) for sentences in (train_sent_m, test_sent_m)
)

### Weight calculation

In [None]:
# Imported explicitly so the cell does not rely on `sklearn.utils.class_weight`
# having been loaded as a side effect of another import.
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights are n_samples / (n_classes * count(class)).
# `classes` and `y` must be passed by keyword: recent scikit-learn releases
# reject the positional form with a TypeError.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_label_m),
                                     y=np.asarray(train_label_m))
print(class_weights)

### Classification

### Logistic regression

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# class_weight='balanced' lets scikit-learn derive the per-class weights from the
# training labels (n_samples / (n_classes * count(class))) instead of relying on
# hard-coded numbers that are only valid for one specific training set.
model = LogisticRegression(class_weight='balanced')
model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)

print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Logistic_regression.csv')
print("prediction saved")

### Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_LR.png")

### Naive Bayes

In [None]:
# Naive Bayes (Gaussian) on the sentence embeddings.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

model = GaussianNB()
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Naive_baise.csv')
print("prediction saved")

## Confusion matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_NB.png")

## Random Forest

In [None]:
# Random forest on the sentence embeddings.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# class_weight='balanced' lets scikit-learn derive the per-class weights from the
# training labels (n_samples / (n_classes * count(class))) instead of relying on
# hard-coded numbers that are only valid for one specific training set.
# random_state is fixed so that re-running the cell reproduces the same forest.
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('RF.csv')
print("prediction saved")

## Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_RF.png")

## SVM RBF

In [None]:
# SVM with an RBF kernel on the sentence embeddings.
from sklearn import svm
from sklearn.metrics import classification_report

# class_weight='balanced' lets scikit-learn derive the per-class weights from the
# training labels (n_samples / (n_classes * count(class))) instead of relying on
# hard-coded numbers that are only valid for one specific training set.
model = svm.SVC(kernel='rbf', C=1000, class_weight='balanced')
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_RBF.csv')
print("prediction saved")


## Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_SVM_rbf.png")

## SVM Poly

In [None]:
# SVM with a polynomial kernel on the sentence embeddings.
import pickle

from sklearn import svm
from sklearn.metrics import classification_report

model = svm.SVC(kernel='poly', C=1000)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_poly_labse.csv')
print("prediction saved")

# Save the trained model to a pickle file. The `with` block closes the file on
# exit, so no explicit close() is needed afterwards.
pkl_filename = "poly_labse.pkl"
with open(pkl_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_SVM_poly_labse.png")

## SVM Linear

In [None]:
# SVM with a linear kernel on the sentence embeddings.
import pickle

from sklearn import svm
from sklearn.metrics import classification_report

model = svm.SVC(kernel='linear', C=10)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_linear_labse.csv')
print("prediction saved")

# Save the trained model to a pickle file. The `with` block closes the file on
# exit, so no explicit close() is needed afterwards.
pkl_filename = "linear_labse.pkl"
with open(pkl_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_SVM_linear_labse.png")

## Adaboost

In [None]:
# AdaBoost on the sentence embeddings.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# random_state is fixed so that re-running the cell reproduces the same ensemble.
model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)

print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Adaboost.csv')
print("prediction saved")

## Confusion matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_adaboost.png")

## KNN

In [None]:
# k-nearest neighbours (distance-weighted votes) on the sentence embeddings.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

model = KNeighborsClassifier(weights='distance')
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('KNN.csv')
print("prediction saved")


## Confusion Matrix

In [None]:
# Note: `plot_confusion_matrix` is intentionally not imported. It was unused here
# and no longer exists in scikit-learn >= 1.2, where importing it raises ImportError.
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

classes = np.unique(train_label_m)
# Passing `labels` pins the matrix to one row/column per class even when a class
# is missing from both `expected` and `predicted`; otherwise the DataFrame below
# would fail with a shape mismatch. Encoded ids are 0..len(classes)-1.
cfm = confusion_matrix(expected, predicted, labels=np.arange(len(classes)))

df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)
fig, ax = plt.subplots(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
cfm_plot.figure.savefig("cfm_KNN.png")

## Decision Tree

In [None]:
# Decision tree on the sentence embeddings.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# class_weight='balanced' lets scikit-learn derive the per-class weights from the
# training labels (n_samples / (n_classes * count(class))) instead of relying on
# hard-coded numbers that are only valid for one specific training set.
# random_state is fixed so that re-running the cell reproduces the same tree.
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="macro")
precision = precision_score(expected, predicted, average="macro")
f1 = f1_score(expected, predicted, average="macro")

# Macro-averaged scores, one name line followed by one value line each.
print("macro")
for metric_name, metric_value in (("accuracy", accuracy),
                                  ("precision", precision),
                                  ("recall", recall),
                                  ("f1score", f1)):
    print(metric_name)
    print("%.3f" % metric_value)


print("Classification report")
# Names are listed in the order of the encoded label ids (0..4).
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


# Saving the predictions (decoded back to the original string labels)
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('DT.csv')
print("prediction saved")
