In [2]:
# Importing Essential Packages
import numpy as np
import tensorflow as tf
import pandas as pd

# Importing Essential Functions from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Importing Neural Network functions from TensorFlow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Various Classifiers to Compare Accuracy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Label Assignmnent
languages = ['eng_tamil']
label = {
    '0' : 'homophobic',
    '1' : 'normal',
    '2' : 'transphobic'
}
data = {}

# Load datasets
for lang in languages:
    data[lang] = {
        'train': pd.read_csv(f"/content/eng_tam_train_prepro_nonaug.csv"),
        'test': pd.read_csv(f"/content/eng_tam_test_prepro_nonaug.csv")
    }

# Encode labels
Label_Encoder = LabelEncoder()
for lang in languages:
    data[lang]['train']['label'] = Label_Encoder.fit_transform(data[lang]['train']['label'])
    data[lang]['test']['label'] = Label_Encoder.transform(data[lang]['test']['label'])

# Feature extraction
vectorizer = TfidfVectorizer(max_features=5000)
X_Train_Data = vectorizer.fit_transform(data[lang]['train']['text'])
X_Test_Data = vectorizer.transform(data[lang]['test']['text'])
Y_Train_Data = data[lang]['train']['label']
Y_Test_Data = data[lang]['test']['label']

# Build the ANN model: four ReLU hidden layers (512 -> 256 -> 128 -> 64), each
# followed by 50% dropout, and a 3-unit softmax output (one unit per class).
model = Sequential([
    Dense(512, activation = 'relu', input_shape=(X_Train_Data.shape[1],)),
    Dropout(0.5),
    Dense(256, activation = 'relu'),
    Dropout(0.5),
    Dense(128, activation = 'relu'),
    Dropout(0.5),
    Dense(64, activation = 'relu'),
    Dropout(0.5),
    Dense(3, activation = 'softmax')
])

# Compile the model.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept. The sparse loss is used because the targets are
# integer class ids rather than one-hot vectors.
model.compile(optimizer=Adam(learning_rate = 0.001),
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

# Train the model. The TF-IDF matrix is converted with .toarray() before being
# handed to Keras; 10% of the training rows are held out for validation.
history = model.fit(X_Train_Data.toarray(),Y_Train_Data, epochs=25, batch_size=32, validation_split=0.1)

# Evaluate the model: take the arg-max over the softmax outputs as the
# predicted class id for each test row.
Y_Pred_ANN = np.argmax(model.predict(X_Test_Data.toarray()), axis=-1)
print("ANN Confusion Matrix:")
print(confusion_matrix(Y_Test_Data, Y_Pred_ANN))
print("\n ANN Classification Report:")
print(classification_report(Y_Test_Data, Y_Pred_ANN))
print("-"*50)

# K-nearest-neighbours baseline; the neighbour count (k) is a tunable choice.
NUM_NEIGHBORS = 10
knn = KNeighborsClassifier(n_neighbors=NUM_NEIGHBORS)
knn.fit(X_Train_Data, Y_Train_Data)

# Score the fitted model on the test split and report the results.
Y_Pred_KNN = knn.predict(X_Test_Data)

print("KNN Confusion Matrix:")
knn_confusion = confusion_matrix(Y_Test_Data, Y_Pred_KNN)
print(knn_confusion)

print("\n KNN Classification Report:")
knn_report = classification_report(Y_Test_Data, Y_Pred_KNN)
print(knn_report)

print("-" * 50)


def Train_and_Evaluate(X_Train_Data, Y_Train_Data, X_Test_Data, Y_Test_Datat, model, model_name):
    """Fit ``model`` on the training data and print its test-set metrics.

    Args:
        X_Train_Data: Training feature matrix passed to ``model.fit``.
        Y_Train_Data: Training labels passed to ``model.fit``.
        X_Test_Data: Test feature matrix passed to ``model.predict``.
        Y_Test_Datat: True test labels the predictions are scored against.
            (The parameter name is kept as-is for compatibility.)
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label used in the printed section headers.

    Returns:
        None. The confusion matrix and classification report are printed.
    """
    # Train the model
    model.fit(X_Train_Data, Y_Train_Data)

    # Predict on the test set
    y_pred = model.predict(X_Test_Data)

    # Score against the labels supplied by the caller. Previously the body
    # referenced the module-level ``Y_Test_Data`` and ignored this argument.
    y_true = Y_Test_Datat

    # Print the confusion matrix
    Confusion_Matrix = confusion_matrix(y_true, y_pred)
    print(f'{model_name} Confusion Matrix:')
    print(Confusion_Matrix)
    print()

    # Print the classification report
    Classification_Report = classification_report(y_true, y_pred)
    print(f'{model_name} Classification Report:')
    print(Classification_Report)
    print()



for lang in languages:
    # Labels for this language; the features are the module-level TF-IDF
    # matrices computed earlier.
    train_labels = data[lang]['train']['label']
    test_labels = data[lang]['test']['label']
    separator = "-" * 50

    # Estimators under comparison, instantiated with the settings used for
    # each baseline.
    Support_Vector_Macine = SVC(kernel='linear', C=1)
    Logistic_Regression = LogisticRegression(max_iter=1000)
    DecisionTree = DecisionTreeClassifier()
    Random_Forest = RandomForestClassifier()
    Naive_Bayes = GaussianNB()
    Gradient_Boosting = GradientBoostingClassifier()
    MLP = MLPClassifier()
    XGB = XGBClassifier()

    # Naive Bayes is the one model that is given the .toarray() form of the
    # features instead of the matrices produced by the vectorizer.
    X_train_dense = X_Train_Data.toarray()
    X_test_dense = X_Test_Data.toarray()

    # (estimator, report title, training features, test features), listed in
    # the order the models are trained and reported.
    experiments = [
        (Support_Vector_Macine, 'Support Vector Machine', X_Train_Data, X_Test_Data),
        (Logistic_Regression, 'Logistic Regression', X_Train_Data, X_Test_Data),
        (DecisionTree, 'Decision Tree', X_Train_Data, X_Test_Data),
        (Random_Forest, 'Random Forest', X_Train_Data, X_Test_Data),
        (Naive_Bayes, 'Naive Bayes', X_train_dense, X_test_dense),
        (Gradient_Boosting, 'Gradient Boosting', X_Train_Data, X_Test_Data),
        (MLP, 'Multilayer Perceptron', X_Train_Data, X_Test_Data),
        (XGB, 'XGBoost', X_Train_Data, X_Test_Data),
    ]

    for estimator, title, train_features, test_features in experiments:
        Train_and_Evaluate(train_features, train_labels, test_features, test_labels, estimator, title)
        print(separator)


# Iterate through the test dataset, print text and predictions.
#
# Display names are looked up from the *encoded* class id. Previously the
# ids were passed through Label_Encoder.inverse_transform and the result was
# compared against the integers 0 and 1; if the original labels are not those
# integers, every row falls through to "Transphobic".
# NOTE(review): assumes encoded ids 0/1/2 mean homophobic/normal/transphobic,
# as in the `label` table above -- confirm against Label_Encoder.classes_.
CLASS_DISPLAY_NAMES = {0: "Homophobic", 1: "Normal"}
FALLBACK_DISPLAY_NAME = "Transphobic"  # same catch-all as the original else-branch

# zip() walks the three sequences positionally, so the result does not depend
# on the index labels of the pandas Series (the original `Y_Test_Data[i]` was
# a label-based lookup).
for text, true_class, predicted_class in zip(
        data[lang]['test']['text'], Y_Test_Data, Y_Pred_ANN):
    true_label = CLASS_DISPLAY_NAMES.get(int(true_class), FALLBACK_DISPLAY_NAME)
    predicted_label = CLASS_DISPLAY_NAMES.get(int(predicted_class), FALLBACK_DISPLAY_NAME)

    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print()





Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
ANN Confusion Matrix:
[[  21   61    6]
 [  36 1017   32]
 [   4   18   12]]

 ANN Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.24      0.28        88
           1       0.93      0.94      0.93      1085
           2       0.24      0.35      0.29        34

    accuracy                           0.87      1207
   macro avg       0.50      0.51      0.50      1207
weighted avg       0.87      0.87      0.87      1207

--------------------------------------------------
KNN Confusion Matrix:
[[   0   88    0]
 [   0 1085    0]
 [   0   34    0]]

 KNN Classification Report:
              precision    recall  f1-score   support

          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Machine Confusion Matrix:
[[   8   80    0]
 [   7 1078    0]
 [   3   29    2]]

Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.09      0.15        88
           1       0.91      0.99      0.95      1085
           2       1.00      0.06      0.11        34

    accuracy                           0.90      1207
   macro avg       0.78      0.38      0.40      1207
weighted avg       0.88      0.90      0.87      1207


--------------------------------------------------
Logistic Regression Confusion Matrix:
[[   4   84    0]
 [   2 1083    0]
 [   1   32    1]]

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.05      0.08        88
           1       0.90      1.00      0.95      1085
           2       1.00      0.03      0.06        34

    accuracy                           0.90      1207
   macro avg   