In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import time


In [3]:
tamil_train_embeddings = torch.tensor(np.load(r"train_tamil_fasttext_embeddings.npy"), dtype=torch.float32)
tamil_dev_embeddings = torch.tensor(np.load(r"dev_tamil_fasttext_embeddings.npy"), dtype=torch.float32)
tamil_test_embeddings = torch.tensor(np.load(r"test_tamil_fasttext_embeddings.npy"), dtype=torch.float32)

tamil_eng_train_embeddings = torch.tensor(np.load(r"train_tamil_eng_fasttext_embeddings.npy"), dtype=torch.float32)
tamil_eng_dev_embeddings = torch.tensor(np.load(r"dev_tamil_eng_fasttext_embeddings.npy"), dtype=torch.float32)
tamil_eng_test_embeddings = torch.tensor(np.load(r"test_tamil_eng_fasttext_embeddings.npy"), dtype=torch.float32)

y_tamil_train = np.loadtxt(r"train_tamil_labels.txt", dtype=int)
y_tamil_dev = np.loadtxt(r"dev_tamil_labels.txt", dtype=int)

y_tamil_eng_train = np.loadtxt(r"train_tamil_eng_labels.txt", dtype=int)
y_tamil_eng_dev = np.loadtxt(r"dev_tamil_eng_labels.txt", dtype=int)



#______________________________________________________________

tamil_combined_embeddings = torch.cat((tamil_train_embeddings, tamil_dev_embeddings), dim=0)
y_tamil_combined = np.concatenate((y_tamil_train, y_tamil_dev))

#______________________________________________________________

tamil_eng_combined_embeddings = torch.cat((tamil_eng_train_embeddings, tamil_eng_dev_embeddings), dim=0)
y_tamil_eng_combined = np.concatenate((y_tamil_eng_train, y_tamil_eng_dev))

#______________________________________________________________

# ------------------ Tamil: Combine Original + English-Converted Embeddings ------------------
final_tamil_train_embeddings = torch.cat((tamil_combined_embeddings, tamil_eng_combined_embeddings), dim=1)
final_tamil_train_labels = y_tamil_combined  # Labels remain unchanged

final_tamil_test_embeddings = torch.cat((tamil_test_embeddings, tamil_eng_test_embeddings), dim=1)
#final_tamil_test_labels = y_tamil_test  # Labels remain unchanged

# ------------------ Tamil: Combine Original + English-Converted Embeddings ------------------
final_tamil_train_embeddings2= torch.cat((tamil_train_embeddings, tamil_eng_train_embeddings), dim=1)
final_tamil_train_labels2 = y_tamil_train  # Labels remain unchanged

final_tamil_test_embeddings2 = torch.cat((tamil_dev_embeddings, tamil_eng_dev_embeddings), dim=1)
final_tamil_test_labels2 = y_tamil_dev  # Labels remain unchanged



In [11]:
# Define all dataset combinations
DATASETS = {
    "Tamil": (tamil_train_embeddings, tamil_dev_embeddings, y_tamil_train, y_tamil_dev),
    
    "Tamil-English": (tamil_eng_train_embeddings, tamil_eng_dev_embeddings, y_tamil_eng_train, y_tamil_eng_dev),
    
    "Tamil (Original+English)": (final_tamil_train_embeddings2, final_tamil_test_embeddings2, final_tamil_train_labels2, final_tamil_test_labels2),
    }

# Define ML models
MODELS = {
    "XGBoost": xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', booster='gbtree', n_estimators=1000, max_depth=8, learning_rate=0.1, random_state=42, tree_method="hist", device="cuda"),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=1000, max_depth=8, random_state=42, class_weight='balanced', min_samples_split=2, min_samples_leaf=1),
    "SVM": SVC(kernel='linear', probability=True,  random_state=42, class_weight='balanced', C=1.0, gamma='scale'),
    "KNN": KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto', n_jobs=-1),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,250,500,300), max_iter=2500, random_state=42),
}


In [None]:
# Define all dataset combinations
DATASETS = {
    "Tamil": (tamil_combined_embeddings, tamil_test_embeddings, y_tamil_combined),
    
    "Tamil-English": (tamil_eng_combined_embeddings, tamil_eng_test_embeddings, y_tamil_eng_combined),
    
    "Tamil (Original+English)": (final_tamil_train_embeddings, final_tamil_test_embeddings, final_tamil_train_labels),
    }

# Define ML models
MODELS = {
    "XGBoost": xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', booster='gbtree', n_estimators=1000, max_depth=8, learning_rate=0.1, random_state=42, tree_method="hist", device="cuda"),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42),
    "SVM": SVC(kernel='linear', probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
}


In [12]:
results = []

for dataset_name, (X_train, X_test, y_train, y_test) in DATASETS.items():
    print(f"\n==== Training on dataset: {dataset_name} ====")

    for model_name, model in MODELS.items():
        print(f"\nTraining model: {model_name}")

        # Handle GPU training for XGBoost
        if "XGB" not in model_name:
            # Convert PyTorch tensors to NumPy only for non-XGBoost models
            if isinstance(X_train, torch.Tensor):
                X_train = X_train.cpu().numpy()
            if isinstance(X_test, torch.Tensor):
                X_test = X_test.cpu().numpy()
            if isinstance(y_train, torch.Tensor):
                y_train = y_train.cpu().numpy()
            if isinstance(y_test, torch.Tensor):
                y_test = y_test.cpu().numpy()

        # Measure training time
        train_start = time.time()
        try:
            model.fit(X_train, y_train)
        except Exception as e:
            print(f"❌ Error training {model_name} on {dataset_name}: {e}")
            continue  # Skip to next model
        train_end = time.time()
        train_time = train_end - train_start  # Training duration

        # Measure evaluation time
        eval_start = time.time()
        try:
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")
            report = classification_report(y_test, y_pred, output_dict=True)

            print(f"✅ Accuracy: {acc:.4f}")
            print(f"✅ F1 Score: {f1:.4f}")
            print("Classification Report:\n", classification_report(y_test, y_pred))
            print(f"⏱️ Training Time: {train_time:.2f} sec")
            print("--------------------------------------")

            eval_end = time.time()
            eval_time = eval_end - eval_start  # Evaluation duration

            # Store results
            results.append({
                "Dataset": dataset_name,
                "Model": model_name,
                "Accuracy": acc,
                "F1 Score": f1,
                "Precision": report["weighted avg"]["precision"],
                "Recall": report["weighted avg"]["recall"],
                "Training Time (sec)": train_time,
                "Evaluation Time (sec)": eval_time
            })

        except Exception as e:
            print(f"❌ Error evaluating {model_name} on {dataset_name}: {e}")

# Convert results to DataFrame and save as CSV
#df_results = pd.DataFrame(results)
#df_results.to_csv("model_results.csv", index=False)
#print("\n✅ Results saved to 'model_results.csv'")


==== Training on dataset: Tamil ====

Training model: XGBoost
✅ Accuracy: 0.7878
✅ F1 Score: 0.7835
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       485
           1       0.77      0.64      0.70       302

    accuracy                           0.79       787
   macro avg       0.78      0.76      0.77       787
weighted avg       0.79      0.79      0.78       787

⏱️ Training Time: 18.02 sec
--------------------------------------

Training model: Logistic Regression
✅ Accuracy: 0.6366
✅ F1 Score: 0.5697
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.93      0.76       485
           1       0.59      0.17      0.27       302

    accuracy                           0.64       787
   macro avg       0.62      0.55      0.51       787
weighted avg       0.62      0.64      0.57       787

⏱️ Training Time: 0.03 sec
----------------------------

In [5]:
import pandas as pd
import torch
import time

# Load test set IDs
test_ids_df = pd.read_csv("test.csv")  # Replace with actual file
test_ids = test_ids_df["id"].tolist()  # Ensure IDs match the test set order

for dataset_name, (X_train, X_test, y_train) in DATASETS.items():
    print(f"\n==== Training on dataset: {dataset_name} ====")

    for model_name, model in MODELS.items():
        print(f"\nTraining model: {model_name}")

        # Handle GPU training for XGBoost
        if "XGB" not in model_name:
            if isinstance(X_train, torch.Tensor):
                X_train = X_train.cpu().numpy()
            if isinstance(X_test, torch.Tensor):
                X_test = X_test.cpu().numpy()
            if isinstance(y_train, torch.Tensor):
                y_train = y_train.cpu().numpy()

        # Train the model
        train_start = time.time()
        try:
            model.fit(X_train, y_train)
        except Exception as e:
            print(f"❌ Error training {model_name} on {dataset_name}: {e}")
            continue
        train_end = time.time()
        train_time = train_end - train_start

        # Predict on test set
        try:
            y_pred = model.predict(X_test)

            print(f"✅ Saving predictions for {dataset_name} using {model_name}")

            # Save predictions for this model-dataset combination
            submission_file = f"submission_{dataset_name.replace(' ', '_')}_{model_name.replace(' ', '_')}_new.csv"
            submission_df = pd.DataFrame(zip(test_ids, y_pred), columns=["id", "predictions"])
            submission_df.to_csv(submission_file, index=False, header=False)

            print(f"\n✅ Predictions saved to '{submission_file}'")

        except Exception as e:
            print(f"❌ Error evaluating {model_name} on {dataset_name}: {e}")



==== Training on dataset: Tamil ====

Training model: XGBoost
✅ Saving predictions for Tamil using XGBoost

✅ Predictions saved to 'submission_Tamil_XGBoost_new.csv'

Training model: Logistic Regression
✅ Saving predictions for Tamil using Logistic Regression

✅ Predictions saved to 'submission_Tamil_Logistic_Regression_new.csv'

Training model: Random Forest
✅ Saving predictions for Tamil using Random Forest

✅ Predictions saved to 'submission_Tamil_Random_Forest_new.csv'

Training model: SVM
✅ Saving predictions for Tamil using SVM

✅ Predictions saved to 'submission_Tamil_SVM_new.csv'

Training model: KNN
✅ Saving predictions for Tamil using KNN

✅ Predictions saved to 'submission_Tamil_KNN_new.csv'

Training model: MLP




✅ Saving predictions for Tamil using MLP

✅ Predictions saved to 'submission_Tamil_MLP_new.csv'

==== Training on dataset: Tamil-English ====

Training model: XGBoost
✅ Saving predictions for Tamil-English using XGBoost

✅ Predictions saved to 'submission_Tamil-English_XGBoost_new.csv'

Training model: Logistic Regression
✅ Saving predictions for Tamil-English using Logistic Regression

✅ Predictions saved to 'submission_Tamil-English_Logistic_Regression_new.csv'

Training model: Random Forest
✅ Saving predictions for Tamil-English using Random Forest

✅ Predictions saved to 'submission_Tamil-English_Random_Forest_new.csv'

Training model: SVM
✅ Saving predictions for Tamil-English using SVM

✅ Predictions saved to 'submission_Tamil-English_SVM_new.csv'

Training model: KNN
✅ Saving predictions for Tamil-English using KNN

✅ Predictions saved to 'submission_Tamil-English_KNN_new.csv'

Training model: MLP
✅ Saving predictions for Tamil-English using MLP

✅ Predictions saved to 'submissi