In [2]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
import re

# Load normalized data
# NOTE(review): absolute, machine-specific Windows path; the script only runs
# where this exact file exists. Code further down reads the 'description' and
# 'label' columns from this frame.
data = pd.read_csv(r"C:\Users\Dell\Downloads\normalized_data.csv")

# Download FastText model
# Loaded through gensim's downloader API. The "300" in the model name
# presumably matches the k=300 default of get_average_fasttext below; confirm
# if a different model is swapped in.
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# Some useful functions for FastText
def get_average_fasttext(tokens_list, model, generate_missing=False, k=300):
    """Average the word vectors of ``tokens_list`` into one vector.

    Args:
        tokens_list: Sequence of word strings to look up.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        generate_missing: When true, a word absent from ``model`` contributes
            ``np.random.rand(k)``; otherwise it contributes ``np.zeros(k)``.
        k: Length of the stand-in vectors and of the empty-input result.

    Returns:
        The element-wise mean over all tokens (missing words included in the
        count), or ``np.zeros(k)`` when ``tokens_list`` is empty.
    """
    token_count = len(tokens_list)
    if token_count == 0:
        return np.zeros(k)

    # Choose the stand-in for out-of-vocabulary words once, up front.
    make_fallback = np.random.rand if generate_missing else np.zeros

    word_vectors = [
        model[word] if word in model else make_fallback(k)
        for word in tokens_list
    ]
    return np.sum(word_vectors, axis=0) / token_count

def get_fasttext_embeddings(model, tokens, generate_missing=False):
    """Return one averaged FastText vector per entry of ``tokens``.

    Each entry is a string that is split on whitespace and passed to
    ``get_average_fasttext`` together with ``model`` and
    ``generate_missing``. The result is a list in the same order as
    ``tokens``.
    """
    return [
        get_average_fasttext(text.split(), model, generate_missing=generate_missing)
        for text in tokens
    ]

# Create tokens column from the description
# Empty cells in the CSV are read back by pandas as NaN (a float), and calling
# .lower() on that raises AttributeError. Coerce missing values to '' so such
# rows yield an empty token string, which get_average_fasttext maps to a zero
# vector.
descriptions = data['description'].fillna('').astype(str)
data['tokens'] = descriptions.apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))

# Split data into train and test sets (70:30 ratio)
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Targets for both partitions
y_train, y_test = train_data['label'], test_data['label']

# FastText embedding, converted straight to NumPy arrays (one row per sample)
X_train_fasttext = np.array(get_fasttext_embeddings(fasttext_model, train_data['tokens']))
X_test_fasttext = np.array(get_fasttext_embeddings(fasttext_model, test_data['tokens']))

# Number of distinct labels present in the training targets; shared by the
# two boosted models below.
n_classes = len(np.unique(y_train))

# Define models (insertion order is the order they are trained and reported in)
models = {
    "Linear SVM": SVC(kernel='linear', verbose=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "LightGBM": lgb.LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
    ),
    "XGBoost": xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=n_classes,
    ),
}

# Define label_dict with appropriate class labels and names
# Adjust according to your actual class labels and names
label_dict = {class_id: f'Class {class_id}' for class_id in range(4)}

# Initialize results dictionary to store accuracies and reports
results = {}

# classification_report expects a list-like of names; materialize the dict
# view once instead of passing `dict_values` on every iteration.
target_names = list(label_dict.values())

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")

    model.fit(X_train_fasttext, y_train)

    if model_name in ["LightGBM", "XGBoost"]:
        # np.argmax over predict_proba gives a *column index*, not a class
        # label. The columns follow model.classes_, so map the index back
        # through it; otherwise every metric below is wrong whenever the
        # labels are not exactly 0..n-1.
        train_idx = np.argmax(model.predict_proba(X_train_fasttext), axis=1)
        test_idx = np.argmax(model.predict_proba(X_test_fasttext), axis=1)
        y_pred_train = np.asarray(model.classes_)[train_idx]
        y_pred_test = np.asarray(model.classes_)[test_idx]
    else:
        y_pred_train = model.predict(X_train_fasttext)
        y_pred_test = model.predict(X_test_fasttext)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Weighted averages: per-class scores weighted by class support.
    precision = precision_score(y_test, y_pred_test, average='weighted')
    recall = recall_score(y_test, y_pred_test, average='weighted')
    f1 = f1_score(y_test, y_pred_test, average='weighted')

    test_classification_report = classification_report(y_test, y_pred_test, target_names=target_names)

    results[model_name] = {
        "Training Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Classification Report": test_classification_report
    }

# Print results
# The loop variable is `model_name` (not `model`) so the fitted estimator left
# in `model` by the training loop is not overwritten with a string.
for model_name, result in results.items():
    print(f"\nResults for {model_name}")
    # Scalar metrics, printed in a fixed order as "<name>: <value>".
    for metric in ("Training Accuracy", "Test Accuracy", "Precision", "Recall", "F1 Score"):
        print(f"{metric}:", result[metric])
    print("Test Classification Report:")
    print(result["Classification Report"])


Training Linear SVM...
[LibSVM]Training KNN...
Training Random Forest...
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 19461, number of used features: 300
[LightGBM] [Info] Start training from score -1.651200
[LightGBM] [Info] Start training from score -0.968555
[LightGBM] [Info] Start training from score -1.500308
[LightGBM] [Info] Start training from score -1.582368
Training XGBoost...

Results for Linear SVM
Training Accuracy: 0.9169621293869791
Test Accuracy: 0.9151180913559526
Precision: 0.916191323030186
Recall: 0.9151180913559526
F1 Score: 0.9151479313002661
Test Classification Report:
              precision    recall  f1-score   support

     Class 0       0.91      0.86      0.88      1575
     Class 1       0.88      0.93      0.91      3176
     Cl