In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
import re
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load normalized data
data = pd.read_csv("/kaggle/input/ecomnorm/normalized_data.csv")

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Set batch size
batch_size = 32

# Modify get_bert_embeddings function to accept batch_size parameter
def get_bert_embeddings(sentences, max_length=128, batch_size=batch_size):
    embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch_sentences = sentences[i:i+batch_size]
        tokenized_texts = [tokenizer.tokenize("[CLS] " + sentence + " [SEP]")[:max_length] for sentence in batch_sentences]
        input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
        input_ids = [ids + [0] * (max_length - len(ids)) for ids in input_ids]  # Padding
        input_ids = torch.tensor(input_ids).to(device)
        with torch.no_grad():
            outputs = bert_model(input_ids)
        batch_embeddings = outputs.last_hidden_state
        avg_embeddings = torch.mean(batch_embeddings, dim=1).cpu().numpy()
        embeddings.append(avg_embeddings)
    return np.concatenate(embeddings, axis=0)

# Create tokens column from the description
data['tokens'] = data['description'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))

# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Get BERT embeddings
X_train_bert = get_bert_embeddings(train_data['tokens'])
X_test_bert = get_bert_embeddings(test_data['tokens'])

y_train = train_data['label']
y_test = test_data['label']

# Models
models = {
    "Linear SVM": SVC(kernel='linear', verbose=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LightGBM": LGBMClassifier(),
    "XGBoost": xgb.XGBClassifier()
}

results = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_bert, y_train)
    print(f"Finished training {model_name}.")

    print(f"Evaluating {model_name}...")
    y_train_pred = model.predict(X_train_bert)
    y_test_pred = model.predict(X_test_bert)

    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Precision, recall, F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')

    # Classification report
    classification_report_text = classification_report(y_test, y_test_pred)

    results[model_name] = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Classification Report": classification_report_text
    }

    print(f"Results for {model_name}:")
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("Classification Report:")
    print(classification_report_text)
    print("=" * 50)

# Organize and print final results
print("Final Results:")
for model_name, result in results.items():
    print(model_name)
    print("Training Accuracy:", result["Train Accuracy"])
    print("Test Accuracy:", result["Test Accuracy"])
    print("Precision:", result["Precision"])
    print("Recall:", result["Recall"])
    print("F1-score:", result["F1-score"])
    print("Classification Report:")
    print(result["Classification Report"])
    print("=" * 50)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  0%|          | 0/609 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 609/609 [03:21<00:00,  3.02it/s]
100%|██████████| 261/261 [01:29<00:00,  2.91it/s]


Training Linear SVM...
[LibSVM].........................................*..............................*....*
optimization finished, #iter = 74674
obj = -1599.627559, rho = -6.190021
nSV = 2067, nBSV = 1522
.................*............*....*
optimization finished, #iter = 33130
obj = -629.930654, rho = 0.483407
nSV = 971, nBSV = 556
.....*...*
optimization finished, #iter = 8481
obj = -250.686538, rho = -4.218498
nSV = 519, nBSV = 237
.......................*..............*.*
optimization finished, #iter = 38658
obj = -992.095123, rho = 3.047693
nSV = 1383, nBSV = 932
...........*.....*
optimization finished, #iter = 16505
obj = -707.099564, rho = 1.071838
nSV = 1079, nBSV = 726
......*..*
optimization finished, #iter = 8774
obj = -358.669956, rho = -3.867346
nSV = 652, nBSV = 371
Total nSV = 4418
Finished training Linear SVM.
Evaluating Linear SVM...
Results for Linear SVM:
Training Accuracy: 0.9457376291043625
Test Accuracy: 0.9220716940414818
Precision: 0.9221036333739958
Recall: 