In [1]:
import pandas as pd
import numpy as np
import re
import csv
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import os
import sys

# Define preprocessing function
def preprocess_text(text):
    """Normalize a raw value into lowercase, punctuation-free text.

    String input is lowercased and stripped of every character that is
    neither a word character (letter, digit, underscore) nor whitespace.
    Whitespace itself is left untouched. Any non-string input (for example
    ``None`` or a float) produces an empty string.
    """
    # Guard clause: anything that is not a string has no text to clean.
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Function to create bag of words
def create_bag_of_words(csv_file, text_column=0):
    """Count word occurrences in one column of a CSV file.

    The first row is treated as a header and skipped. Each remaining row's
    text cell is cleaned with ``preprocess_text`` and split on whitespace;
    every resulting token increments its counter.

    Args:
        csv_file: Path of a UTF-8 encoded CSV file.
        text_column: Zero-based index of the column that holds the text.
            Defaults to 0 (the first column).

    Returns:
        A ``defaultdict(int)`` mapping each word to its total count.
    """
    bag_of_words = defaultdict(int)
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header
        for row in reader:
            # csv.reader yields an empty list for a blank line, and a
            # malformed row may be shorter than expected; skip such rows
            # instead of failing with IndexError.
            if not row or len(row) <= text_column:
                continue
            for word in preprocess_text(row[text_column]).split():
                bag_of_words[word] += 1
    return bag_of_words

# Function to save bag of words to a file
def save_bag_of_words(bag_of_words, file_path):
    """Write word counts to a text file, one ``word: count`` line per entry.

    Args:
        bag_of_words: Mapping of word to count; entries are written in the
            mapping's iteration order.
        file_path: Destination path. An existing file is overwritten.
    """
    # Write UTF-8 explicitly: the platform default encoding may be unable
    # to represent non-ASCII words.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(
            f'{word}: {count}\n' for word, count in bag_of_words.items()
        )

# Suppress LightGBM messages
class SuppressedOutput:
    """Context manager that silences Python-level stdout and stderr.

    On entry ``sys.stdout`` and ``sys.stderr`` are replaced by a handle on
    ``os.devnull``; on exit the previous stream objects are put back and the
    devnull handle is closed. Only output written through ``sys.stdout`` /
    ``sys.stderr`` is affected.
    """

    def __enter__(self):
        self.devnull = open(os.devnull, 'w')
        # Remember the current streams so __exit__ can restore exactly them.
        self.stdout_orig = sys.stdout
        self.stderr_orig = sys.stderr
        sys.stdout = self.devnull
        sys.stderr = self.devnull
        # Return the manager so ``with SuppressedOutput() as s`` binds it.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.stdout_orig
        sys.stderr = self.stderr_orig
        self.devnull.close()
        # A falsy return value lets any exception from the body propagate.
        return False

# Load dataset
csv_file = "/content/ecommerceDataset_normalized.csv"
df = pd.read_csv(csv_file)

# Apply text preprocessing (non-string cells are mapped to "")
df['text_preprocessed'] = df['normalized description'].apply(preprocess_text)

# Generate bag of words
# NOTE(review): create_bag_of_words reads the FIRST column of the CSV, while
# the features below are built from 'normalized description' -- confirm that
# these are the same column.
bag_of_words = create_bag_of_words(csv_file)

# Save bag of words to a file
save_bag_of_words(bag_of_words, "bag_of_words.txt")
print("Bag of words saved to 'bag_of_words.txt'")

# Convert bag of words to feature vectors using CountVectorizer.
# Pass only the keys: a mapping would be interpreted as term -> column index,
# and the values here are word counts, not indices.
vectorizer = CountVectorizer(vocabulary=bag_of_words.keys())
X = vectorizer.fit_transform(df['text_preprocessed'])

# Encode labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    SVC(kernel='linear'),
    LGBMClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    XGBClassifier()
]

# Convert the feature matrices to np.float32. The class labels are
# categorical ids, so they stay as the integer codes from LabelEncoder.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

from tabulate import tabulate
from sklearn.metrics import accuracy_score, classification_report

# Explicit label ids keep target_names aligned with the encoder's classes
# even if a class is absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# Collect one result row per model
results = []
# Train and evaluate each model
for model in models:
    with SuppressedOutput():
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
    )
    macro_avg = report['macro avg']
    results.append([
        type(model).__name__,
        accuracy,
        macro_avg['precision'],
        macro_avg['recall'],
        macro_avg['f1-score'],
    ])

# Print results as a table
headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score"]
print(tabulate(results, headers=headers, tablefmt="grid"))


Bag of words saved to 'bag_of_words.txt'
+------------------------+------------+-------------+----------+------------+
| Model                  |   Accuracy |   Precision |   Recall |   F1 Score |
+========================+============+=============+==========+============+
| SVC                    |   0.931127 |    0.9298   | 0.930463 |   0.930127 |
+------------------------+------------+-------------+----------+------------+
| LGBMClassifier         |   0.9385   |    0.940254 | 0.937095 |   0.938577 |
+------------------------+------------+-------------+----------+------------+
| KNeighborsClassifier   |   0.802014 |    0.837442 | 0.814549 |   0.810083 |
+------------------------+------------+-------------+----------+------------+
| RandomForestClassifier |   0.933465 |    0.938668 | 0.930016 |   0.933902 |
+------------------------+------------+-------------+----------+------------+
| XGBClassifier          |   0.927711 |    0.931439 | 0.924988 |   0.927944 |
+------------------------+------------+-------------+----------+------------+
