<a href="https://colab.research.google.com/github/ScumpikLau/ml-product-category-laura-corbu/blob/main/notebook/model_comparison_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading and inspecting the cleaned dataset

In [1]:
import pandas as pd

# Read the cleaned product dataset straight from the project's GitHub repository
url = "https://raw.githubusercontent.com/ScumpikLau/ml-product-category-laura-corbu/main/notebook/data/data_clean.csv"
df = pd.read_csv(url)

# Sanity check: total row count followed by a preview of the first rows
print("Number of rows:", df.shape[0])
print("First five rows:")
display(df.head(5))

# Summary of column dtypes and non-null counts
print("\nDataset info:")
df.info()

Number of rows: 34760
First five rows:


Unnamed: 0.1,Unnamed: 0,Product Title,Category Label,category,title_char_count,title_word_count,title_has_numbers,title_special_char,brand_found,title_has_brand,title_longest_word_len
0,0,apple iphone 8 plus 64gb silver,Mobile Phones,Mobile Phones,31,6,True,False,apple,True,6
1,1,apple iphone 8 plus 64 gb spacegrau,Mobile Phones,Mobile Phones,35,7,True,False,apple,True,9
2,2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Mobile Phones,Mobile Phones,70,13,True,True,apple,True,10
3,3,apple iphone 8 plus 64gb space grey,Mobile Phones,Mobile Phones,35,7,True,False,apple,True,6
4,4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Mobile Phones,Mobile Phones,54,11,True,True,apple,True,8



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34760 entries, 0 to 34759
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              34760 non-null  int64 
 1   Product Title           34760 non-null  object
 2   Category Label          34760 non-null  object
 3   category                34760 non-null  object
 4   title_char_count        34760 non-null  int64 
 5   title_word_count        34760 non-null  int64 
 6   title_has_numbers       34760 non-null  bool  
 7   title_special_char      34760 non-null  bool  
 8   brand_found             10725 non-null  object
 9   title_has_brand         34760 non-null  bool  
 10  title_longest_word_len  34760 non-null  int64 
dtypes: bool(3), int64(4), object(4)
memory usage: 2.2+ MB


## Import required libraries and split the data into train and test sets

In [2]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Model inputs: the raw title text plus two numeric title-length features
x = df.loc[:, ['Product Title', 'title_char_count', 'title_longest_word_len']]
# Target: the product category to predict
y = df.loc[:, 'category']

# Hold out 20% of the rows for testing; stratifying on y keeps the category
# proportions the same in both splits, and the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

- Transforming the 'Product Title' column using **TF-IDF**
- Scaling 'title_char_count' and 'title_longest_word_len' using **MinMaxScaler**
- Using a **ColumnTransformer** to combine all features into a single input matrix and making a list of classifiers

In [3]:
# Preprocessor: TF-IDF for the title text, MinMaxScaler for each numeric feature.
# Both produce non-negative values on the training data, which MultinomialNB
# requires when it is fitted.
preprocessor = ColumnTransformer(
    transformers=[
        # A plain column name (not a list) hands TfidfVectorizer the 1-D text column
        ("title", TfidfVectorizer(), "Product Title"),
        # A list of column names hands MinMaxScaler the 2-D input it expects
        ("char_count", MinMaxScaler(), ["title_char_count"]),
        ("longest_word_len", MinMaxScaler(), ["title_longest_word_len"])
    ]
)

# Classifiers to compare, keyed by the display name used in the results.
# Estimators with internal randomness get a fixed seed (the same value used for
# the train/test split) so the reported scores are reproducible on a re-run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": LinearSVC(random_state=42)
}

## Wrapping all components into a unified **Pipeline** for each model to train all models and keep the predictions

In [4]:
# Fitted pipeline and test-set predictions for each model, keyed by model name
trained_pipelines = {}
preds = {}

for name, model in models.items():
    # Pipeline does not copy its steps, so passing `preprocessor` directly would
    # make every stored pipeline share (and refit) the same transformer object.
    # Cloning gives each pipeline its own independent, unfitted copy.
    pipe = Pipeline([
        ("preprocessing", clone(preprocessor)),
        ("classifier", model)
    ])
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    trained_pipelines[name] = pipe
    preds[name] = y_pred

print("Models are trained")

Models are trained


## Calculating Accuracy of predictions

In [5]:
# One record per model: its name and its accuracy on the held-out test set
acc_rows = [
    {"model": name, "accuracy": accuracy_score(y_test, y_pred)}
    for name, y_pred in preds.items()
]

# Rank the models from best to worst; explicit columns keep the sort valid even
# when no model has been evaluated yet
acc_df = (
    pd.DataFrame(acc_rows, columns=["model", "accuracy"])
    .sort_values("accuracy", ascending=False)
    .reset_index(drop=True)
)

# Print the label on its own line so it is not fused with the table header
print("Models accuracy:")
display(acc_df)


Models accuracy:                    model  accuracy
0  Support Vector Machine  0.969217
1           Random Forest  0.962313
2     Logistic Regression  0.957854
3           Decision Tree  0.948504
4             Naive Bayes  0.932825
