### Feature Engineering
- Text: TF‑IDF pe titlu.
- Numeric:
  - number_of_views → log-transform.
  - merchant_rating → scalare.
  - listing_date → extragem year, month.


In [6]:
import numpy as np
import pandas as pd

# Load the product listings; the path is relative to the notebook's directory.
df = pd.read_csv("../data/products_modified.csv")

# log1p (= log(1 + x)) keeps a view count of 0 mapped to 0 instead of -inf.
df["views_log"] = np.log1p(df["number_of_views"])

# Parse the listing date once and reuse the parsed series for both calendar
# features, instead of re-parsing the whole column for each derived column.
listing_dates = pd.to_datetime(df["listing_date"])
df["year"] = listing_dates.dt.year
df["month"] = listing_dates.dt.month


### Building the pipeline

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# TF-IDF over word unigrams and bigrams of the title. Terms that occur in fewer
# than 2 documents are dropped and the vocabulary is capped at 100,000 features.
text_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=100000)

# Route each column group to its own transformer.
# "product_title" is passed as a plain string (not a list) so the vectorizer
# receives a 1-D sequence of documents, which is what text vectorizers expect.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_vectorizer, "product_title"),
        ("num", StandardScaler(), ["views_log", "merchant_rating", "year", "month"]),
    ]
)

# Full model: preprocessing followed by a class-weighted logistic regression.
# The explicit multi_class="multinomial" argument was dropped: it is deprecated
# in recent scikit-learn releases (FutureWarning), and the default already uses
# the multinomial formulation for a multi-class target with the default solver,
# so the fitted model is unchanged.
# NOTE(review): n_jobs is documented as parallelising over classes for
# one-vs-rest only, so it probably has no effect here — confirm before relying on it.
pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", LogisticRegression(max_iter=2000, n_jobs=4, class_weight="balanced")),
])

### Training and assessment 

- Metrici: accuracy, precision, recall, F1.
- Vizualizare: matrice de confuzie.


In [12]:
# Report how many missing values each column holds.
missing_per_column = df.isna().sum()
print(missing_per_column)

# The text vectorizer needs a title for every row, so rows without one are removed.
df = df.dropna(subset=["product_title"])

Unnamed: 0         0
product_id         0
product_title      0
merchant_id        0
category_label     0
product_code       0
number_of_views    0
merchant_rating    0
listing_date       0
views_log          0
year               0
month              0
dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Model inputs: the raw title plus the engineered numeric columns.
feature_columns = ["product_title", "views_log", "merchant_rating", "year", "month"]
X = df[feature_columns]
y = df["category_label"]

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions similar in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit preprocessing + classifier on the training part only, then predict the hold-out.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)

# NOTE(review): the report lists near-duplicate labels ("CPU"/"CPUs",
# "Mobile Phone"/"Mobile Phones", "fridge"/"Fridges") with very low scores —
# presumably label noise that should be normalised upstream; confirm the mapping.
val_report = classification_report(y_val, y_pred)
print(val_report)



Accuracy: 0.9247698504027618
                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        16
            CPUs       0.98      0.94      0.96       742
 Digital Cameras       1.00      0.99      0.99       532
     Dishwashers       0.91      0.96      0.93       675
        Freezers       0.87      0.95      0.91       436
 Fridge Freezers       0.96      0.89      0.93      1085
         Fridges       0.85      0.80      0.83       681
      Microwaves       0.98      0.97      0.97       461
    Mobile Phone       0.03      0.09      0.04        11
   Mobile Phones       0.93      0.95      0.94       794
             TVs       0.99      0.97      0.98       701
Washing Machines       0.98      0.94      0.96       794
          fridge       0.04      0.12      0.06        24

        accuracy                           0.92      6952
       macro avg       0.73      0.74      0.73      6952
    weighted avg       0.94      0.92    

### Save the model to `category_model.pkl`

In [14]:
import joblib

# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(value=pipe, filename="category_model.pkl")

['category_model.pkl']