# Feature Engineering & Model Training
U ovoj svesci vršimo pripremu podataka, testiranje više modela i treniranje finalnog klasifikatora proizvoda.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("../data/products.csv")
df = df.dropna(subset=["Product Title", "Category Label"])

X = df["Product Title"]
y = df["Category Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

Train size: 2, Test size: 1


## Testiranje više modela
U ovom koraku testiramo više klasifikatora (Logistic Regression, Naive Bayes, Random Forest, SVM)
da bismo videli koji daje najbolje rezultate na test skupu.

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "Linear SVM": LinearSVC()
}

for name, model in models.items():
    print(f"\n=== {name} ===")
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
        ("clf", model)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    print(classification_report(y_test, preds))


=== Logistic Regression ===
                 precision    recall  f1-score   support

Digital Cameras       0.00      0.00      0.00       0.0
  Mobile Phones       0.00      0.00      0.00       1.0

       accuracy                           0.00       1.0
      macro avg       0.00      0.00      0.00       1.0
   weighted avg       0.00      0.00      0.00       1.0


=== Naive Bayes ===
                 precision    recall  f1-score   support

Digital Cameras       0.00      0.00      0.00       0.0
  Mobile Phones       0.00      0.00      0.00       1.0

       accuracy                           0.00       1.0
      macro avg       0.00      0.00      0.00       1.0
   weighted avg       0.00      0.00      0.00       1.0


=== Random Forest ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                 precision    recall  f1-score   support

Fridge Freezers       0.00      0.00      0.00       0.0
  Mobile Phones       0.00      0.00      0.00       1.0

       accuracy                           0.00       1.0
      macro avg       0.00      0.00      0.00       1.0
   weighted avg       0.00      0.00      0.00       1.0


=== Linear SVM ===
                 precision    recall  f1-score   support

Fridge Freezers       0.00      0.00      0.00       0.0
  Mobile Phones       0.00      0.00      0.00       1.0

       accuracy                           0.00       1.0
      macro avg       0.00      0.00      0.00       1.0
   weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Treniranje finalnog modela
Nakon testiranja, trenira se finalni model (najbolji) na celom datasetu i čuva u `model/` folderu.

In [None]:
import joblib

final_model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=8000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=2000))
])

final_model.fit(X, y)

joblib.dump(final_model, "../model/product_classifier.pkl")
print("✅ Model sačuvan u 'model/product_classifier.pkl'")

✅ Model sačuvan u 'model/product_classifier.pkl'
