<a href="https://colab.research.google.com/github/RodicaCIA/my-first-repo/blob/main/proiectfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =============================
# TASK 3 - Clasificarea produselor după titlu
# =============================

# 1️⃣ Importuri
import pandas as pd
import pandas as pd

# load dataset from GitHub
url = "https://raw.githubusercontent.com/RodicaCIA/product-category-predictor/main/data/IMLP4_TASK_03-products.csv"
df = pd.read_csv(url)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# 2️⃣ Încărcare date
df = pd.read_csv("IMLP4_TASK_03-products.csv")

# Curățăm numele coloanelor (elimină spațiile înainte/după)
df.columns = df.columns.str.strip()

# 3️⃣ Explorare rapidă
print("Coloane disponibile:", df.columns.tolist())
print("\nPrimele rânduri:")
display(df.head(3))

# 4️⃣ Curățare și selecție coloane utile
df = df[['Product Title', 'Category Label']].dropna()
df['Product Title'] = df['Product Title'].astype(str).str.lower()

# 5️⃣ Împărțire în train/test
X_train, X_test, y_train, y_test = train_test_split(
    df['Product Title'], df['Category Label'],
    test_size=0.2, random_state=42, stratify=df['Category Label']
)

# 6️⃣ Vectorizare text TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 7️⃣ Definim mai multe modele pentru comparație
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=150, random_state=42),
    "Linear SVM": LinearSVC()
}

results = {}

# 8️⃣ Antrenare + evaluare
for name, model in models.items():
    print(f"\n🔹 Antrenare model: {name}")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"Acuratețe: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# 9️⃣ Afișăm comparația acurateței
print("\n📊 Rezumat performanțe:")
for name, acc in results.items():
    print(f"{name}: {acc:.4f}")

best_model_name = max(results, key=results.get)
print(f"\n🏆 Cel mai bun model: {best_model_name} ({results[best_model_name]:.4f})")

best_model = models[best_model_name]

# 10️⃣ Matrice de confuzie
y_pred_best = best_model.predict(X_test_tfidf)
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test, y_pred_best), annot=True, fmt='d', cmap='Blues')
plt.title(f"Matrice de confuzie - {best_model_name}")
plt.xlabel("Predicții")
plt.ylabel("Valori reale")
plt.show()

# 11️⃣ Salvăm modelul și vectorizatorul
with open("product_category_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("\n✅ Modelele au fost salvate cu succes: product_category_model.pkl și tfidf_vectorizer.pkl")

# 12️⃣ Test interactiv
def predict_category(title):
    title = title.lower()
    vector = vectorizer.transform([title])
    return best_model.predict(vector)[0]

# Exemple de testare
print("\n🔮 Exemple de testare:")
examples = [
    "iphone 7 32gb gold,4,3,Apple iPhone 7 32GB",
    "olympus e m10 mark iii geh use silber",
    "kenwood k20mss15 solo",
    "bosch wap28390gb 8kg 1400 spin",
    "bosch serie 4 kgv39vl31g",
    "smeg sbs8004po"
]

for e in examples:
    print(f"{e} ➡️ {predict_category(e)}")


FileNotFoundError: [Errno 2] No such file or directory: 'IMLP4_TASK_03-products.csv'

Pregătirea mediului și încărcarea datelor