## Product Categorization System

### Libraries needed

In [38]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Prepare your data

In [39]:
from sklearn.model_selection import train_test_split

# Sample product data, grouped by category. Keeping each title next to its
# label in a single mapping means the two parallel lists below are derived
# rather than hand-aligned, so adding or removing a title cannot shift labels.
products_by_category = {
    'Clothing': [
        "blue cotton t-shirt size M",
        "denim jeans blue",
        "red hoodie size L",
        "green polo shirt size S",
        "black tank top",
        "white dress shirt size M",
        "gray sweatpants",
        "yellow raincoat waterproof",
    ],
    'Accessories': [
        "leather wallet black",
        "sunglasses UV protection",
        "canvas backpack gray",
        "silver necklace with pendant",
        "wool scarf red",
        "brown leather belt",
    ],
    'Shoes': [
        "running shoes size 10",
        "black leather boots",
        "white sneakers size 9",
        "blue sandals size 8",
        "trail running shoes size 11",
        "formal shoes black",
    ],
    'Sports': [
        "sports water bottle",
        "yoga mat non-slip",
        "tennis racket graphite",
        "basketball official size",
        "gym duffel bag",
        "protein shaker bottle",
    ],
    'Electronics': [
        "wireless headphones black",
        "smartphone charger USB-C",
        "LED monitor 24 inch",
        "Bluetooth speaker portable",
        "fitness tracker waterproof",
    ],
}

# Flat list of product titles, in mapping order (dicts preserve insertion order).
products = [
    title
    for titles in products_by_category.values()
    for title in titles
]

# Corresponding categories: one label per title, in the same order as `products`.
categories = [
    label
    for label, titles in products_by_category.items()
    for _ in titles
]

# A title listed twice could end up in both the training and the test split.
assert len(set(products)) == len(products), "duplicate product title in sample data"

# Hold out 20% of the titles for testing. Stratifying on the labels keeps the
# category proportions similar in both splits, and the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    products,
    categories,
    test_size=0.2,
    random_state=42,
    stratify=categories,
)


### Create a product categorization Pipeline

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

def create_product_classifier():
    """Build an unfitted text-classification pipeline for product titles.

    Returns:
        Pipeline: a TF-IDF vectorizer configured with English stop words,
        registered under the step name ``'vectorizer'``, followed by a
        multinomial naive Bayes model registered as ``'classifier'``.
    """
    # The step names are part of the contract: hyperparameter grids address
    # them with the "<step>__<param>" syntax.
    tfidf_step = ('vectorizer', TfidfVectorizer(stop_words='english'))
    bayes_step = ('classifier', MultinomialNB())
    return Pipeline([tfidf_step, bayes_step])

# Fresh, unfitted pipeline to tune.
pipeline = create_product_classifier()

# Candidate hyperparameters. Keys follow the "<step name>__<parameter>" form so
# the search can route each value to the right pipeline step.
param_grid = dict(
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    vectorizer__min_df=[1, 2],
    vectorizer__max_df=[0.75, 0.85, 1.0],
    vectorizer__max_features=[500, 1000, None],
    classifier__alpha=[0.1, 0.5, 1.0],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# accuracy and parallelised over all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)



Best params: {'classifier__alpha': 0.1, 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 500, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 1)}
Best CV accuracy: 0.4166666666666667


### Evaluate the model

In [41]:
from sklearn.metrics import accuracy_score, classification_report

# Pipeline configured with the best parameter combination found by the search.
model = grid_search.best_estimator_
predictions = model.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, predictions))
# The held-out split is tiny, so some categories may never be predicted.
# zero_division=0 reports their precision/F1 as 0.0 explicitly instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, predictions, zero_division=0))

Test accuracy: 0.14285714285714285
              precision    recall  f1-score   support

 Accessories       0.00      0.00      0.00         1
    Clothing       0.17      0.50      0.25         2
 Electronics       0.00      0.00      0.00         1
       Shoes       0.00      0.00      0.00         1
      Sports       0.00      0.00      0.00         2

    accuracy                           0.14         7
   macro avg       0.03      0.10      0.05         7
weighted avg       0.05      0.14      0.07         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# Product titles that were not part of the sample data used for training.
new_products = [
    "white cotton socks pack",
    "black leather belt",
    "men's running sneakers size 9",
    "waterproof sports watch",
    "blue denim jacket",
    "wireless bluetooth headphones",
    "kids basketball shoes",
    "women's wool scarf",
    "compact digital camera",
    "outdoor hiking backpack",
    "leather wallet brown",
    "yoga mat non-slip",
    "cotton t-shirt size L",
    "sports water bottle 1 liter",
]

# Classify every new title with the tuned pipeline.
predictions = model.predict(new_products)

# Show each title with its predicted category, separated by a blank line.
for title, label in zip(new_products, predictions):
    print(f"Product: {title}\nPredicted Category: {label}\n")

Product: white cotton socks pack
Predicted Category: Clothing

Product: black leather belt
Predicted Category: Accessories

Product: men's running sneakers size 9
Predicted Category: Shoes

Product: waterproof sports watch
Predicted Category: Sports

Product: blue denim jacket
Predicted Category: Clothing

Product: wireless bluetooth headphones
Predicted Category: Electronics

Product: kids basketball shoes
Predicted Category: Shoes

Product: women's wool scarf
Predicted Category: Clothing

Product: compact digital camera
Predicted Category: Clothing

Product: outdoor hiking backpack
Predicted Category: Accessories

Product: leather wallet brown
Predicted Category: Accessories

Product: yoga mat non-slip
Predicted Category: Clothing

Product: cotton t-shirt size L
Predicted Category: Clothing

Product: sports water bottle 1 liter
Predicted Category: Sports

