In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModel
import torch
import glob
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import FunctionTransformer

In [12]:
# Load the cleaned product data (stored as several parquet shards), stack the
# shards into one frame, and draw a reproducible working sample.
files = sorted(glob.glob('../data/products_clean_part_*.parquet'))
if not files:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No parquet shards match '../data/products_clean_part_*.parquet'"
    )
df_clean = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population (sampling without replacement).
# With at least 100_000 rows this is identical to sampling exactly 100_000.
df_sample = df_clean.sample(min(100_000, len(df_clean)), random_state=42)

In [13]:
# Pretrained sentence-embedding model used by embed_text below.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_text(text_series):
    """Return ``sentence_model``'s encoding of the given texts.

    ``text_series`` must provide ``.tolist()`` (e.g. a pandas Series of
    strings). The resulting list is handed to ``sentence_model.encode`` with
    the progress bar disabled, and the encoder's output is returned as is.
    """
    texts = text_series.tolist()
    embeddings = sentence_model.encode(texts, show_progress_bar=False)
    return embeddings

# Expose embed_text as a scikit-learn transformer so it can be used as a
# pipeline step; validate=False hands the input to embed_text without
# scikit-learn's array validation.
text_embed_transformer = FunctionTransformer(func=embed_text, validate=False)

# Single-step pipeline around the embedding transformer.
text_embed_pipeline = Pipeline(steps=[('embed', text_embed_transformer)])

In [14]:
# Numeric features: standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: one-hot encode; handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were not seen in fit.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))],
)

In [15]:
# Build one free-text column by joining title, description and feature with
# single spaces; missing values are replaced by empty strings first.
_text_parts = [
    df_sample[column].fillna('')
    for column in ('title', 'description', 'feature')
]
df_sample['text'] = _text_parts[0] + ' ' + _text_parts[1] + ' ' + _text_parts[2]

# Combine the per-column preprocessors into a single feature matrix.
#
# The form of each column selector matters: ColumnTransformer passes a scalar
# string selector to its transformer as a 1-D column, and a list selector as
# a 2-D frame. Columns not listed here are dropped (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        # Scalar selector on purpose: embed_text calls .tolist() on the 1-D
        # text column.
        ('text_embeds', text_embed_pipeline, 'text'),  # You can also combine multiple text fields differently
        # List selectors: StandardScaler and OneHotEncoder require 2-D input
        # and raise "Expected 2D array, got 1D array instead" when given a
        # scalar selector.
        ('num', numeric_transformer, ['price']),
        ('cat', categorical_transformer, ['brand']),
    ],
    remainder='drop'
)



In [17]:
## SGDClassifier (linear model)

# Linear classifier trained by SGD on the logistic loss, with balanced class
# weights and a fixed seed.
model = SGDClassifier(
    loss='log_loss',
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)

# Chain preprocessing and the classifier into a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Target is the 'main_cat_grouped' column; predictors are the combined text
# plus brand and price.
y = df_sample['main_cat_grouped']
X = df_sample[['text', 'brand', 'price']]

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit preprocessing and classifier on the training subset only.
pipeline.fit(X_train, y_train)


KeyboardInterrupt: 

In [None]:
## Evaluate:
# Predict labels for the held-out test set and print the per-class report.
y_pred = pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# TODO: confusion matrix for the test-set predictions

In [None]:
# TODO: save the final model and the final preprocessor to the api folder as a joblib file