In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import re

from utils import load_tfidf_classifier_model
from constants import (
    FULL_TRAIN_DATASET_PATH, 
    FULL_TEST_DATASET_PATH, 
    FULL_EMBEDDING_MODEL_OUTPUT_DATASET_PATH,
    TFIDF_CLASSIFIER_CONFIG_PATH
)

Logger initialized. All logs will be saved to: c:\internship\Prodify-V2.0\src\logs\log.txt
[2025-09-14 23:04:30] - logger:_log - INFO - Logger initialized. Logs will be saved to c:\internship\Prodify-V2.0\src\logs\log.txt


In [2]:
def clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)

    return " ".join(text.strip().split())

In [None]:
df_train = pd.read_csv(FULL_TRAIN_DATASET_PATH)

df_train["product_name"] = df_train["product_name"].astype(str)

In [4]:
X_train = df_train["product_name"].tolist()
y_train = df_train[["segment", "family", "class"]].values.tolist()

In [5]:
model = load_tfidf_classifier_model(TFIDF_CLASSIFIER_CONFIG_PATH)

In [6]:
model.fit(X_train, y_train)

In [7]:
df_test = pd.read_csv(FULL_TEST_DATASET_PATH)

df_test["product_name"] = df_test["product_name"].astype(str)

In [10]:
X_test = df_test["product_name"].tolist()
segments = df_test["segment"].tolist()
families = df_test["family"].tolist()
classes = df_test["class"].tolist()

In [13]:
y_pred = model.predict(X_test)

In [15]:
pred_segments = [pred[0] for pred in y_pred]
pred_families = [pred[1] for pred in y_pred]
pred_classes = [pred[2] for pred in y_pred]

In [16]:
accuracy_score(segments, pred_segments)

0.9674284213291305

In [17]:
accuracy_score(families, pred_families)

0.9512739690044655

In [18]:
accuracy_score(classes, pred_classes)

0.9359075387444181

# Embedding model

In [3]:
df = pd.read_csv(FULL_EMBEDDING_MODEL_OUTPUT_DATASET_PATH)

In [5]:
df["pred_segment"] = df["pred_segment"].apply(clean)
df["pred_family"] = df["pred_family"].apply(clean)
df["pred_class"] = df["pred_class"].apply(clean)

In [6]:
accuracy_score(df["segment"].tolist(), df["pred_segment"].tolist())

0.3044391909640137

In [7]:
accuracy_score(df["family"].tolist(), df["pred_family"].tolist())

0.11583924349881797

In [8]:
accuracy_score(df["class"], df["pred_class"])

0.050170738114000524