In [24]:
import pandas as pd
from sklearn.metrics import accuracy_score

import re

from utils import (
    load_tfidf_classifier_model,
    load_embedding_svm_model,
    split_dataset,
    load_embedding_classifier_model
)

from modules.db import TeradataDatabase

from constants import (
    FULL_DATASET_PATH,
    FULL_TRAIN_DATASET_PATH,
    FULL_TEST_DATASET_PATH,
    FULL_EMBEDDING_MODEL_OUTPUT_DATASET_PATH,
    TFIDF_CLASSIFIER_CONFIG_PATH,
    EMBEDDING_SVM_CONFIG_PATH,
    ENSEMBLE_CONFIG_PATH,
    FULL_ENSEMBLE_MODEL_OUTPUT_DATASET_PATH,
    EMBEDDING_CLASSIFIER_CONFIG_PATH
)

In [5]:
df_gpc = pd.read_csv(FULL_DATASET_PATH)
df_gpc.head()

Unnamed: 0.1,Unnamed: 0,product_name,description,segment,family,class,brick,source,text,dedup_key
0,0,Sterling Silver Angel Charm,This little angel charm is just heavenly,Personal Accessories,Personal Accessories,Jewellery,,MWPD_FULL,sterling silver angel charm,sterling silver angel charm
1,1,HP Pavilion xi . cm (�) IPS Monitor,"Share photos, videos, and games with everyone ...",Computing,Computers/Video Games,Computer/Video Game Peripherals,,MWPD_FULL,hp pavilion xi . cm ips monitor,hp pavilion xi cm ips monitor
2,2,East Carolina Pirates Ladies Personalized Bask...,Feel like a bona fide member of East Carolina ...,Clothing,Clothing,Upper Body Wear/Tops,,MWPD_FULL,east carolina pirates ladies personalized bask...,east carolina pirates ladies personalized bask...
3,3,"Tekonsha P Electric Brake Control, - Trailer ...",Receive free shipping on this item. Enter coup...,Automotive,Automotive Accessories and Maintenance,Automotive Maintenance/Repair,,MWPD_FULL,tekonsha p electric brake control - trailer axles,tekonsha p electric brake control trailer axles
4,4,RN-XV WiFly Module - Wire Antenna,Description: The RN-XV module by Roving Networ...,Electrical Supplies,Electronic Communication Components,Electronic Communication Components,,MWPD_FULL,rn-xv wifly module - wire antenna,rn xv wifly module wire antenna


In [6]:
split_dataset(FULL_DATASET_PATH, FULL_TRAIN_DATASET_PATH, FULL_TEST_DATASET_PATH)

In [3]:
def clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)

    return " ".join(text.strip().split())

In [4]:
df_train = pd.read_csv(FULL_TRAIN_DATASET_PATH)

df_train["product_name"] = df_train["product_name"].astype(str)

In [5]:
df_test = pd.read_csv(FULL_TEST_DATASET_PATH)

df_test["product_name"] = df_test["product_name"].astype(str)

In [10]:
X_train = df_train["product_name"].tolist()
y_train = df_train["class"].tolist()

In [11]:
X_test = df_test["product_name"].tolist()
segments = df_test["segment"].tolist()
families = df_test["family"].tolist()
classes = df_test["class"].tolist()

# TF-IDF with SVM

In [12]:
model = load_tfidf_classifier_model(TFIDF_CLASSIFIER_CONFIG_PATH)

In [13]:
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
class_to_segment = df_train.set_index("class")["segment"].to_dict()
class_to_family = df_train.set_index("class")["family"].to_dict()

pred_segments = [class_to_segment.get(c, None) for c in y_pred]
pred_families = [class_to_family.get(c, None) for c in y_pred]

In [17]:
accuracy_score(segments, pred_segments)

0.9667695540815656

In [18]:
accuracy_score(families, pred_families)

0.9502200039403691

In [19]:
accuracy_score(classes, y_pred)

0.9431930124121626

In [20]:
model.save()

# Embedding With SVM

In [7]:
model = load_embedding_svm_model(EMBEDDING_SVM_CONFIG_PATH)

In [8]:
model.fit(X_train, y_train)

Batches:   0%|          | 0/1904 [00:00<?, ?it/s]

In [9]:
y_pred = model.predict(X_test)

Batches:   0%|          | 0/476 [00:00<?, ?it/s]

In [10]:
pred_segments = [pred[0] for pred in y_pred]
pred_families = [pred[1] for pred in y_pred]
pred_classes = [pred[2] for pred in y_pred]

In [11]:
accuracy_score(segments, pred_segments)

0.9326241134751773

In [12]:
accuracy_score(families, pred_families)

0.9061597058050959

In [13]:
accuracy_score(classes, pred_classes)

0.8815996847911741

In [None]:
model.save()

# Embedding model

In [25]:
embed_clf = load_embedding_classifier_model(EMBEDDING_CLASSIFIER_CONFIG_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'data/gpc.csv'

In [21]:
df = pd.read_csv(FULL_EMBEDDING_MODEL_OUTPUT_DATASET_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\ss255385\\OneDrive - Teradata Corporation\\Desktop\\Prodify-V2.0\\data\\full_embedding_model_output_dataset.csv'

In [5]:
df["pred_segment"] = df["pred_segment"].apply(clean)
df["pred_family"] = df["pred_family"].apply(clean)
df["pred_class"] = df["pred_class"].apply(clean)

In [6]:
accuracy_score(df["segment"].tolist(), df["pred_segment"].tolist())

0.3044391909640137

In [7]:
accuracy_score(df["family"].tolist(), df["pred_family"].tolist())

0.11583924349881797

In [8]:
accuracy_score(df["class"], df["pred_class"])

0.050170738114000524

# Test Ensemble Pipeline

In [2]:
df = pd.read_csv(FULL_ENSEMBLE_MODEL_OUTPUT_DATASET_PATH)

In [3]:
accuracy_score(df["segment"].tolist(), df["pred_segment"].tolist())

0.9583661675860258

In [4]:
accuracy_score(df["family"].tolist(), df["pred_family"].tolist())

0.9506829524560021

In [5]:
accuracy_score(df["class"], df["pred_class"])

0.9359075387444181