In [24]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
import gzip_knn_classifier as gkc

In [12]:
df_data = (
    pd.read_csv(
        "./data/Ecommerce Text Classification.csv",
        header=None,
        encoding="iso-8859-1",
    )
    .dropna()
)

In [13]:
df_data.shape

(50424, 2)

In [14]:
df_data.columns = ["class_label", "text"]
df_data.head()

Unnamed: 0,class_label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [15]:
(
    df_data
    .groupby("class_label")
    .size()
)

class_label
Books                     11820
Clothing & Accessories     8670
Electronics               10621
Household                 19313
dtype: int64

In [16]:
df_data["text_bytes"] = (
    df_data["text"]
    .str.strip()
    .str.lower()
    .str.encode("iso-8859-1")
    .apply(gzip.compress)
)

In [17]:
class_labels_to_idx = {
    class_label: idx
    for idx, class_label in enumerate(df_data["class_label"].unique())
}

idx_to_class_label = {
    idx: class_label
    for class_label, idx in class_labels_to_idx.items()
}

df_data["class_label_idx"] = df_data["class_label"].apply(class_labels_to_idx.get)

In [18]:
df_data.head(5)

Unnamed: 0,class_label,text,text_bytes,class_label_idx
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,b'\x1f\x8b\x08\x00C\xbd\xa3e\x02\xff]T[\x8e\xd...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",b'\x1f\x8b\x08\x00C\xbd\xa3e\x02\xffE\x8f\xddn...,0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,"b""\x1f\x8b\x08\x00C\xbd\xa3e\x02\xff\xadT\xd1n...",0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",b'\x1f\x8b\x08\x00C\xbd\xa3e\x02\xff\xadTKn\xd...,0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,"b""\x1f\x8b\x08\x00C\xbd\xa3e\x02\xff]SKn\x1b1\...",0


In [26]:
seed = 16021998

df_train, df_test = train_test_split(df_data, test_size=0.3, stratify=df_data["class_label"], random_state=seed)

print(f"Training size: {df_train.shape[0]}")
display(df_train.head(5))
display(df_train.groupby("class_label").size())

print(f"Testing size: {df_test.shape[0]}")
display(df_test.head(5))
display(df_test.groupby("class_label").size())

Training size: 35296


Unnamed: 0,class_label,text,text_bytes,class_label_idx
28379,Books,The Gene: An Intimate History About the Author...,b'\x1f\x8b\x08\x00D\xbd\xa3e\x02\xff]\x93Q\x96...,1
43872,Electronics,AmazonBasics USB 2.0 Cable - A-Male to B-Male ...,"b""\x1f\x8b\x08\x00E\xbd\xa3e\x02\xffm\x8dA\x0e...",3
10948,Household,Rena Germany Knife Sharpening Rod - Stainless ...,b'\x1f\x8b\x08\x00D\xbd\xa3e\x02\xffm\x90Kn\xc...,0
38129,Clothing & Accessories,Carriwell Halter Neck Maternity Tankini Swimsu...,"b""\x1f\x8b\x08\x00E\xbd\xa3e\x02\xffU\x90KN\xc...",2
2063,Household,AMZ Exclusive Premium Quality Soft Rocking Cha...,b'\x1f\x8b\x08\x00C\xbd\xa3e\x02\xff\xadUK\x8e...,0


class_label
Books                      8274
Clothing & Accessories     6069
Electronics                7434
Household                 13519
dtype: int64

Testing size: 15128


Unnamed: 0,class_label,text,text_bytes,class_label_idx
44119,Electronics,Cosmos â 10'' Diameter Heavy Duty 360 â Ro...,"b""\x1f\x8b\x08\x00E\xbd\xa3e\x02\xff\xb5\x8eA\...",3
135,Household,M.G Enterprise Grey Mix 200 Gm Wool Ball Hand ...,b'\x1f\x8b\x08\x00C\xbd\xa3e\x02\xffE\x90\xc1n...,0
11346,Household,"Embassy Stainless Steel Trivet/Table Ring, Rou...","b""\x1f\x8b\x08\x00D\xbd\xa3e\x02\xffu\x91QN\xc...",0
46961,Electronics,Railway to the Point General Knowledge & Gener...,"b""\x1f\x8b\x08\x00E\xbd\xa3e\x02\xff+J\xcc\xcc...",3
49156,Electronics,Mivi Conquer Wireless Bluetooth Earphones - Gu...,b'\x1f\x8b\x08\x00E\xbd\xa3e\x02\xff\xb5TK\x92...,3


class_label
Books                     3546
Clothing & Accessories    2601
Electronics               3187
Household                 5794
dtype: int64

In [28]:
clf = gkc.new(
    df_train["text_bytes"],
    df_train["class_label_idx"],
)

In [29]:
y_test = df_test["class_label_idx"]
y_preds = [pred.label for pred in gkc.classify_many(clf, df_test["text_bytes"], 5)]

In [30]:
(y_test == y_preds).value_counts()

class_label_idx
True     7947
False    7181
Name: count, dtype: int64