In [1]:
!pip install fasttext -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp39-cp39-linux_x86_64.whl size=4395599 sha256=2cacd6bd8ab912813d278455cd46ffca434fb793ddb25cd1f11c756c64ffcb02
  Stored in directory: /root/.cache/pip/wheels/64/57/bc/1741406019061d5664914b070bd3e71f6244648732bc96109e
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [49]:
import fasttext

import pandas as pd

import re

from sklearn.model_selection import train_test_split

## Loading the Dataset

In [24]:
from pathlib import Path
import zipfile


zip_path = Path("/content/ecommerce_classification.zip")
dest_dir = Path("/content")
# The CSV that is read from `dest_dir` later on; if it is already there,
# the archive has been extracted before.
csv_path = dest_dir / "ecommerceDataset.csv"

# Extract only when the CSV is missing. The previous guard tested
# `dest_dir.is_file()`, which is always False for a directory, so the archive
# was re-extracted on every run.
if not csv_path.is_file():
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}`..")

[INFO] Unzipping dataset `/content/ecommerce_classification.zip` to `/content`...
[INFO] Dataset succesfully downloaded to `/content`..


## Preprocessing the Dataset

In [25]:
# Read the extracted CSV, naming its two columns explicitly:
# the category label first, the product text second.
df = pd.read_csv(
    dest_dir / "ecommerceDataset.csv",
    names=["label", "text"],
)

df

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [26]:
# Number of rows per category, before any cleaning.
df.value_counts(subset="label")

label
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
dtype: int64

In [27]:
# Remove every row that has a missing value in any column (this mutates `df`),
# then recount the rows per category to see what was lost.
df.dropna(how="any", inplace=True)

df.value_counts(subset="label")

label
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
dtype: int64

By default, fastText expects a training file in which every line has the following format: 
* ...
* \_\_label\_\_$label_i$ $text_i$
* ...

So we don't want the labels to contain extra spaces.

In [28]:
# Rename the `Clothing & Accessories` label value so that it contains no spaces.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df["label"]`: that form mutates an
# intermediate object, which pandas warns about and which does not update `df`
# when Copy-on-Write is enabled.
df["label"] = df["label"].replace("Clothing & Accessories", "Clothing_Accessories")

df["label"].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [29]:
# Prefix every label with the `__label__` marker.
# Labels that already start with the marker are kept as they are, so running
# this cell more than once does not stack a second prefix on top of the first.
labels = df["label"].astype(str)
df["label"] = labels.where(labels.str.startswith("__label__"), "__label__" + labels)

df["label"].unique()

array(['__label__Household', '__label__Books',
       '__label__Clothing_Accessories', '__label__Electronics'],
      dtype=object)

In [31]:
# Build a one-column frame named `data`: each row is the prefixed label,
# a single space, and then the product text.
processed_df = (df["label"] + " " + df["text"]).to_frame(name="data")

processed_df

Unnamed: 0,data
0,__label__Household Paper Plane Design Framed W...
1,__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household SAF Flower Print Framed Pai...
4,__label__Household Incredible Gifts India Wood...
...,...
50420,__label__Electronics Strontium MicroSD Class 1...
50421,__label__Electronics CrossBeats Wave Waterproo...
50422,__label__Electronics Karbonn Titanium Wind W4 ...
50423,__label__Electronics Samsung Guru FM Plus (SM-...


In [44]:
def preprocessed(text):
    """Normalize one string: strip punctuation, squeeze whitespace, lower-case.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is replaced by a space. Each run of whitespace
    is then collapsed to a single space, the ends are trimmed and the result
    is lower-cased. Underscores are word characters, so a leading
    ``__label__`` prefix is preserved (in lower case).

    Tabs and line breaks are collapsed as well, not only spaces, so the
    returned string always fits on a single line.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-line, lower-cased string.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", text)
    # str.split() with no argument splits on any run of whitespace and drops
    # leading/trailing whitespace, so join() leaves exactly one space between words.
    return " ".join(without_punctuation.split()).lower()

In [45]:
# Look at the `data` value of the row whose index label is 1.
processed_df.loc[1, "data"]

"__label__household saf 'floral' framed painting wood 30 inch x 10 inch special effect uv print textured sao297 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch a perfect gift for your special ones"

In [46]:
# Try the cleaning function on that same row before applying it everywhere.
preprocessed(processed_df.loc[1, "data"])

'__label__household saf floral framed painting wood 30 inch x 10 inch special effect uv print textured sao297 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch a perfect gift for your special ones'

In [47]:
# Clean every row of the `data` column with `preprocessed`, overwriting the column.
processed_df["data"] = processed_df["data"].apply(preprocessed)

In [48]:
# Display the frame again to check the cleaned `data` column.
processed_df

Unnamed: 0,data
0,__label__household paper plane design framed w...
1,__label__household saf floral framed painting ...
2,__label__household saf uv textured modern art ...
3,__label__household saf flower print framed pai...
4,__label__household incredible gifts india wood...
...,...
50420,__label__electronics strontium microsd class 1...
50421,__label__electronics crossbeats wave waterproo...
50422,__label__electronics karbonn titanium wind w4 ...
50423,__label__electronics samsung guru fm plus sm b...


## Splitting the Dataset into Training and Testing Sets

In [53]:
# Hold out 20% of the rows for evaluation. A fixed `random_state` makes the
# shuffle reproducible, so re-running the notebook yields the same partition
# (without it, every run produced a different train/test split).
train, test = train_test_split(processed_df, test_size=0.2, random_state=42)

print(train.shape, test.shape)

(40339, 1) (10085, 1)


## Converting the DataFrames into CSVs

In [54]:
# Write each split as plain text, one row of the `data` column per line.
# `to_csv` applies CSV quoting: a row containing a comma, a double quote or a
# line break gets wrapped in double quotes, so that line would no longer
# start with `__label__`. Writing the strings verbatim avoids this.
for split_df, out_path in ((train, "ecommerce.train"), (test, "ecommerce.test")):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{line}\n" for line in split_df["data"])

## Training the Model

In [55]:
# Train a supervised fastText classifier on the training file written above.
# Only `input` is passed, so every other training option keeps the library default.
model = fasttext.train_supervised(input="ecommerce.train")

## Evaluating the Model

In [56]:
# Evaluate the trained model on the held-out test file; the cell displays the
# tuple that `model.test` returns.
model.test("ecommerce.test")

(10084, 0.9702499008330028, 0.9702499008330028)

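where `10084` is the number of examples fastText evaluated from the test file (one fewer than the 10085 rows in `test`, so one row was presumably skipped) and the remaining two values are the precision and recall, respectively.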

## Making Predictions

In [57]:
# Classify a single product description. The string is written lower-case and
# without punctuation, i.e. in the same form `preprocessed` gave the training text.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor")

(('__label__electronics',), array([0.99636447]))

In [58]:
# Classify a second description, again given in lower case without punctuation.
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000989]))

## Finding the Most Similar Words for a Given Word

In [62]:
# Ask the model for the words it considers closest to "painting";
# the result is a list of (score, word) pairs.
model.get_nearest_neighbors("painting")

[(0.9985460042953491, 'sistema'),
 (0.9985453486442566, 'undertakings'),
 (0.9985406994819641, 'lushed'),
 (0.9985387325286865, '850pa'),
 (0.9985387325286865, 'pickups'),
 (0.9985387325286865, 'irsensors'),
 (0.9985387325286865, '150min'),
 (0.9985352754592896, '0150901'),
 (0.9985352754592896, 'ottman'),
 (0.9985349774360657, 'furnxt')]