In [1]:
import pandas as pd
df = pd.read_csv("/content/ecommerceDataset.csv", names=["category", "description"])
df

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [2]:
df["category"].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

In [3]:
# Renaming "Clothing & Accessories" to "Clothing_Accessories"
df["category"].replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)
df["category"].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [4]:
# Adding "__label__" with each category
df["category"] = "__label__" + df["category"].astype(str)
df

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,__label__Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,__label__Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,__label__Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,__label__Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [5]:
# Adding new category with description column
df["category_description"] = df["category"] + " " + df["description"].astype(str)
df

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...
...,...,...,...
50420,__label__Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,__label__Electronics Strontium MicroSD Class 1...
50421,__label__Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,__label__Electronics CrossBeats Wave Waterproo...
50422,__label__Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,__label__Electronics Karbonn Titanium Wind W4 ...
50423,__label__Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",__label__Electronics Samsung Guru FM Plus (SM-...


In [6]:
# Now we are working with this "category_description" column
df["category_description"]

0        __label__Household Paper Plane Design Framed W...
1        __label__Household SAF 'Floral' Framed Paintin...
2        __label__Household SAF 'UV Textured Modern Art...
3        __label__Household SAF Flower Print Framed Pai...
4        __label__Household Incredible Gifts India Wood...
                               ...                        
50420    __label__Electronics Strontium MicroSD Class 1...
50421    __label__Electronics CrossBeats Wave Waterproo...
50422    __label__Electronics Karbonn Titanium Wind W4 ...
50423    __label__Electronics Samsung Guru FM Plus (SM-...
50424    __label__Electronics Micromax Canvas Win W121 ...
Name: category_description, Length: 50425, dtype: object

In [7]:
# first element of "category_description" column
df["category_description"][0]

'__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and so

In [8]:
# Regex keeps only lower-case words with one white-space
import re
def preprocess(text):
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"[ \n]+", " ", text)
    return text.strip().lower()

In [9]:
text = "__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in."
text

'__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in.'

In [10]:
preprocess(text)

'__label__household paper plane design framed wall hanging motivational office decor art prints 8 7 x 8 7 inch set of 4 painting made up in'

In [11]:
# Filtering "category_description" column with regex
df["category_description"] = df["category_description"].apply(preprocess)
df

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf floral framed painting ...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf uv textured modern art ...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...
...,...,...,...
50420,__label__Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,__label__electronics strontium microsd class 1...
50421,__label__Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,__label__electronics crossbeats wave waterproo...
50422,__label__Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,__label__electronics karbonn titanium wind w4 ...
50423,__label__Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",__label__electronics samsung guru fm plus sm b...


In [12]:
# second value of "df["category_description"]"
df["category_description"][1]

'__label__household saf floral framed painting wood 30 inch x 10 inch special effect uv print textured sao297 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch a perfect gift for your special ones'

In [13]:
# Splitting the dataset into training and test set
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=2023)

In [14]:
print(df.shape, train.shape, test.shape)

(50425, 3) (40340, 3) (10085, 3)


In [15]:
train.head(5)

Unnamed: 0,category,description,category_description
7356,__label__Household,"VOGUE INDIA - February, 2019 Before it's in fa...",__label__household vogue india february 2019 b...
4955,__label__Household,Paper Plane Design 'Rangoli Pattern' Floor Sti...,__label__household paper plane design rangoli ...
4301,__label__Household,JaipurCrafts WebelKart Decorative Unique Desig...,__label__household jaipurcrafts webelkart deco...
26484,__label__Books,Investment Adviser (Level 1) (X-A) (Reprint Ap...,__label__books investment adviser level 1 x a ...
14560,__label__Household,HUILE Manual pump for bottled water Drinking w...,__label__household huile manual pump for bottl...


In [16]:
test.head(5)

Unnamed: 0,category,description,category_description
707,__label__Household,Trevi Split 3-Door Wardrobe Without Mirror and...,__label__household trevi split 3 door wardrobe...
27875,__label__Books,The Black Swan: The Impact of the Highly Impro...,__label__books the black swan the impact of th...
14811,__label__Household,V-Guard Mini Crystal with 3 Pin Socket Voltage...,__label__household v guard mini crystal with 3...
5918,__label__Household,Goyal's Superior Quality Mink Single Bed Blank...,__label__household goyal s superior quality mi...
21865,__label__Books,Indian Polity (Tamil) (For TNPSC and UPSC Comp...,__label__books indian polity tamil for tnpsc a...


In [17]:
# Now, we need to keep this filtered df value to text,as fasttext train_unsupervised() fuction takes text not pandas Series
# We store this in text format
df.to_csv("ecommerce.train", columns = ["category_description"], index=False, header=None)
df.to_csv("ecommerce.test", columns = ["category_description"], index=False, header=None)

In [18]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/68.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=506cad1e68fe02d73ce2ed2680530cea02f0693f6f5c23ec573a3f1b9fa84f34
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fa

In [19]:
# importing library
import fasttext

# train the model on "ecommerce.train" dataset
model = fasttext.train_supervised("/content/ecommerce.train")
# test the model to cheack the model performance
model.test("/content/ecommerce.test")

(50425, 0.28624690133862174, 0.28624690133862174)



1.   50425 ==> test dataset size
2.   0.286 ==> Precision
3.   0.286 ==> Recall





In [20]:
model.predict("saf floral framed painting wood 30 inch x 10 inch special effect uv print textured sao297 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch a perfect gift for your special ones")

(('__label__clothing_accessories',), array([0.79556465]))

In [21]:
model.predict("Quantam Machanics")

(('__label__clothing_accessories',), array([0.99984813]))