## 25. fastText tutorial | Text Classification Using fastText

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/25. fastText tutorial | Text Classification Using fastText/ecommerce_dataset.csv",names = ['category','description'],header=None)
print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [24]:
df.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [25]:
df.dropna(inplace=True) # drop missing values
df.shape

(50424, 2)

In [26]:
#removing spaces in category labels- Because fasttext expects labels without spaces but "_" inplace of spaces.

df.category.replace('Clothing & Accessories','Clothing_Accessories',inplace=True)
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [27]:
# adding'_label_' as a prefix for labels

df['category'] = '_label_' + df['category'].astype(str)
df.head()

Unnamed: 0,category,description
0,_label_Household,Paper Plane Design Framed Wall Hanging Motivat...
1,_label_Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,_label_Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,_label_Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,_label_Household,Incredible Gifts India Wooden Happy Birthday U...


In [28]:
#merging two columns

df['category_description'] = df['category'] +" "+ df['description']
df.head()

Unnamed: 0,category,description,category_description
0,_label_Household,Paper Plane Design Framed Wall Hanging Motivat...,_label_Household Paper Plane Design Framed Wal...
1,_label_Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",_label_Household SAF 'Floral' Framed Painting ...
2,_label_Household,SAF 'UV Textured Modern Art Print Framed' Pain...,_label_Household SAF 'UV Textured Modern Art P...
3,_label_Household,"SAF Flower Print Framed Painting (Synthetic, 1...",_label_Household SAF Flower Print Framed Paint...
4,_label_Household,Incredible Gifts India Wooden Happy Birthday U...,_label_Household Incredible Gifts India Wooden...


In [29]:
#preprocessing category_description column using regular expression

import re

def preprocess(text):
  text = re.sub(r'[^\w\s\']',' ',text) #remove punctuation
  text = re.sub(r' +',' ',text) #remove extra spaces
  return text.strip().lower() # Remove leading and trailing whitespace and converting to lower case



In [30]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()


Unnamed: 0,category,description,category_description
0,_label_Household,Paper Plane Design Framed Wall Hanging Motivat...,_label_household paper plane design framed wal...
1,_label_Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",_label_household saf 'floral' framed painting ...
2,_label_Household,SAF 'UV Textured Modern Art Print Framed' Pain...,_label_household saf 'uv textured modern art p...
3,_label_Household,"SAF Flower Print Framed Painting (Synthetic, 1...",_label_household saf flower print framed paint...
4,_label_Household,Incredible Gifts India Wooden Happy Birthday U...,_label_household incredible gifts india wooden...


In [31]:
#splitting train and test samples

from sklearn.model_selection import train_test_split

train,test = train_test_split(df,test_size=0.2,random_state=2024)

In [32]:
train.shape,test.shape

((40339, 3), (10085, 3))

In [34]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.1-py3-none-any.whl (238 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246765 sha256=6b561206e9f1cf26986b453c2e6693181df7a3d582ba4e23fb00178c3154179d
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.1


In [37]:
train.to_csv("/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/25. fastText tutorial | Text Classification Using fastText/ecommerce.train",columns=["category_description"],index=False,header=False)
test.to_csv("/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/25. fastText tutorial | Text Classification Using fastText/ecommerce.test",columns=["category_description"],index=False,header=False)

In [38]:
import fasttext

model = fasttext.train_supervised(input="/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/25. fastText tutorial | Text Classification Using fastText/ecommerce.train") # model training
model.test("/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/25. fastText tutorial | Text Classification Using fastText/ecommerce.test")

(0, nan, nan)

In [39]:
model.predict("wintech assemble desktop pc cpu 500gb sata hdd 4gb ram")

((), array([], dtype=float64))

In [41]:
model.get_nearest_neighbors("painting")

[(0.33469337224960327, 'delicately'),
 (0.32087182998657227, 'mah'),
 (0.31482163071632385, 'eye'),
 (0.30570945143699646, 'amount'),
 (0.30306559801101685, "she's"),
 (0.3018181622028351, 'â'),
 (0.29906028509140015, 'baseball'),
 (0.2982535660266876, 'existing'),
 (0.2972378134727478, 'uses'),
 (0.29479917883872986, 'vera')]