In [1]:
!pip install fasttext

In [2]:
!pip install nltk

In [3]:
# Mounting Drive
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# importing libraries
import pandas as pd
import numpy as np
import re

In [5]:
# Load the dataset
df = pd.read_csv('../Datasets/fsa_prod_dump.csv', on_bad_lines='skip')
df.head()

Unnamed: 0,L0_Category,L1_Class,L2_Category,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName,ItemCode,Vendor,ProdType
0,,,,,,GROUND CHUCK FLAP 50% CHUCK NECK 50%,623,Golden Gate Meat Company,Canonical
1,Food,Food - Meat & Game,Beef -- Ground & Shapes,Beef (Bulk) - Ground,Beef Ground >10% Fat (NAMP 136),FINE GRIND BEEF TUBES 81/19 6/10#,624,Golden Gate Meat Company,Canonical
2,Food,Food - Meat & Game,Beef - Foodservice Whole Cuts,Beef (Bulk) - Plate Cuts,Outside Skirt - Skinned (NAMP # 121E),AUST OUTSIDE SKIRT PEELED SOLD BY CASE,1717,Golden Gate Meat Company,Canonical
3,Food,Food - Meat & Game,Beef Deli & Charcuterie,Beef - Deli & Charcuterie,Beef Pastrami (NAMP # 611),PASTRAMI - NEW YORK STYLE- SABRA,1820,Golden Gate Meat Company,Canonical
4,Food,Food - Meat & Game,Beef Deli & Charcuterie,Beef - Deli & Charcuterie,Beef Pastrami (NAMP # 611),PASTRAMI SLICED 5# BAGS (50# CASE),1821,Golden Gate Meat Company,Canonical


In [6]:
df.shape

(1021560, 9)

In [7]:
# Droppig missing values
df = df.dropna(axis=0)

# Drop duplicates
df.drop_duplicates(keep='first', inplace=True)

# **L0 Category**

In [8]:
# Selecting columns for L1 category
columns = ['L0_Category', 'ProdName']

df = df[columns]
df.head(3)

Unnamed: 0,L0_Category,ProdName
1,Food,FINE GRIND BEEF TUBES 81/19 6/10#
2,Food,AUST OUTSIDE SKIRT PEELED SOLD BY CASE
3,Food,PASTRAMI - NEW YORK STYLE- SABRA


In [9]:
df.L0_Category.unique()

array(['Food', 'Administrative', 'Operationals', 'Beverage'], dtype=object)

In [10]:
# Counting L2 Category classes
len(df.L0_Category.unique())

4

In [11]:
# Importing Libraries
# import nltk
# from nltk.corpus import stopwords

# # Download stopwords
# nltk.download("stopwords")

In [12]:
# Preprocessing fuction for category
def class_pre(text):
  new_text = re.sub(r'&', 'and', text)
  new_text = re.sub(r'[^\w\s]',' ', new_text)
  new_text = re.sub(r' +', ' ', new_text)
  new_text = re.sub(r'\s', '_', new_text)
  return new_text

In [13]:
# Did a slight modification, convert & into and
# Then removes extra whitespaces

# def clean(string):
#     string = re.sub('&', 'and ', string)
#     raw_text = re.sub("[^a-zA-Z]+", " ", string)
#     raw_text = re.sub(' +', ' ', raw_text)
#     raw_text = raw_text.strip()
#     words = raw_text.lower().split()
#     stops = set(stopwords.words("english"))
#     meaningful_words = [
#         word for word in words if ((not word in stops) and (len(word) >= 3))
#     ]
#     string = " ".join(meaningful_words)
#     return string

In [14]:
# uniques = df.L2_Category.unique()

# for word in uniques:
#   pp = class_pre(word)
#   print(pp)

In [15]:
# making the category
df['L0'] = df['L0_Category'].apply(class_pre)

In [16]:
# Creating label of the class according to the fasttext format
df['category'] = '__label__' + df['L0'].astype(str)

In [17]:
# Drop used columns
df = df.drop(['L0_Category', 'L0'], axis=1)

In [18]:
# Creating catehory description with producbt name and vendor
df['category_description'] = df['category'] + ' ' + df['ProdName']
# df['category_description'] = df['L0_Category'] + ' ' + df['L1_Class'] + ' ' + df['ProdName']

In [19]:
#  Drop the columns : ProdName, L0_Category, L1_Class
df = df.drop(['ProdName'], axis=1)

In [20]:
df.head()

Unnamed: 0,category,category_description
1,__label__Food,__label__Food FINE GRIND BEEF TUBES 81/19 6/10#
2,__label__Food,__label__Food AUST OUTSIDE SKIRT PEELED SOLD B...
3,__label__Food,__label__Food PASTRAMI - NEW YORK STYLE- SABRA
4,__label__Food,__label__Food PASTRAMI SLICED 5# BAGS (50# CASE)
5,__label__Food,__label__Food ROAST BEEF EYE OF ROUND


In [21]:
# Replace all white spaces whith single space
# Remove white spaces in begining and end of the text
# Turn text into lower case

# Preprocessing Function
def preprocess(text):
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'[^\w\s]',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [22]:
# Apply preprocessing function to 'category_description'

df['category_description'] = df['category_description'].apply(preprocess)
# df['category_description'] = df['category'].apply(preprocess) + ' ' + df['category_description'].apply(clean)

In [24]:
# Test Train splitting
from sklearn.model_selection import train_test_split

# 98% training | 2% testing
train, test = train_test_split(df, test_size=0.02, random_state=42)

In [25]:
# Save test and train dataset into two differnet files
train.to_csv('train_set.train',
             columns=["category_description"],
             index=False,
             header=False)

test.to_csv('test_set.train',
             columns=["category_description"],
             index=False,
             header=False)

In [26]:
# Fast text model
import fasttext

model = fasttext.train_supervised(input='train_set.train',
                                  dim=600,
                                  loss='hs',
                                  wordNgrams=3,
                                  minn=3,
                                  maxn=5)

model.test('test_set.train')

(13917, 0.9906589063735001, 0.9906589063735001)

In [27]:
text = 'Bag Paper Kraft wine large'
text = preprocess(text)
model.predict(text, k=5)
# # True L2 Class = Bags - Service - Disposable

(('__label__operationals', '__label__food', '__label__administrative'),
 array([9.99904156e-01, 1.18458607e-04, 1.59696465e-05]))

In [28]:
model.save_model("../Models/L0/tuned_model.bin")