In [1]:
!pip install fasttext

In [2]:
!pip install nltk

In [3]:
# Mounting Drive
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Importing libraries
import pandas as pd
import numpy as np
import re

In [5]:
# Load the dataset
df = pd.read_csv('../../Datasets/fsa_prod_dump.csv', on_bad_lines='skip')
df.head()

Unnamed: 0,L0_Category,L1_Class,L2_Category,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName,ItemCode,Vendor,ProdType
0,,,,,,GROUND CHUCK FLAP 50% CHUCK NECK 50%,623,Golden Gate Meat Company,Canonical
1,Food,Food - Meat & Game,Beef -- Ground & Shapes,Beef (Bulk) - Ground,Beef Ground >10% Fat (NAMP 136),FINE GRIND BEEF TUBES 81/19 6/10#,624,Golden Gate Meat Company,Canonical
2,Food,Food - Meat & Game,Beef - Foodservice Whole Cuts,Beef (Bulk) - Plate Cuts,Outside Skirt - Skinned (NAMP # 121E),AUST OUTSIDE SKIRT PEELED SOLD BY CASE,1717,Golden Gate Meat Company,Canonical
3,Food,Food - Meat & Game,Beef Deli & Charcuterie,Beef - Deli & Charcuterie,Beef Pastrami (NAMP # 611),PASTRAMI - NEW YORK STYLE- SABRA,1820,Golden Gate Meat Company,Canonical
4,Food,Food - Meat & Game,Beef Deli & Charcuterie,Beef - Deli & Charcuterie,Beef Pastrami (NAMP # 611),PASTRAMI SLICED 5# BAGS (50# CASE),1821,Golden Gate Meat Company,Canonical


In [6]:
# Separate dataset into 4 dataset based on L0_Category
# Preprocessing
# Droppig missing values
df = df.dropna(axis=0)

# Drop duplicates
df.drop_duplicates(keep='first', inplace=True)

In [7]:
df.L0_Category.unique()

array(['Food', 'Administrative', 'Operationals', 'Beverage'], dtype=object)

In [8]:
# Separate into 4 diffrent categories
df_food = df[df['L0_Category']=='Food']
df_administrative = df[df['L0_Category']=='Administrative']
df_operationals = df[df['L0_Category']=='Operationals']
df_beverage = df[df['L0_Category']=='Beverage']

In [9]:
print(df_food.L0_Category.unique())
print(df_administrative.L0_Category.unique())
print(df_operationals.L0_Category.unique())
print(df_beverage.L0_Category.unique())

['Food']
['Administrative']
['Operationals']
['Beverage']


In [10]:
# Save into four different csv
df_food.to_csv('../../Datasets/food.csv', header=True, index=False)
df_administrative.to_csv('../../Datasets/administrative.csv', header=True, index=False)
df_operationals.to_csv('../../Datasets/operationals.csv', header=True, index=False)
df_beverage.to_csv('../../Datasets/beverage.csv', header=True, index=False)

# **L4 Category**

In [11]:
# Preprocessing fuction for category
def class_pre(text):
  new_text = re.sub(r'&', 'and', text)
  new_text = re.sub(r'[^\w\s]',' ', new_text)
  new_text = re.sub(r' +', ' ', new_text)
  new_text = re.sub(r'\s', '_', new_text)
  return new_text

In [12]:
# Replace all white spaces whith single space
# Remove white spaces in begining and end of the text
# Turn text into lower case

# Preprocessing Function
def preprocess(text):
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'[^\w\s]',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [13]:
# Cleaning function currently using
# import nltk
# from nltk.corpus import stopwords

# # Download stopwords
# nltk.download("stopwords")

In [14]:
# Did a slight modification, convert & into and
# Then removes extra whitespaces

# def clean(string):
#     string = re.sub('&', 'and ', string)
#     raw_text = re.sub("[^a-zA-Z]+", " ", string)
#     raw_text = re.sub(' +', ' ', raw_text)
#     raw_text = raw_text.strip()
#     words = raw_text.lower().split()
#     stops = set(stopwords.words("english"))
#     meaningful_words = [
#         word for word in words if ((not word in stops) and (len(word) >= 3))
#     ]
#     string = " ".join(meaningful_words)
#     return string

## Operational

In [15]:
df = pd.read_csv('../../Datasets/operationals.csv')
df.head()

Unnamed: 0,L0_Category,L1_Class,L2_Category,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName,ItemCode,Vendor,ProdType
0,Operationals,Operationals - Office & Financial,Office Supplies,Packing - Shipping,Twine,COTTON TWINE,9656,Golden Gate Meat Company,Canonical
1,Operationals,Operationals - Production (BOH) Supplies,Boards - Liners - Sheets - Disposable,Food Wrapping Paper,Wax Paper - Bone Guard for Meat,"BONE GAURD WAX 9x150 FEET LONG""",9657,Golden Gate Meat Company,Canonical
2,Operationals,Operationals - Equipment Incl Appliances,Equipment Maintenance,Lubricants - Sealants - Other Chemicals,Lubricants,MINERAL OIL - GALLON,9663,Golden Gate Meat Company,Canonical
3,Operationals,Operationals - Guest & Staff,Staff Apparel,Apparel & Accessories - All Other,Vendor Apparel - Hats - Aprons - T-Shirts Etc,GOLDEN GATE APPAREL,9672,Golden Gate Meat Company,Canonical
4,Operationals,Operationals - Production (BOH) Supplies,Bags - Production - Disposable,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 8 X 10***1000/case,9680,Golden Gate Meat Company,Canonical


In [16]:
# Counting L4 Category classes
len(df.L4_dictionarySubgroup.unique())

1910

In [17]:
value_counts = df.L4_dictionarySubgroup.value_counts()
filtered_counts = value_counts[value_counts < 15]
print(filtered_counts)

L4_dictionarySubgroup
File - Horizontal Tray Type                                                    14
Clips - Bulldog                                                                14
Spoon Rests                                                                    14
Plugs or Stoppers for Non-Disposable Cup Lids                                  14
Donut Box                                                                      14
                                                                               ..
Paper Bags - Flat Bottom - No Handle - White 1/6 BBL                            1
File - Desktop Drawers                                                          1
Wax Paper - Bone Guard for Meat                                                 1
Paper Bags - Flat Bottom - No Handle - Natural/Brown/Kraft Size Unspecified     1
Labels - Subwrap                                                                1
Name: count, Length: 563, dtype: int64


In [18]:
# Check value counts in a plot
# Import libraries
# import matplotlib.pyplot as plt

# # Get the values count
# counts = df.L4_dictionarySubgroup.value_counts()


# # Change the figure size
# # fig = plt.figure(figsize=(10, 15))

# for index, value in enumerate(counts.values):
#     plt.text(value + 1, index, str(value))

# plt.barh(counts.index, counts.values)
# plt.show()

In [19]:
# Checking the number of data points
df.shape

(250086, 9)

In [20]:
# making the category
df['L4'] = df['L4_dictionarySubgroup'].apply(class_pre)

In [21]:
# Making the category using labels into format
df['category'] = '__label__' + df['L4'].astype(str)

In [22]:
# Dropping irrelavant columns
columns = ['L0_Category', 'L4_dictionarySubgroup', 'ItemCode', 'Vendor', 'ProdType', 'L4']

# Dropping
df = df.drop(columns, axis=1)

In [23]:
df.head()

Unnamed: 0,L1_Class,L2_Category,L3_Dictionary_SubCategory,ProdName,category
0,Operationals - Office & Financial,Office Supplies,Packing - Shipping,COTTON TWINE,__label__Twine
1,Operationals - Production (BOH) Supplies,Boards - Liners - Sheets - Disposable,Food Wrapping Paper,"BONE GAURD WAX 9x150 FEET LONG""",__label__Wax_Paper_Bone_Guard_for_Meat
2,Operationals - Equipment Incl Appliances,Equipment Maintenance,Lubricants - Sealants - Other Chemicals,MINERAL OIL - GALLON,__label__Lubricants
3,Operationals - Guest & Staff,Staff Apparel,Apparel & Accessories - All Other,GOLDEN GATE APPAREL,__label__Vendor_Apparel_Hats_Aprons_T_Shirts_Etc
4,Operationals - Production (BOH) Supplies,Bags - Production - Disposable,Production Bags,CRY-O-VAC BAGS 8 X 10***1000/case,__label__Vacuum_Seal_Bags


In [24]:
# Creating catehory description with producbt name and vendor
# df['category_description'] = df['category'] + ' ' + df['L1_Class'] + ' ' + df['L2_Category'] + ' ' + df['L3_Dictionary_SubCategory'] + ' ' + df['ProdName']
df['category_description'] = df['category'] + ' ' + df['L3_Dictionary_SubCategory'] + ' ' + df['ProdName']

In [25]:
# Drop used columns
columns = ['L1_Class', 'L2_Category', 'L3_Dictionary_SubCategory', 'ProdName']

df = df.drop(columns, axis=1)

In [26]:
# Apply preprocessing function to 'category_description'

df['category_description'] = df['category_description'].apply(preprocess)

In [40]:
# Test Train splitting
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.02, random_state=42)

In [41]:
# Save test and train dataset into two differnet files
train.to_csv('operationals_train.train',
             columns=["category_description"],
             index=False,
             header=False)

test.to_csv('operationals_test.train',
             columns=["category_description"],
             index=False,
             header=False)

In [55]:
# Fast text model
import fasttext

model = fasttext.train_supervised(input='operationals_train.train',
                                  dim=300,
                                  loss='hs',
                                  wordNgrams=2,
                                  lr=0.5,
                                  epoch=10,
                                  ws=15,
                                  minn=3,
                                  maxn=5)

model.test('operationals_test.train')

(5002, 0.9050379848060776, 0.9050379848060776)

In [56]:
text = 'Ladles - Spoons - Scoops SPOON REST white ceramic'
text = preprocess(text)
model.predict(text, k=5)
# Correct class - Shot machine

(('__label__spoon_rests',
  '__label__basting_spoon_solid_not_perforated_',
  '__label__spoons_non_disposable',
  '__label__basting_spoon_perforated_or_slotted',
  '__label__skimmers'),
 array([0.43388844, 0.34116268, 0.14173618, 0.02561265, 0.01563608]))

In [57]:
model.save_model('../../Models/L4/operational/tuned_model.bin')

Selecting datapoints with for prediction

In [32]:
sample = pd.read_csv('../../Datasets/operationals.csv')
sample.head()

Unnamed: 0,L0_Category,L1_Class,L2_Category,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName,ItemCode,Vendor,ProdType
0,Operationals,Operationals - Office & Financial,Office Supplies,Packing - Shipping,Twine,COTTON TWINE,9656,Golden Gate Meat Company,Canonical
1,Operationals,Operationals - Production (BOH) Supplies,Boards - Liners - Sheets - Disposable,Food Wrapping Paper,Wax Paper - Bone Guard for Meat,"BONE GAURD WAX 9x150 FEET LONG""",9657,Golden Gate Meat Company,Canonical
2,Operationals,Operationals - Equipment Incl Appliances,Equipment Maintenance,Lubricants - Sealants - Other Chemicals,Lubricants,MINERAL OIL - GALLON,9663,Golden Gate Meat Company,Canonical
3,Operationals,Operationals - Guest & Staff,Staff Apparel,Apparel & Accessories - All Other,Vendor Apparel - Hats - Aprons - T-Shirts Etc,GOLDEN GATE APPAREL,9672,Golden Gate Meat Company,Canonical
4,Operationals,Operationals - Production (BOH) Supplies,Bags - Production - Disposable,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 8 X 10***1000/case,9680,Golden Gate Meat Company,Canonical


In [33]:
drop_columns = ['L0_Category', 'L1_Class', 'L2_Category', 'ItemCode', 'Vendor', 'ProdType']
sample = sample.drop(drop_columns, axis=1)

In [34]:
sample.head(10)

Unnamed: 0,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName
0,Packing - Shipping,Twine,COTTON TWINE
1,Food Wrapping Paper,Wax Paper - Bone Guard for Meat,"BONE GAURD WAX 9x150 FEET LONG"""
2,Lubricants - Sealants - Other Chemicals,Lubricants,MINERAL OIL - GALLON
3,Apparel & Accessories - All Other,Vendor Apparel - Hats - Aprons - T-Shirts Etc,GOLDEN GATE APPAREL
4,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 8 X 10***1000/case
5,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 10 X 12*****1000/case
6,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 12 X 14****1000/case
7,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 14 X 16****500/case
8,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 14 X 20****500/case
9,Production Bags,Vacuum Seal Bags,CRY-O-VAC BAGS 16 X 24****500/case


In [35]:
uniques = sample.L4_dictionarySubgroup.unique()
for unique in uniques:
  print(unique)

Twine
Wax Paper - Bone Guard for Meat
Lubricants
Vendor Apparel - Hats - Aprons - T-Shirts Etc
Vacuum Seal Bags
Liner - Garbage Bags Size Undeclared
Corrugated Boxes (non Culinary Use)
Wine Glasses - Whites
Decanters - Wine
Martini Glasses
Coffee Filters
Wine Glasses - Reds
Champagne Flutes
Water Glasses - Stem
Cleaner - Ice and Beverage Equipment
Paper Filters
Whisky Glasses
Tea Chests - Boxes
Polishing Cloth
Cocktail Glasses
Floorstanding Upright Freezers
Wine Glasses - All Other
Countertop Display Freezers
Water Glasses
Coffee Equipment Chemical Cleaners
Shot Machine
Tableservice Espresso Cups and Saucers
Wine - Ice Buckets and Chillers
Beer Glasses
Cold Drink Cups - All Other
Lids Only - for Cups - Disposable
Hot Drink Print Cups
Shot Glasses
Old Fashioned Glasses
Hiball Glasses
Tumbler
Wine Bags
Beverage Dispensers
Mugs
Brandy Glasses
Cups Disposable
Tableservice Cappuccino Cups and Saucers
Coffee Equipment Parts
Demitasse Spoon
Wine Preservation Systems
Ashtrays
Decanters
Mixing 

In [44]:
sample_test = sample[sample['L4_dictionarySubgroup']=='Spoon Rests']
sample_test.head()

Unnamed: 0,L3_Dictionary_SubCategory,L4_dictionarySubgroup,ProdName
64978,Ladles - Spoons - Scoops,Spoon Rests,"REST, SPOON 8.75 S/S MIRROR FINISH"
85186,Ladles - Spoons - Scoops,Spoon Rests,"Spoon Rest, 9X4 Charcoal Ceramic China"
85284,Ladles - Spoons - Scoops,Spoon Rests,"SPOON REST, UTENSIL 3X5 S/S HEAVY-DUTY"
89536,Ladles - Spoons - Scoops,Spoon Rests,"SPOON REST, UTENSIL 9X3.75X1.5 S/S BRUSHED FINISH"
110332,Ladles - Spoons - Scoops,Spoon Rests,"Spoon Rest, 4 Kyoto Euro White Porcelain China"""


### **Lable preprocessing Function**

This will remove the unwanted parts from the predicted lable and return the lable in much readable manner

In [None]:
prediction = model.predict(text)
print(prediction[0][0])

In [None]:
# This function process the prediction by getting the input
def get_lable(text):
  # Processing the text
  text = preprocess(text)
  # Get the prediction
  prediction = model.predict(text)
  return prediction[0][0]

In [None]:
from_func = get_lable(text)
print(from_func)

In [None]:
# Function to preprocess the lable
def label_processing(label):
  label = re.sub('__label__', '', label)
  label = re.sub('_', ' ', label)
  label = re.sub(' +', ' ', label)
  return label.strip().lower()

In [None]:
print(label_processing(from_func))