# Imports

In [514]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math

import os

# Dataset

## Training Data

In [517]:
init_training_data = pd.read_csv('data/training/data_train.csv', sep=',', names=['country', 'sku_id', 'title', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'short_description', 'price', 'product_type'])
clarity_training_data = pd.read_csv('data/training/clarity_train.csv', sep=',', names=['clarity',''], usecols=['clarity'])
conciseness_training_data = pd.read_csv('data/training/conciseness_train.csv', sep=',', names=['conciseness',''], usecols=['conciseness'])

# Separately storing the data
training_data = init_training_data.copy()
training_data['clarity'] = clarity_training_data['clarity']
training_data['conciseness'] = conciseness_training_data['conciseness']

## Validation Data

In [518]:
validation_data = pd.read_csv('data/validation/data_valid.csv', sep=',', names = ['country', 'sku_id', 'title', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'short_description', 'price', 'product_type'])
clarity_validation_data = pd.read_csv('data/validation/clarity_valid.csv', sep=',', names=['clarity',''], usecols=['clarity'])
conciseness_validation_data = pd.read_csv('data/validation/conciseness_valid.csv', sep=',', names=['conciseness',''], usecols=['conciseness'])

## Testing Data

In [519]:
testing_data = pd.read_csv('data/testing/data_test.csv', sep=',', names = ['country', 'sku_id', 'title', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'short_description', 'price', 'product_type'])
clarity_testing_data = pd.read_csv('data/testing/clarity_test.csv', sep=',', names=['clarity',''], usecols=['clarity'])
conciseness_testing_data = pd.read_csv('data/testing/conciseness_test.csv', sep=',', names=['conciseness',''], usecols=['conciseness'])

There are a lot of fields which depend on the string values. These all have to be preprocessed accordingly and made into newer features before we move on to exploring what these features contain. Accordingly, finding the correlation of these features also does not give anything of value.

In [520]:
training_data.corr()

Unnamed: 0,price,clarity,conciseness
price,1.0,0.003322,0.009616
clarity,0.003322,1.0,0.361611
conciseness,0.009616,0.361611,1.0


# Preprocessing

## Missing Values - Imputation

### Column - Category_Lvl_3

In [521]:
def printColumnWithNA(data):
    for column in data.columns:
        print(column, data[column].isnull().sum())

In [522]:
printColumnWithNA(training_data)

country 0
sku_id 0
title 0
category_lvl_1 0
category_lvl_2 0
category_lvl_3 2135
short_description 33
price 0
product_type 277
clarity 0
conciseness 0


In [523]:
for idx in range(0 , len(training_data)):
    training_data.loc[idx, 'category_lvl_3'] = training_data.loc[idx]['category_lvl_1'] + " " + training_data.loc[idx]['category_lvl_2']

In [524]:
training_data[4:30]

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type,clarity,conciseness
4,my,AR511HBAXNWAANMY,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Health & Beauty Men's Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,114.8,international,1,1
5,my,AS575ELCMZ4WANMY,Asus TP300LJ-DW004H Transformer Book Flip 4GB ...,Computers & Laptops,Laptops,Computers & Laptops Laptops,"<div class=""prod_content""> <div class=""prod_de...",2599.0,local,1,1
6,my,AS727ELAA9LLV1ANMY,NG-40C Ring-Shaped 40W 3166lm 5400K Macro Phot...,Cameras,Camera Accessories,Cameras Camera Accessories,<ul> <li>1. Color Temperature: 5400K</li> <li>...,388.99,international,1,1
7,my,BU512HBAA4WUVTANMY,Buytra Exfoliating Peel Foot Mask 1Pair,Health & Beauty,Bath & Body,Health & Beauty Bath & Body,<ul> <li>Reviving like a new born baby.</li> <...,10.4,international,1,1
8,my,CL787ELAW29LANMY,CLiPtec OCC121 Slim Flat USB 3.0 Extension Cab...,Computers & Laptops,Laptops,Computers & Laptops Laptops,"<ul style= ""padding: 0px; margin: 20px 0px 0px...",29.0,local,1,1
9,my,CO633HLAABREKOANMY,McDonald's Coke Can Glass Limited Edition 12oz...,Home & Living,Kitchen & Dining,Home & Living Kitchen & Dining,<ul> <li>Genuine issued McDonald's Coca Cola m...,25.0,local,1,1
10,my,EL802HLAA51ZZVANMY,ELENXS Stainless Steel Tea Ball Strainer Mesh ...,Home & Living,Kitchen & Dining,Home & Living Kitchen & Dining,<ul> <li>Stainless Steel Filter</li> <li>Stain...,9.48,international,1,1
11,my,EM688OTAA9H8S8ANMY,7mm Natural Prehnite Crystal Bracelet(Green),Watches Sunglasses Jewellery,Jewellery,Watches Sunglasses Jewellery Jewellery,<ul> <li>Material: Genuine Prehnite Crystal Be...,78.0,local,1,1
12,my,FE090OTAAAPW6VANMY,Feelontop Punk Rock Rhinestone Star Shape Long...,Watches Sunglasses Jewellery,Jewellery,Watches Sunglasses Jewellery Jewellery,<ul> <li>Lead and Nickle Free</li> <li>Good Qu...,15.55,international,1,1
13,my,FI087ELAA7Z6D7ANMY,"Fitbit Charge Wireless Activity Wristband, Blu...","TV, Audio / Video, Gaming & Wearables",Wearable Technology,"TV, Audio / Video, Gaming & Wearables Wearable...",<ul> <li>Accurately track all-day stats like s...,499.0,international,1,1


In [525]:
printColumnWithNA(training_data)

country 0
sku_id 0
title 0
category_lvl_1 0
category_lvl_2 0
category_lvl_3 0
short_description 33
price 0
product_type 277
clarity 0
conciseness 0


### Column - Short Description

In [526]:
training_data[training_data['short_description'].isna()][:5]

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type,clarity,conciseness
924,my,BG674ELALY2IANMY,EGO-LT-6 Aluminum Alloy Electronic Cigarette w...,"TV, Audio / Video, Gaming & Wearables",Gadgets,"TV, Audio / Video, Gaming & Wearables Gadgets",,94.0,international,1,1
1381,ph,AS595ELAA1FGGPANPH,"$Asus Fonepad 7"" Android 4.1 WiFi/3G ME371MG-7...",Mobiles & Tablets,Tablets,Mobiles & Tablets Tablets,,25700.0,,1,1
2797,ph,BE718HLAX94DANPH,"Klestar Pinoy Fan 10"" stand fan KSF-10 (blue)#",Home Appliances,Cooling & Heating,Home Appliances Cooling & Heating,,1400.0,,1,1
4463,ph,OR212HBABMTYANPH,Ogx Renewing Moroccan Argan Oil 3-piece Starte...,Health & Beauty,Hair Care,Health & Beauty Hair Care,,699.0,,1,1
5349,ph,OL364HBAA3LYPLANPH,Olay Regenerist Micro-Sculpting Cream Moisturi...,Health & Beauty,Skin Care,Health & Beauty Skin Care,,999.0,,1,1


In [527]:
# How to check for NaN
np.isnan(training_data.loc[924, 'short_description'])

True

In [528]:
training_data['new_description'] = training_data['short_description']

for idx in range(0, len(training_data)):
    entry = training_data.loc[idx, 'short_description']

    if not type(entry) is float:

        if '<ul>' not in entry and '<li>' not in entry:
            training_data.loc[idx, 'new_description'] = entry
        else:
            soup = BeautifulSoup(entry)
            
            soup_li = soup.find_all('li')
            new_desc = ''
            for soup_len in range(0, len(soup_li)):
                new_desc += "\n" + soup_li[soup_len].get_text()

            training_data.loc[idx, 'new_description'] = new_desc

In [529]:
training_data.iloc[50:70]

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type,clarity,conciseness,new_description
50,my,NO037HLAA6KY20ANMY,Creative Decoration 50*70cm Prints &amp; Poste...,Home & Living,Home Décor,Home & Living Home Décor,<ul> <li>Fine Workmanship</li> <li>Quality Fab...,122.0,international,1,1,\nFine Workmanship\nQuality Fabrics\nExquisite...
51,my,NO037OTAA3BK6UANMY,HKS Jin Yuji Austria Crystal Bracelet (Purple),Watches Sunglasses Jewellery,Jewellery,Watches Sunglasses Jewellery Jewellery,Color : PurpleJewelry Type : BraceletMaterial ...,83.0,international,1,1,Color : PurpleJewelry Type : BraceletMaterial ...
52,my,NO037OTAA4WU6OANMY,Wholesale Nickle Free Antiallergic 18K Real Go...,Watches Sunglasses Jewellery,Jewellery,Watches Sunglasses Jewellery Jewellery,<ul> <li>Model Number:N922-A N922-B N922-C N92...,31.4,international,1,0,\nModel Number:N922-A N922-B N922-C N922-D\nMa...
53,my,NO177HBAAAL7V7ANMY,NoTS Begin 28 Organic Daily Moisturizer,Health & Beauty,Skin Care,Health & Beauty Skin Care,<ul> <li>Skin Care</li> <li>Skin Essence</li> ...,135.0,international,1,1,\nSkin Care\nSkin Essence\nMoisture effect
54,my,NO990ELAABUH3GANMY,Ultrathin i6 Cover For iphone 6 Transparent So...,Mobiles & Tablets,Accessories,Mobiles & Tablets Accessories,CompatibleiPhoneModel: iPhone6 / CompatibleBra...,22.0,international,1,0,CompatibleiPhoneModel: iPhone6 / CompatibleBra...
55,my,OE702ELAA5E5OGANMY,High Speed 4 Port USB 2.0 Multi HUB Splitter E...,Computers & Laptops,Computer Accessories,Computers & Laptops Computer Accessories,<ul> <li>Country/Region of Manufacture:China</...,13.7,international,1,1,\nCountry/Region of Manufacture:China\nConnect...
56,my,OE702ELAA7KBC2ANMY,2 Units Compatible Laser Toner Brother TN-2150...,Computers & Laptops,Printers & Accessories,Computers & Laptops Printers & Accessories,"<div> <ol style=""list-style-type: decimal;""> <...",71.0,local,1,0,"<div> <ol style=""list-style-type: decimal;""> <..."
57,my,OE702ELAA7LP9KANMY,BYT KST Pattern Solid Color Tablet Leather Fli...,Mobiles & Tablets,Accessories,Mobiles & Tablets Accessories,<ul> <li>Ultra Slim Design and Stylish</li> <l...,31.7,international,1,1,\nUltra Slim Design and Stylish\nKST Pattern P...
58,my,OE702ELAA92006ANMY,Laptop Battery For HP Envy 4-1245TU SLEEKBOOK,Computers & Laptops,Computer Accessories,Computers & Laptops Computer Accessories,<ul> <li>Brand New Replacement Product</li> <l...,300.0,local,1,1,\nBrand New Replacement Product\nWorks as genu...
59,my,OE702ELAA9TA8GANMY,iPad 2/3/4 Kiddie Case - AOOBCC EVA Shock Proo...,Mobiles & Tablets,Accessories,Mobiles & Tablets Accessories,"<div> <ul> <li>Designed for Apple iPad 2,3,4</...",55.92,international,1,0,"\nDesigned for Apple iPad 2,3,4\nSuper Light W..."


In [530]:
printColumnWithNA(training_data)

country 0
sku_id 0
title 0
category_lvl_1 0
category_lvl_2 0
category_lvl_3 0
short_description 33
price 0
product_type 277
clarity 0
conciseness 0
new_description 33


In [531]:
for idx in range(0, len(training_data)):

    entry = training_data.loc[idx, 'short_description']

    if type(entry) is float:
        training_data.loc[idx, 'new_description'] = training_data.loc[idx, 'category_lvl_3']

In [532]:
printColumnWithNA(training_data)

country 0
sku_id 0
title 0
category_lvl_1 0
category_lvl_2 0
category_lvl_3 0
short_description 33
price 0
product_type 277
clarity 0
conciseness 0
new_description 0


In [533]:
del training_data['short_description']

### Column - Product Type

In [534]:
international_perc = list(training_data['product_type'].value_counts())[0] / len(training_data)
local_perc = list(training_data['product_type'].value_counts())[1] / len(training_data)

international_count = 0
local_count = 0

international_floor_count = math.floor(international_perc * 277)
local_floor_count = math.floor(local_perc * 277)

for idx in range(0, len(training_data)):

    entry = training_data.loc[idx, 'product_type']

    if type(entry) is float:

        if international_count < international_floor_count:

            training_data.loc[idx, 'product_type'] = 'international'
            international_count += 1

        elif local_count < local_floor_count:

            training_data.loc[idx, 'product_type'] = 'local'
            local_count +=1

In [535]:
printColumnWithNA(training_data)

country 0
sku_id 0
title 0
category_lvl_1 0
category_lvl_2 0
category_lvl_3 0
price 0
product_type 3
clarity 0
conciseness 0
new_description 0


### Final Preprocessing Results

In [536]:
training_data.columns

Index(['country', 'sku_id', 'title', 'category_lvl_1', 'category_lvl_2',
       'category_lvl_3', 'price', 'product_type', 'clarity', 'conciseness',
       'new_description'],
      dtype='object')

In [537]:
column_names = ['sku_id', 'country', 'title', 'new_description', 'price', 'product_type', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'clarity', 'conciseness']

training_data = training_data.reindex(columns=column_names)

In [538]:
training_data.head()

Unnamed: 0,sku_id,country,title,new_description,price,product_type,category_lvl_1,category_lvl_2,category_lvl_3,clarity,conciseness
0,AD674FAASTLXANMY,my,Adana Gallery Suri Square Hijab – Light Pink,\nMaterial : Non sheer shimmer chiffon\nSizes ...,49.0,local,Fashion,Women,Fashion Women,1,1
1,AE068HBAA3RPRDANMY,my,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Formulated with oil-free hydrating botanicals/...,128.0,international,Health & Beauty,Bath & Body,Health & Beauty Bath & Body,1,1
2,AN680ELAA9VN57ANMY,my,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"\n150cm mini microphone compatible for iPhone,...",25.07,international,"TV, Audio / Video, Gaming & Wearables",Audio,"TV, Audio / Video, Gaming & Wearables Audio",1,0
3,AN957HBAAAHDF4ANMY,my,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,\nANMYNA Complaint Silky Set (Shampoo 520ml + ...,118.0,local,Health & Beauty,Hair Care,Health & Beauty Hair Care,1,1
4,AR511HBAXNWAANMY,my,Argital Argiltubo Green Clay For Face and Body...,\n100% Authentic\nRrefresh and brighten skin\n...,114.8,international,Health & Beauty,Men's Care,Health & Beauty Men's Care,1,1


In [539]:
printColumnWithNA(training_data)

sku_id 0
country 0
title 0
new_description 0
price 0
product_type 3
category_lvl_1 0
category_lvl_2 0
category_lvl_3 0
clarity 0
conciseness 0


## Label Encoding

In [540]:
le = LabelEncoder()

for column in ['sku_id', 'country', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'product_type']:
    le.fit(training_data[column])
    training_data[column + '_encoded'] = le.transform(training_data[column])


In [542]:
column_names = ['sku_id', 'country', 'title', 'new_description', 'price', 'product_type', 'category_lvl_1', 'category_lvl_2', 'category_lvl_3', 'sku_id_encoded', 'country_encoded', 'category_lvl_1_encoded', 'category_lvl_2_encoded', 'category_lvl_3_encoded', 'product_type_encoded', 'clarity', 'conciseness']

training_data = training_data.reindex(columns=column_names)

In [543]:
training_data

Unnamed: 0,sku_id,country,title,new_description,price,product_type,category_lvl_1,category_lvl_2,category_lvl_3,sku_id_encoded,country_encoded,category_lvl_1_encoded,category_lvl_2_encoded,category_lvl_3_encoded,product_type_encoded,clarity,conciseness
0,AD674FAASTLXANMY,my,Adana Gallery Suri Square Hijab – Light Pink,\nMaterial : Non sheer shimmer chiffon\nSizes ...,49.00,local,Fashion,Women,Fashion Women,338,0,2,56,18,1,1,1
1,AE068HBAA3RPRDANMY,my,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Formulated with oil-free hydrating botanicals/...,128.00,international,Health & Beauty,Bath & Body,Health & Beauty Bath & Body,350,0,3,3,19,0,1,1
2,AN680ELAA9VN57ANMY,my,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"\n150cm mini microphone compatible for iPhone,...",25.07,international,"TV, Audio / Video, Gaming & Wearables",Audio,"TV, Audio / Video, Gaming & Wearables Audio",661,0,7,1,47,0,1,0
3,AN957HBAAAHDF4ANMY,my,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,\nANMYNA Complaint Silky Set (Shampoo 520ml + ...,118.00,local,Health & Beauty,Hair Care,Health & Beauty Hair Care,684,0,3,23,23,1,1,1
4,AR511HBAXNWAANMY,my,Argital Argiltubo Green Clay For Face and Body...,\n100% Authentic\nRrefresh and brighten skin\n...,114.80,international,Health & Beauty,Men's Care,Health & Beauty Men's Care,883,0,3,36,26,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36278,SA584ELAA4G4W0SGAMZ,sg,SADES K10 LED Backlit Wired USB Mechanical Gam...,\nNo driver needed.Blue Switches is the core o...,67.09,international,Computers & Laptops,Computer Accessories,Computers & Laptops Computer Accessories,30801,2,1,8,8,0,1,1
36279,SO499HAAA4CCLASGAMZ,sg,SONA 20L Electric Oven SEO 2220,\n 2 Years Warranty\nThermal Fuse Safety Prote...,69.00,local,Home Appliances,Large Appliances,Home Appliances Large Appliances,31916,2,5,30,42,1,1,1
36280,TI990ELAA5ZV1JSGAMZ,sg,OP1001 Portable Wireless Bluetooth 2.1 Speaker...,"\nWith colorful lights on the button, light up...",35.00,international,Computers & Laptops,Computer Accessories,Computers & Laptops Computer Accessories,32710,2,1,8,8,0,1,0
36281,WO203HLAA3KFPCSGAMZ,sg,Woot-Woot TicTacToe Pillow Case (White),\n100% Cotton\nSafe for Sensitive Skin\nCool a...,23.00,local,Home & Living,Bedding,Home & Living Bedding,35365,2,4,5,30,1,1,1


# Data Exploration 

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type,clarity,conciseness
0,my,AD674FAASTLXANMY,Adana Gallery Suri Square Hijab – Light Pink,Fashion,Women,Muslim Wear,<ul><li>Material : Non sheer shimmer chiffon</...,49.00,local,1,1
1,my,AE068HBAA3RPRDANMY,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Health & Beauty,Bath & Body,Hand & Foot Care,Formulated with oil-free hydrating botanicals/...,128.00,international,1,1
2,my,AN680ELAA9VN57ANMY,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,<ul> <li>150cm mini microphone compatible for ...,25.07,international,1,0
3,my,AN957HBAAAHDF4ANMY,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,Health & Beauty,Hair Care,Shampoos & Conditioners,<ul> <li>ANMYNA Complaint Silky Set (Shampoo 5...,118.00,local,1,1
4,my,AR511HBAXNWAANMY,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Body and Skin Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,114.80,international,1,1
...,...,...,...,...,...,...,...,...,...,...,...
36278,sg,SA584ELAA4G4W0SGAMZ,SADES K10 LED Backlit Wired USB Mechanical Gam...,Computers & Laptops,Computer Accessories,Keyboards,<ul> <li>No driver needed.Blue Switches is the...,67.09,international,1,1
36279,sg,SO499HAAA4CCLASGAMZ,SONA 20L Electric Oven SEO 2220,Home Appliances,Large Appliances,Microwaves & Ovens,<ul> <li>&nbsp;2 Years Warranty<br></li> <li>T...,69.00,local,1,1
36280,sg,TI990ELAA5ZV1JSGAMZ,OP1001 Portable Wireless Bluetooth 2.1 Speaker...,Computers & Laptops,Computer Accessories,Speakers,"<ul> <li>With colorful lights on the button, l...",35.00,international,1,0
36281,sg,WO203HLAA3KFPCSGAMZ,Woot-Woot TicTacToe Pillow Case (White),Home & Living,Bedding,Pillows & Bolsters,<ul> <li>100% Cotton</li> <li>Safe for Sensiti...,23.00,local,1,1


In [54]:
# All column names
print(training_data.columns)

# Country uniqueness
print(training_data.country.unique()) # [ 'my', 'ph', 'sg' ], 3

# Title uniqueness
print(len(training_data.title.unique())) # 36283, all unique

# Cateogry # 1
print(len(training_data.category_lvl_1.unique())) # 9

# Category # 2
print(len(training_data.category_lvl_2.unique())) # 9

# Category # 3
print(len(training_data.category_lvl_3.unique())) # 9

# Product Types
print(training_data.product_type.unique()) # 3, [ 'local', 'international', nan ] 


Index(['country', 'sku_id', 'title', 'category_lvl_1', 'category_lvl_2',
       'category_lvl_3', 'short_description', 'price', 'product_type'],
      dtype='object')
['my' 'ph' 'sg']
36283
9
57
185
['local' 'international' nan]


# Feature Engineering 

# Modelling 