# "Classifiez automatiquement des biens de consommation"
_Cleaning Notebook_

## 0 Preliminaries

Checking for PEP8 compliance

In [None]:
# %load_ext pycodestyle_magic
# %pycodestyle_on
# %pycodestyle_off

### 0.0 Importing Packages and Modules

Checking whether the notebook is on Colab or PC

In [22]:
import sys
is_colab = 'google.colab' in sys.modules
is_colab, sys.executable

(True, '/usr/bin/python3')

Mounting my Drive if on Colab

In [23]:
if is_colab==True:
    from google.colab import files, output, drive
    drive.mount('/gdrive')
    %cd /gdrive
    print("You're on Google Colab")
else:
    print("You're on a PC")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive
You're on Google Colab


Installations and importations required in the virtual environment.

In [None]:
# import os
# if os.getcwd()!='/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS':
#     os.chdir('/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS')

In [26]:
import os
if is_colab==True:
    if os.getcwd()!='/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS':
        os.chdir('/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS')
else:
    if not (os.path.exists(os.getcwd()+'/requirements_cleaning_eda.txt') \
                     and os.path.exists(os.getcwd()+'/P6_functions.py')):
        print("ERROR: Make sure 'P6_functions.py' and \
'requirements_cleaning_eda.txt' are in the current working directory")

!pip install -r requirements_cleaning_eda.txt
from P6_functions import *

Installations (creating the requirements file)

In [None]:
# !pip install gtts

In [None]:
# !pip freeze > requirements_cleaning_EDA.txt

Importation of modules and packages. 

In [27]:
import io

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.facecolor']='w'

# import warnings
# warnings.filterwarnings('ignore')

Setting pandas display options.

In [28]:
dictPdSettings = {'display.max_rows': 500, 'display.width': 100,
                    'display.max_colwidth': 100,
                    'display.float_format': lambda x: '%.2f' % x}
for k,v in dictPdSettings.items():
  pd.set_option(k,v)

To play audio text-to-speech during execution.

In [29]:
from IPython.display import Audio
from gtts import gTTS

def speak(text, lang='en'):
    with io.BytesIO() as f:
        gTTS(text=text, lang=lang).write_to_fp(f)
        f.seek(0)
        return Audio(f.read(), autoplay=True)

ModuleNotFoundError: ignored

In [None]:
speak('Packages and modules successfully imported')

### 0.1 Importing the datasets

Data is composed of 9 distinct .csv files we'll load in a dictionnary of dataframes.

In [186]:
if is_colab==True:
    # Importing database from my Drive
    print("Try to import data files in the notebook from myDrive...")
else:
    # Importing database from PC
    print("Try to import data files in the notebook from PC ('DATA')...")

df = pd.read_csv("../DATA/flipkart_com-ecommerce_sample_1050.csv",
                 sep=',', 
                 index_col = 'uniq_id',
                 encoding ='utf-8')

print("-----> Importation of .csv in the notebook: OK")

Try to import data files in the notebook from myDrive...
-----> Importation of .csv in the notebook: OK


In [None]:
speak('Datasets successfully imported')

### 0.2 First Overview

In [187]:
df.describe(include='all')

Unnamed: 0,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
count,1050,1050,1050,1050,1050,1049.0,1049.0,1050,1050,1050,1050,1050,712,1049
unique,149,1050,1050,642,1050,,,1050,2,1050,27,27,490,984
top,2015-12-01 12:40:44 +0000,http://www.flipkart.com/kandyfloss-baby-boy-s-girl-s-romper/p/itmehzabe9ymhfpf?pid=DRPEHZABCC2H27FM,Prithish Monster Kids Ceramic Mug,"[""Home Furnishing >> Bed Linen >> Blankets, Quilts & Dohars""]",STIEC889ZD5GDCVQ,,,75015382e469d36e397c5b47ea613314.jpg,False,"Sovam International Radha Krishna God Showpiece - 6 cm (Brass, White)\r\n ...",No rating available,No rating available,Lapguard,"{""product_specification""=>[{""key""=>""Type"", ""value""=>""Mug""}, {""key""=>""Mug Capacity"", ""value""=>""5...."
freq,150,1,1,56,1,,,1,993,1,889,889,11,22
mean,,,,,,2186.2,1584.53,,,,,,,
std,,,,,,7639.23,7475.1,,,,,,,
min,,,,,,35.0,35.0,,,,,,,
25%,,,,,,555.0,340.0,,,,,,,
50%,,,,,,999.0,600.0,,,,,,,
75%,,,,,,1999.0,1199.0,,,,,,,


Printing total nb and percentage of null:

In [188]:
display(print_null_pct(df))

nb of null:  341 
pct of null:  2.3


None

In [189]:
df.isna().sum()

crawl_timestamp              0
product_url                  0
product_name                 0
product_category_tree        0
pid                          0
retail_price                 1
discounted_price             1
image                        0
is_FK_Advantage_product      0
description                  0
product_rating               0
overall_rating               0
brand                      338
product_specifications       1
dtype: int64

Browsing the content

In [190]:
df[df.isna().any(1)].sample(1)

Unnamed: 0_level_0,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
uniq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
22ea3b69860bc9a5e0c69b87db2dcdab,2015-12-01 12:40:44 +0000,http://www.flipkart.com/prithish-i-m-mom-what-s-your-superpower-red-blue-bubbles-double-coloured...,Prithish I'm A Mom. What'S Your Superpower? Red with Blue Bubbles Double Coloured Ceramic Mug,"[""Kitchen & Dining >> Coffee Mugs >> Prithish Coffee Mugs""]",MUGE5PZ6NQHW7GZY,599.0,225.0,22ea3b69860bc9a5e0c69b87db2dcdab.jpg,False,Buy Prithish I'm A Mom. What'S Your Superpower? Red with Blue Bubbles Double Coloured Ceramic Mu...,5,5,,"{""product_specification""=>[{""key""=>""Type"", ""value""=>""Mug""}, {""key""=>""Mug Capacity"", ""value""=>""33..."


In [191]:
df['product_specifications'][0]

'{"product_specification"=>[{"key"=>"Brand", "value"=>"Elegance"}, {"key"=>"Designed For", "value"=>"Door"}, {"key"=>"Type", "value"=>"Eyelet"}, {"key"=>"Model Name", "value"=>"Abstract Polyester Door Curtain Set Of 2"}, {"key"=>"Model ID", "value"=>"Duster25"}, {"key"=>"Color", "value"=>"Multicolor"}, {"key"=>"Length", "value"=>"213 cm"}, {"key"=>"Number of Contents in Sales Package", "value"=>"Pack of 2"}, {"key"=>"Sales Package", "value"=>"2 Curtains"}, {"key"=>"Material", "value"=>"Polyester"}]}'

In [192]:
df['description']

uniq_id
55b85ea15a1536d46b7190ad6fff8ce7    Key Features of Elegance Polyester Multicolor Abstract Eyelet Door Curtain Floral Curtain,Elegan...
7b72c92c2f6c40268628ec5f14c6d590    Specifications of Sathiyas Cotton Bath Towel (3 Bath Towel, Red, Yellow, Blue) Bath Towel Featur...
64d5d4a258243731dc7bbb1eef49ad74    Key Features of Eurospa Cotton Terry Face Towel Set Size: small Height: 9 inch GSM: 360,Eurospa ...
d4684dcdc759dd9cdf41504698d737d8    Key Features of SANTOSH ROYAL FASHION Cotton Printed King sized Double Bedsheet Royal Bedsheet P...
6325b6870c54cd47be6ebfbffa620ec7    Key Features of Jaipur Print Cotton Floral King sized Double Bedsheet 100% cotton,Jaipur Print C...
                                                                                   ...                                                 
958f54f4c46b53c8a0a9b8167d9140bc    Oren Empower Extra Large Self Adhesive Sticker (Pack of 2)\r\n                         Price: Rs...
fd6cbcc22efb6b761bd564c28928483c    Wall

Unfolding categories using the 'product_category_tree' colum

In [194]:
# sample checking
df['product_category_tree'][743]

'["Kitchen & Dining >> Containers & Bottles >> Bottles & Sippers >> Water bottle >> Nutcase Water bottle"]'

In [195]:
# determining the maximum tree depth of categories
ser_depth = df['product_category_tree'].apply(lambda x: x.count('>>'))
max_depth = ser_depth.max()
max_depth

6

In [206]:
# Converting the strings in 'product_category_tree' column in 6 categ columns

def str_cleaning(ind, my_str, name_level_cols):
    my_str = my_str.replace("[\"", "").replace("\"]", "")
    tab_str = my_str.split(">>")
    size_tab_str = len(tab_str)
    tup_str = tuple([tab_str[i].strip() if i<size_tab_str else "" \
                     for i in np.arange(max_depth) ])
    return tup_str

name_level_cols = ['cat_level_'+str(i) for i in np.arange(max_depth)]
ser_tuple = df['product_category_tree']\
    .apply(lambda s: str_cleaning(s.index, s, name_level_cols))
df_cat_level = pd.DataFrame([[a,'/'.join([a,b]),'/'.join([a,b,c]),
                              '/'.join([a,b,c,d]),'/'.join([a,b,c,d,e]),
                              '/'.join([a,b,c,d,e,f])] \
                             for a,b,c,d,e,f in ser_tuple.values],
                            columns=name_level_cols, index=df.index)

In [207]:
# printing number of categories in each level and a sample
display(df_cat_level.nunique(), df_cat_level.sample(3))

cat_level_0      7
cat_level_1     62
cat_level_2    243
cat_level_3    460
cat_level_4    596
cat_level_5    633
dtype: int64

Unnamed: 0_level_0,cat_level_0,cat_level_1,cat_level_2,cat_level_3,cat_level_4,cat_level_5
uniq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
e3ae62ed831fd8b4c20e1742c30d35b5,Baby Care,Baby Care/Infant Wear,Baby Care/Infant Wear/Baby Girls' Clothes,Baby Care/Infant Wear/Baby Girls' Clothes/Dresses & Skirts,Baby Care/Infant Wear/Baby Girls' Clothes/Dresses & Skirts/Dresses,Baby Care/Infant Wear/Baby Girls' Clothes/Dresses & Skirts/Dresses/Wow! Dresses
7037dbd17682322c89bdf7203b403381,Watches,Watches/Wrist Watches,Watches/Wrist Watches/Gift Island Wrist Watches,Watches/Wrist Watches/Gift Island Wrist Watches/,Watches/Wrist Watches/Gift Island Wrist Watches//,Watches/Wrist Watches/Gift Island Wrist Watches///
54754ccd18f2a75c53de68806176392a,Computers,Computers/Network Components,Computers/Network Components/Routers,Computers/Network Components/Routers/Netis Routers,Computers/Network Components/Routers/Netis Routers/,Computers/Network Components/Routers/Netis Routers//


In [211]:
# extracting only useful data
df_desc_cat = pd.concat([df_cat_level, df[["product_name", "description"]]], axis=1)

### Exportation

Now we export the dataset of aggregated orders in a .csv file.

In [None]:
dfs['ord_it'].to_csv('agg_order_items.csv')