# "Classifiez automatiquement des biens de consommation"
_NLP Cleaning Notebook_

## 0 Preliminaries

### 0.0 Importing Packages and Modules

Checking whether the notebook is on Colab or PC

In [56]:
import sys
# True when the `google.colab` package is among the loaded modules,
# which is how this notebook tells a Colab kernel from a local one.
is_colab = 'google.colab' in sys.modules
# Display the flag together with the path of the running Python interpreter.
is_colab, sys.executable

(True, '/usr/bin/python3')

Mounting my Drive if on Colab

In [57]:
if is_colab==True:
    from google.colab import files, output, drive
    drive.mount('/gdrive')
    %cd /gdrive
    print("You're on Google Colab")
else:
    print("You're on a PC")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive
You're on Google Colab


Installations and importations required in the virtual environment.

In [58]:
import os

# Folder (on the mounted Drive) that holds the notebooks, 'P6_functions.py'
# and 'requirements_pict.txt'.
NOTEBOOK_DIR = '/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS'

if is_colab:
    # Only Colab has the /gdrive mount point; changing directory to it on a
    # PC would raise FileNotFoundError.
    if os.getcwd() != NOTEBOOK_DIR:
        os.chdir(NOTEBOOK_DIR)
elif not (os.path.exists('requirements_pict.txt')
          and os.path.exists('P6_functions.py')):
    # On a PC the notebook must be started from the folder containing the
    # files needed by the next cells.
    print("ERROR: Make sure 'P6_functions.py' and "
          "'requirements_pict.txt' are in the current working directory")

In [59]:
# import os
# if is_colab==True:
#     if os.getcwd()!='/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS':
#         os.chdir('/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS')
# else:
#     if not (os.path.exists(os.getcwd()+'/requirements_pict.txt') \
#                      and os.path.exists(os.getcwd()+'/P6_functions.py')):
#         print("ERROR: Make sure 'P6_functions.py' and \
# 'requirements_pict.txt' are in the current working directory")

In [60]:
!pip install -r requirements_pict.txt



In [61]:
from P6_functions import *

Installations (creating the requirements file)

In [62]:
# !pip install gtts
# !pip install wikipedia2vec==0.2.2

# !pip install opencv-python==3.4.2.17
# !pip install opencv-contrib-python==3.4.2.17

In [63]:
# !pip freeze > requirements_pict.txt

Importation of modules and packages. 

In [87]:
import io

import string

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.facecolor']='w'
import PIL

# import warnings
# warnings.filterwarnings('ignore')

Setting pandas display options.

In [65]:
# Notebook-wide pandas display options: up to 500 rows, 100-character wide
# output and cells, floats rendered with two decimals.
dictPdSettings = {
    'display.max_rows': 500,
    'display.width': 100,
    'display.max_colwidth': 100,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in dictPdSettings.items():
    pd.set_option(option_name, option_value)

To play audio text-to-speech during execution.

In [66]:
from IPython.display import Audio
from gtts import gTTS

def speak(text, lang='en'):
    """Turn `text` into speech with gTTS and wrap it in an auto-playing player.

    Parameters
    ----------
    text : str
        Sentence to read aloud.
    lang : str, default 'en'
        Language code forwarded to gTTS.

    Returns
    -------
    IPython.display.Audio
        Audio object built from the in-memory audio bytes, with autoplay on.
    """
    with io.BytesIO() as buffer:
        # gTTS writes the synthesized audio straight into the memory buffer.
        gTTS(text=text, lang=lang).write_to_fp(buffer)
        audio_bytes = buffer.getvalue()
    return Audio(audio_bytes, autoplay=True)

In [67]:
# Audible notification that the setup cells above have finished running.
speak('Packages and modules successfully imported')

### 0.1 Importing the datasets

The data consists of a single .csv file (`flipkart_com-ecommerce_sample_1050.csv`), which we load into a dataframe indexed by `uniq_id`.

In [68]:
# The message depends on the environment; the relative path to the data is
# the same on Colab (Drive) and on a PC.
import_message = (
    "Try to import data files in the notebook from myDrive..."
    if is_colab else
    "Try to import data files in the notebook from PC ('DATA')..."
)
print(import_message)

# Product table, indexed by the unique product identifier.
df = pd.read_csv(
    "../DATA/flipkart_com-ecommerce_sample_1050.csv",
    sep=',',
    index_col='uniq_id',
    encoding='utf-8',
)

print("-----> Importation of .csv in the notebook: OK")

Try to import data files in the notebook from myDrive...
-----> Importation of .csv in the notebook: OK


In [69]:
# speak('Datasets successfully imported')

### 0.2 Categories

In [70]:
# Converting the strings in 'product_category_tree' column in 6 categ columns

# Depth of every category tree, measured as its number of '>>' separators,
# and the largest such depth over the whole table.
# NOTE(review): a tree with n separators has n+1 levels, so `max_depth` is one
# less than the deepest level count -- confirm that dropping the last level
# downstream is intended.
ser_depth = df['product_category_tree'].map(lambda tree: tree.count('>>'))
max_depth = ser_depth.max()

def str_cleaning(ind, my_str, name_level_cols):
    """Split a raw category-tree string into a fixed-length tuple of levels.

    The '["' and '"]' wrappers are removed, the string is split on '>>' and
    every level is stripped of surrounding whitespace. The result always has
    `max_depth` items (module-level variable): missing levels are filled with
    empty strings and extra levels are dropped.

    Parameters
    ----------
    ind : object
        Not used by the function.
    my_str : str
        Raw category tree.
    name_level_cols : list
        Not used by the function.

    Returns
    -------
    tuple of str
        The `max_depth` level names.
    """
    parts = my_str.replace('["', "").replace('"]', "").split(">>")
    n_parts = len(parts)
    levels = []
    for position in range(max_depth):
        levels.append(parts[position].strip() if position < n_parts else "")
    return tuple(levels)

# One column per category level: 'cat_level_0', 'cat_level_1', ...
name_level_cols = [f'cat_level_{i}' for i in range(max_depth)]

# Tuple of level names for every product. The row label is handed to
# str_cleaning as `ind` (the previous `s.index` was evaluated on a str and
# therefore passed the bound `str.index` method, not the row index).
ser_tuple = pd.Series(
    [str_cleaning(ind, tree, name_level_cols)
     for ind, tree in df['product_category_tree'].items()],
    index=df.index, dtype=object)

# Column i holds the path from the root down to level i ('a', 'a/b',
# 'a/b/c', ...). The paths are built for any depth: the previous hard-coded
# six-way unpacking raised ValueError whenever max_depth was not 6.
df_cat_level = pd.DataFrame(
    [['/'.join(levels[:depth + 1]) for depth in range(len(levels))]
     for levels in ser_tuple.values],
    columns=name_level_cols, index=df.index)

In [71]:
# Working dataframe for the image study: the product texts plus a short label
# for the top-level category.
category_short_names = {
    'Home Furnishing': 'Furnishing',
    'Baby Care': 'Baby',
    'Watches': 'Watches',
    'Home Decor & Festive Needs': 'Decor',
    'Kitchen & Dining': 'Kitchen',
    'Beauty and Personal Care': 'Beauty',
    'Computers': 'Computers',
}

df_image = df[['product_name', 'description']].copy(deep=True)
df_image['category'] = df_cat_level['cat_level_0'].replace(category_short_names)

# Product identifiers, reused below as keys.
indexes = df_image.index

### 0.3 Images properties

In [72]:
from PIL import Image

## 1 Image pre-processing

### 1.0 Vectorization and resizing of images

Load each image, resize it to 224×224 and store its pixel values (HSV) in the dataframe.

In [172]:
# from PIL import Image, ImageOps

# # to silence the decompression bomb warning
# Image.MAX_IMAGE_PIXELS = 1000000000        
# import colorsys

# rgb_to_hsv = np.vectorize(colorsys.rgb_to_hsv)

# size = 224

# # df_temp = pd.DataFrame([])
# dict_images = {}
# li_cols = [ l+'_'+str(i) for l in ['H', 'S', 'V']\
#            for i in range(size*size)]

# for i, ind in enumerate(indexes):
#     img = np.array(Image.open("../DATA/Images/"+ind+".jpg"))
#     img = preproc_image(img, size=224, fill_col=(255,255,255),
#                         autocontrast = True, equalize=False,
#                         gauss_blur=3, interpolation=Image.ANTIALIAS)
#     dict_images[ind] = img
#     # img_ravel = np.stack(np.array(rgb_to_hsv(img[:,:,0],
#     #                                          img[:,:,1],
#     #                                          img[:,:,2])), 0).ravel()
#     # ser = pd.Series(img_ravel, index = li_cols, name=ind).to_frame()
#     # df_temp = pd.concat([df_temp, ser.T], axis=0)

# # df_image = pd.concat([df_image, df_temp], axis=1)

In [173]:
# import dill
# dill.dump(dict_images, open('dict_images.pkl', mode='wb'))

In [113]:
import dill
# Reload the cached objects instead of recomputing them; `dict_images.pkl` is
# the file written by the (commented-out) dump cell above.
# WARNING: dill/pickle files can run arbitrary code when loaded -- only load
# files you created yourself.
# The `with` blocks make sure the file handles are closed after loading.
with open('df_image.pkl', mode='rb') as pkl_file:
    df_image = dill.load(pkl_file)
with open('dict_images.pkl', mode='rb') as pkl_file:
    dict_images = dill.load(pkl_file)

Visualisation séparée des trois canaux (Hue, Saturation, Value)

In [78]:
# fig = plt.figure(figsize=(10,4))
# li_n = []
# for im, color, title, i in zip([img_h, img_s, img_v],
#                                 ['red', 'blue', 'black'],
#                                 ['Hue', 'Saturation', 'Value'],
#                                 range(1,4)):
#     ax1 = fig.add_subplot(2,3,i)
#     ax1.imshow(im, cmap='Greys')
#     ax1.set(xlim=(0,255))
#     ax1.set_title(title, fontweight='bold')
#     ax2 = fig.add_subplot(2,3,i+3)
#     n, bins, patches = ax2.hist(im.flatten(), color=color, bins=range(256))
#     li_n.append(n)
# plt.tight_layout(rect=[0,0,1,0.92])
# plt.show()

# Creation of a bag of visual words

In [161]:
import numpy as np
import cv2
import os
from scipy import ndimage
from scipy.spatial import distance
from sklearn.cluster import KMeans

### Extracts local features from images using SIFT

The function below returns three objects: a dictionary holding the keypoints of each image, a dictionary holding the SIFT descriptors of each image, and a flat list gathering the local features (descriptors) of all images in no particular order. This flat list is our visual dictionary.

In [234]:
# Creates descriptors using SIFT.
# Takes one parameter: the dictionary of images.
# Returns three objects: the dictionary of keypoints per image, the
# dictionary of SIFT descriptors per image, and the flat, unordered list
# of all the descriptors.

def sift_features(images):
    """Detect SIFT keypoints and compute their descriptors for every image.

    Parameters
    ----------
    images : dict
        Maps an image key to the image handed to SIFT's `detectAndCompute`.

    Returns
    -------
    dict_keypoints : dict
        Keypoints detected in each image, keyed like `images`.
    dict_sift_descriptors : dict
        Descriptor array of each image, keyed like `images`. An image in
        which no keypoint is found gets an empty array with zero rows.
    all_descriptors_list : list
        Descriptors of all the images gathered in one flat list, one item
        per descriptor.
    """
    try:
        # Location of SIFT in the opencv-contrib builds this project pins.
        sift = cv2.xfeatures2d.SIFT_create()
    except AttributeError:
        # OpenCV builds without the contrib `xfeatures2d` module expose SIFT
        # in the main namespace.
        sift = cv2.SIFT_create()

    dict_sift_descriptors, dict_keypoints = {}, {}
    all_descriptors_list = []
    for key, img in images.items():
        kp, des = sift.detectAndCompute(img, None)
        if des is None:
            # detectAndCompute yields None rather than an empty array when it
            # finds no keypoint; `extend(None)` would raise TypeError.
            des = np.empty((0, sift.descriptorSize()), dtype=np.float32)
        all_descriptors_list.extend(des)
        dict_sift_descriptors[key] = des
        dict_keypoints[key] = kp
    return dict_keypoints, dict_sift_descriptors, all_descriptors_list

# def get_descriptors(image_path):
#     # load image and convert it to grayscale
#     img_gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
#     # detect key points and descriptors
#     keypoints, descriptors = model.detectAndCompute(img_gray, None)
#     # returns keypoints and descriptors
#     return keypoints, descriptors

In [235]:
# # Store descriptors and bovw features 
# # keypoints_list, descriptor_list, all_bovw_feature = sift_features(dict_images) 
# dict_keypoints, dict_sift_descriptors, all_descriptors_list = \
#                                              sift_features(dict_images) 

In [237]:
# import dill
# dill.dump((dict_sift_descriptors, all_descriptors_list),
#           open("descriptors.pkl", "wb"))

In [240]:
import dill
# Reload the per-image descriptors and the flat list of all descriptors from
# the cache written by the (commented-out) dump cell above.
# WARNING: dill/pickle files can run arbitrary code when loaded -- only load
# files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open("descriptors.pkl", "rb") as pkl_file:
    dict_sift_descriptors, all_descriptors_list = dill.load(pkl_file)

In [241]:
# print("list of the keypoints", len(keypoints_list))
# Size of the flat descriptor list and number of images having an entry in
# the per-image descriptor dictionary.
n_descriptors = len(all_descriptors_list)
n_described_images = len(dict_sift_descriptors)
print("list of all the descriptors: ", n_descriptors)
print("dictionnary giving all the descriptors (vectors) for each image: ",
      n_described_images)

list of the keypoints 230755
list of all the descriptors:  230755
dictionnary giving all the descriptors (vectors) for each image:  1050


### Selects the visual words

Send the visual dictionary to the k-means clustering algorithm and find the visual words which are center points.

In [224]:
# A k-means clustering helper that takes the number of clusters (k) and
# the list of descriptors (one vector per descriptor).
# Returns the array of cluster centers together with the fitted clusterer.

from sklearn.cluster import KMeans

def descriptors_cluster_centers(k, descriptors_list, random_state=None):
    """Cluster the descriptors with k-means and return the cluster centers.

    Parameters
    ----------
    k : int
        Number of clusters, i.e. number of visual words.
    descriptors_list : sequence
        Descriptors to cluster, passed as-is to `KMeans.fit`.
    random_state : int or None, default None
        Seed forwarded to `KMeans`. Leave it to None to keep the previous
        (unseeded) behaviour, or set it to make the visual words reproducible
        from one run to the next.

    Returns
    -------
    visual_words : array
        The `cluster_centers_` of the fitted model.
    clusterer : KMeans
        The fitted clustering model.
    """
    clusterer = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    clusterer.fit(descriptors_list)
    return clusterer.cluster_centers_, clusterer

In [225]:
# # Takes the central points which is visual words   

# visual_words, desc_clusterer = descriptors_cluster_centers(150,
#                                                            all_descriptors_list)
# visual_words.shape

(150, 128)

In [233]:
# import dill
# dill.dump((visual_words, desc_clusterer), open("visual_words.pkl", "wb"))

In [220]:
import dill
# Reload the visual words and the fitted clusterer from the cache written by
# the (commented-out) dump cell above.
# WARNING: dill/pickle files can run arbitrary code when loaded -- only load
# files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open("visual_words.pkl", "rb") as pkl_file:
    visual_words, desc_clusterer = dill.load(pkl_file)

In [None]:
# Plot of the 150 selected visual words relative to the
# 230,755 descriptors found (all_descriptors_list)
# NOTE(review): the call passes `df` (the product table) and this cell has no
# execution count -- confirm the intended inputs (presumably the descriptors
# and the fitted clusterer) before running it.
plot_projection(df, model=None, ser_clust = None, proj='PCA', title=None,
                    figsize=(5, 3), size=1, palette='tab10',
                    legend_on=False, fig=None, ax=None, random_state=14)

In [226]:
# A visual word is a 128-component vector (8 directions * 16 sub-images);
# the shape below is (number of visual words, number of components).
visual_words.shape

(150, 128)

In [229]:
# Descriptors of the image at position 14 of `indexes` (one row per
# descriptor). The per-image descriptors live in `dict_sift_descriptors`;
# the name `sift_vectors` is not defined anywhere in this notebook.
dict_sift_descriptors[indexes[14]].shape

(37, 128)

In [232]:
# Nearest visual word (k-means cluster index) for each descriptor of the
# image at position 14 of `indexes`. The per-image descriptors live in
# `dict_sift_descriptors`; the name `sift_vectors` is not defined anywhere
# in this notebook.
desc_clusterer.predict(dict_sift_descriptors[indexes[14]]).shape

(37,)

# Exportation

Now we export the dataset of aggregated orders in a .csv file.