# "Classifiez automatiquement des biens de consommation"
_Image Pre-processing and Bag-of-Visual-Words Notebook_

## 0 Preliminaries

### 0.0 Importing Packages and Modules

In [None]:
# !apt install sudo
# !sudo apt-get update
# !sudo apt-get upgrade
# !sudo apt update
# !apt list --upgradable
# !sudo apt upgrade
# !apt list --upgradable
# !sudo apt upgrade
# %cd /content
# !sudo apt remove cmake
# !sudo apt purge --auto-remove cmake
# !mkdir ~/temp
# %cd ~/temp
# !wget https://cmake.org/files/v3.12/cmake-3.12.3-Linux-x86_64.sh
# !sudo mkdir /opt/cmake
# !sudo sh cmake-3.12.3-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
# !sudo ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
# %cd /content
# !sudo apt-get install libopenblas-dev liblapack-dev 
# !sudo apt-get install libx11-dev libgtk-3-dev
# !sudo apt-get install libboost-all-dev

Checking whether the notebook is on Colab or PC

In [None]:
import sys

# The `google.colab` package is only loaded inside a Colab runtime, so its
# presence among the already-imported modules tells us where we are running.
is_colab = 'google.colab' in sys.modules

# Show the flag together with the interpreter path as the cell output.
(is_colab, sys.executable)

(True, '/usr/bin/python3')

Mounting my Drive if on Colab

In [None]:
if is_colab==True:
    from google.colab import files, output, drive
    drive.mount('/gdrive')
    %cd /gdrive
    print("You're on Google Colab")
else:
    print("You're on a PC")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive
You're on Google Colab


Installations and importations required in the virtual environment.

In [None]:
import os

# Folder of this project on the mounted Google Drive.
NOTEBOOK_DIR = '/gdrive/My Drive/--DATA SCIENCE/PROJET6/NOTEBOOKS'

# Only change directory when the Drive folder really exists: on a PC (or when
# the Drive is not mounted) an unconditional chdir raises FileNotFoundError,
# and the relative '../DATA' paths are then resolved from the current folder.
if os.path.isdir(NOTEBOOK_DIR) and os.getcwd() != NOTEBOOK_DIR:
    os.chdir(NOTEBOOK_DIR)

In [None]:
# !pip install -r requirements_pict.txt

In [None]:
from P6_functions import *

Installations (creating the requirements file)

In [None]:
# Text-to-speech package; in this notebook it is only used by the optional
# (commented-out) `speak` helper.
!pip install gtts
# NOTE(review): wikipedia2vec is not referenced in the visible part of this
# notebook - confirm it is still required.
!pip install wikipedia2vec==0.2.2
# OpenCV pinned to 3.4.2.17 (contrib + base wheels). Presumably pinned because
# cv2.xfeatures2d.SIFT_create, used further down, is not shipped in every
# build - TODO confirm.
!pip install opencv-contrib-python==3.4.2.17
!pip install opencv-python==3.4.2.17



In [None]:
# !pip freeze > requirements_pict.txt

Importation of modules and packages. 

In [None]:
import io

import string

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.facecolor']='w'

# import warnings
# warnings.filterwarnings('ignore')

Setting pandas display options.

In [None]:
# Display options applied to every dataframe rendered in this notebook:
# row limit, line width, column width and a two-decimal float format.
dictPdSettings = {
    'display.max_rows': 500,
    'display.width': 100,
    'display.max_colwidth': 100,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name in dictPdSettings:
    pd.set_option(option_name, dictPdSettings[option_name])

To play audio text-to-speech during execution.

In [None]:
# from IPython.display import Audio
# from gtts import gTTS

# def speak(text, lang='en'):
#     with io.BytesIO() as f:
#         gTTS(text=text, lang=lang).write_to_fp(f)
#         f.seek(0)
#         return Audio(f.read(), autoplay=True)

In [None]:
# speak('Packages and modules successfully imported')

### 0.1 Importing the datasets

The data is a single .csv file (a Flipkart e-commerce sample) that we load into one dataframe indexed by `uniq_id`.

In [None]:
# Announce where the data is expected to come from (Drive on Colab, local
# 'DATA' folder otherwise); the read itself uses the same relative path.
if is_colab:
    print("Try to import data files in the notebook from myDrive...")
else:
    print("Try to import data files in the notebook from PC ('DATA')...")

csv_path = "../DATA/flipkart_com-ecommerce_sample_1050.csv"
df = pd.read_csv(csv_path, sep=',', encoding='utf-8', index_col='uniq_id')

print("-----> Importation of .csv in the notebook: OK")

Try to import data files in the notebook from myDrive...
-----> Importation of .csv in the notebook: OK


In [None]:
# speak('Datasets successfully imported')

### 0.2 Categories

In [None]:
# Split the 'product_category_tree' strings into one column per category level.

# Number of '>>' separators in each category tree, and the largest such count.
# NOTE(review): a tree with k separators has k+1 levels, so using this count as
# the number of levels drops the deepest level of the deepest trees - confirm
# this is intended.
ser_depth = df['product_category_tree'].map(lambda tree: tree.count('>>'))
max_depth = ser_depth.max()

def str_cleaning(ind, my_str, name_level_cols, depth=None):
    """Split a category-tree string into a fixed-length tuple of level names.

    The leading '["' and trailing '"]' are removed, the string is split on
    '>>', each part is stripped, and the result is truncated or padded with
    empty strings to exactly `depth` entries.

    Parameters
    ----------
    ind : unused, kept for backward compatibility.
    my_str : str
        Raw category tree, e.g. '["A >> B >> C"]'.
    name_level_cols : unused, kept for backward compatibility.
    depth : int, optional
        Number of levels to return. Defaults to the notebook-level
        `max_depth`, which was the only behaviour before this parameter
        existed.

    Returns
    -------
    tuple of str
        `depth` level names, '' where the tree is shallower.
    """
    # int() matters: `max_depth` is a numpy integer and `list * numpy_int`
    # would be dispatched to numpy instead of repeating the list.
    depth = int(max_depth if depth is None else depth)
    raw_levels = my_str.replace("[\"", "").replace("\"]", "").split(">>")
    levels = [part.strip() for part in raw_levels[:depth]]
    levels.extend([""] * (depth - len(levels)))
    return tuple(levels)

name_level_cols = ['cat_level_'+str(i) for i in np.arange(max_depth)]
# str_cleaning ignores its first argument, so None is passed (the previous
# `s.index` was just the bound str.index method of each string).
ser_tuple = df['product_category_tree']\
    .apply(lambda s: str_cleaning(None, s, name_level_cols))
# Column i holds the cumulative path 'level_0/.../level_i'. Built generically
# so it works for any max_depth instead of exactly six levels.
df_cat_level = pd.DataFrame(
    [['/'.join(levels[:i + 1]) for i in range(len(levels))]
     for levels in ser_tuple.values],
    columns=name_level_cols, index=df.index)

In [None]:
# Build the dataframe that accompanies the images: product name, description
# and a short label derived from the top-level category.
category_short_names = {
    'Home Furnishing': 'Furnishing',
    'Baby Care': 'Baby',
    'Watches': 'Watches',
    'Home Decor & Festive Needs': 'Decor',
    'Kitchen & Dining': 'Kitchen',
    'Beauty and Personal Care': 'Beauty',
    'Computers': 'Computers',
}

df_image = df[['product_name', 'description']].copy(deep=True)
df_image['category'] = df_cat_level['cat_level_0'].replace(category_short_names)

# Product ids, reused below to locate the image files.
indexes = df_image.index

### 0.3 Images properties

In [None]:
from PIL import Image

In [None]:
# # Get properties (size, mode) of each image, put it in a dataframe

# wh_tab, mode_tab = [], []
# for ind in indexes:
#     img = Image.open("../DATA/Images/"+ind+".jpg")
#     mode_tab.append(img.mode) 
#     wh_tab.append(list(img.size))

# df_image['mode_img'] = mode_tab
# df_image[['w_img', 'h_img']] = wh_tab

## 1 Image pre-processing

### 1.0 Vectorization and resizing of images

In [None]:
from PIL import Image, ImageOps
from PIL.ImageFilter import GaussianBlur

# Load one sample picture so that this demonstration cell does not depend on
# an `img` left over from another cell (it was undefined on a fresh kernel).
# Converted to RGB because autocontrast/equalize do not accept every mode.
img = Image.open("../DATA/Images/"+indexes[0]+".jpg").convert("RGB")

# histogram stretching (brightness adjustment)
img_auto_bright = ImageOps.autocontrast(img)
# contrast adjustment (histogram equalization)
img_auto_contr = ImageOps.equalize(img)
# blurring / denoising
mon_filtre = GaussianBlur(radius=2)
img_blur = img.filter(mon_filtre)

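Load each image, pre-process it (resize to 224×224) and store the resulting pixel array in a dictionary keyed by product id. The conversion to HSV and storage in the dataframe is currently commented out.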

In [1]:
from PIL import Image
import colorsys
import cv2  # cv2.INTER_AREA is used below, before the later `import cv2` cell

# Element-wise RGB -> HSV converter (only used by the disabled HSV export below).
rgb_to_hsv = np.vectorize(colorsys.rgb_to_hsv)

# Target side length (pixels) passed to preproc_image.
size = 224

# df_temp = pd.DataFrame([])
# Pre-processed image arrays, keyed by product id.
dict_images = {}
# Column names for the flattened H, S and V planes (disabled HSV export).
li_cols = [ l+'_'+str(i) for l in ['H', 'S', 'V']\
           for i in range(size*size)]

for i, ind in enumerate(indexes):
    if i%100==0: print(i)
    # Context manager so that each image file is closed once converted.
    with Image.open("../DATA/Images/"+ind+".jpg") as img_file:
        img = np.array(img_file)
    # preproc_image is not defined in this notebook; presumably provided by
    # the P6_functions star-import - TODO confirm.
    img = preproc_image(img, size=size, fill_col=(255,255,255),
                  autocontrast = False, equalize=False,
                  gauss_blur = None, interpolation=cv2.INTER_AREA)
    dict_images[ind] = img
    # img_ravel = np.stack(np.array(rgb_to_hsv(img[:,:,0],
    #                                          img[:,:,1],
    #                                          img[:,:,2])), 0).ravel()
    # ser = pd.Series(img_ravel, index = li_cols, name=ind).to_frame()
    # df_temp = pd.concat([df_temp, ser.T], axis=0)

# df_image = pd.concat([df_image, df_temp], axis=1)

NameError: ignored

In [None]:
# import dill
# dill.dump(dict_images, open('dict_images.pkl', mode='wb'))

In [None]:
import dill

# Reload the cached dataframe and image dictionary instead of recomputing them.
# SECURITY NOTE: dill/pickle files can execute arbitrary code when loaded -
# only load files you produced yourself.
# Context managers make sure the file handles are closed after loading.
with open('df_image.pkl', mode='rb') as pkl_file:
    df_image = dill.load(pkl_file)
with open('dict_images.pkl', mode='rb') as pkl_file:
    dict_images = dill.load(pkl_file)

Separate visualisation of the three channels (Hue, Saturation, Value)

In [None]:
# fig = plt.figure(figsize=(10,4))
# li_n = []
# for im, color, title, i in zip([img_h, img_s, img_v],
#                                 ['red', 'blue', 'black'],
#                                 ['Hue', 'Saturation', 'Value'],
#                                 range(1,4)):
#     ax1 = fig.add_subplot(2,3,i)
#     ax1.imshow(im, cmap='Greys')
#     ax1.set(xlim=(0,255))
#     ax1.set_title(title, fontweight='bold')
#     ax2 = fig.add_subplot(2,3,i+3)
#     n, bins, patches = ax2.hist(im.flatten(), color=color, bins=range(256))
#     li_n.append(n)
# plt.tight_layout(rect=[0,0,1,0.92])
# plt.show()

# Creation of a bag of visual words

### Load train and test images into dictionaries.

In [None]:
import numpy as np
import cv2
import os
from scipy import ndimage
from scipy.spatial import distance
from sklearn.cluster import KMeans

In [None]:
# plt.imshow(dict_images['7b72c92c2f6c40268628ec5f14c6d590'])

In [None]:
from PIL.Image import fromarray

### Extracts local features from images using SIFT.

The function below returns a two-element list. The first element is a flat list of all local descriptors from all images, in no particular order: this is our visual dictionary. The second element is a dictionary of SIFT descriptors grouped by the keys of the input dictionary.

In [None]:
def sift_features(images):
    """Compute SIFT descriptors for every image of a dictionary.

    Parameters
    ----------
    images : dict
        Maps a key to an image array accepted by `detectAndCompute`.

    Returns
    -------
    list
        `[descriptor_list, sift_vectors]` where `descriptor_list` is a flat
        list of all descriptors of all images (no particular order) and
        `sift_vectors` maps each input key to a one-element list holding the
        descriptor array of that image.
    """
    sift_vectors = {}
    descriptor_list = []
    sift = cv2.xfeatures2d.SIFT_create()
    for key, img in images.items():
        _, des = sift.detectAndCompute(img, None)
        if des is None:
            # No keypoint found (e.g. a uniform image): use an empty
            # descriptor array so that extend() and later loops do not fail.
            des = np.empty((0, sift.descriptorSize()), dtype=np.float32)
        descriptor_list.extend(des)
        sift_vectors[key] = [des]
    return [descriptor_list, sift_vectors]

# def get_descriptors(image_path):
#     # load image and convert it to grayscale
#     img_gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
#     # detect key points and descriptors
#     keypoints, descriptors = model.detectAndCompute(img_gray, None)
#     # returns keypoints and descriptors
#     return keypoints, descriptors

In [None]:
import cv2
# Stand-alone SIFT detector instance.
# NOTE(review): `detector` is not referenced anywhere else in the visible
# notebook (sift_features creates its own instance) - confirm it is needed.
detector =cv2.xfeatures2d.SIFT_create()

In [None]:
# Run SIFT on every pre-processed image.
sifts = sift_features(dict_images)
# First element: flat, unordered list of all descriptors.
# Second element: descriptors grouped by key of dict_images.
descriptor_list, all_bovw_feature = sifts
# Takes the sift features that is seperated class by class for test data
# test_bovw_feature = sift_features(test)[1]

In [None]:
# `STOP` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError - presumably a deliberate way to halt "Run All" before the
# cells below. TODO confirm intent.
STOP

In [None]:
# Display the image dataframe for a visual check.
df_image

### Find the visual words 

Feed the visual dictionary to the k-means clustering algorithm. The resulting cluster centers are the visual words.

In [None]:

def kmeans(k, descriptor_list, random_state=None):
    """Cluster the descriptors with k-means and return the cluster centers.

    Parameters
    ----------
    k : int
        Number of clusters, i.e. number of visual words.
    descriptor_list : sequence of descriptor vectors
        Flat, unordered collection of all local descriptors.
    random_state : int or None, optional
        Seed forwarded to `KMeans`. The default None keeps the previous
        unseeded behaviour.

    Returns
    -------
    array
        The `cluster_centers_` of the fitted model (the visual words).
    """
    # Local named `model` so that it no longer shadows this function's name.
    model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    model.fit(descriptor_list)
    return model.cluster_centers_

# Size of the visual vocabulary, and a fixed seed so that re-running the
# notebook yields the same visual words.
N_VISUAL_WORDS = 150
KMEANS_SEED = 42

# Takes the central points which is visual words
visual_words = kmeans(N_VISUAL_WORDS, descriptor_list, random_state=KMEANS_SEED)

### Create histograms

In [None]:

def image_class(all_bovw, centers):
    """Build one visual-word histogram per descriptor set.

    Parameters
    ----------
    all_bovw : dict
        Maps a key to a list of descriptor collections.
    centers : sequence
        Visual words (cluster centers); its length fixes the histogram size.

    Returns
    -------
    dict
        Same keys as `all_bovw`; each value is the list of histograms, one per
        descriptor collection, where bin `i` counts the descriptors for which
        `find_index` returned `i`.
    """
    n_words = len(centers)
    dict_feature = {}
    for key, descriptor_sets in all_bovw.items():
        histograms = []
        for descriptors in descriptor_sets:
            counts = np.zeros(n_words)
            for descriptor in descriptors:
                # find_index is defined outside this cell; presumably it returns
                # the index of the closest center - TODO confirm.
                counts[find_index(descriptor, centers)] += 1
            histograms.append(counts)
        dict_feature[key] = histograms
    return dict_feature
    
# Creates histograms for train data    
bovw_train = image_class(all_bovw_feature, visual_words) 
# Creates histograms for test data
# NOTE(review): the only assignment of `test_bovw_feature` in the visible
# notebook is commented out, so this line raises NameError unless it is
# defined elsewhere - TODO confirm / define a test split.
bovw_test = image_class(test_bovw_feature, visual_words) 

## Predict classes of the test images with k-NN function.

In [None]:

def knn(images, tests):
    """Evaluate a 1-nearest-neighbour classifier (Euclidean distance).

    Parameters
    ----------
    images : dict
        Maps a label to the list of feature vectors of the train items.
    tests : dict
        Maps a label to the list of feature vectors of the test items.

    Returns
    -------
    list
        `[num_test, correct_predict, class_based]` where `class_based` maps
        each test label to `[correct, all]` counts.
    """
    num_test = 0
    correct_predict = 0
    class_based = {}

    for true_label, test_vectors in tests.items():
        tally = [0, 0]  # [correct, all]
        class_based[true_label] = tally
        for query in test_vectors:
            best_dist = None
            predicted = "a"  # placeholder kept when there is no train vector
            for train_label, train_vectors in images.items():
                for candidate in train_vectors:
                    dist = distance.euclidean(query, candidate)
                    # The first candidate always wins; afterwards only a
                    # strictly smaller distance replaces the current best.
                    if best_dist is None or dist < best_dist:
                        best_dist = dist
                        predicted = train_label

            if predicted == true_label:
                correct_predict += 1
                tally[0] += 1
            num_test += 1
            tally[1] += 1
    return [num_test, correct_predict, class_based]
    
# Run the 1-NN evaluation of the test histograms against the train histograms.
# The result is the list [num_test, correct_predict, class_based] returned by knn.
results_bowl = knn(bovw_train, bovw_test) 

## Calculate the accuracy

In [None]:

def accuracy(results):
    """Print the overall and the per-class accuracy, in percent.

    Parameters
    ----------
    results : sequence
        `[num_test, correct_predict, class_based]` as returned by `knn`,
        where `class_based` maps a label to `[correct, all]` counts.

    A ratio whose denominator is zero (no test item at all, or a class
    without test items) is reported as `nan` instead of raising
    ZeroDivisionError.
    """
    def _percent(correct, total):
        # Percentage of correct predictions, nan when nothing was tested.
        return (correct / total) * 100 if total else float('nan')

    avg_accuracy = _percent(results[1], results[0])
    print("Average accuracy: %" + str(avg_accuracy))
    print("\nClass based accuracies: \n")
    for key, value in results[2].items():
        acc = _percent(value[0], value[1])
        # str(key) so that non-string labels do not break the concatenation.
        print(str(key) + " : %" + str(acc))
        
# Calculates the accuracies and writes the results to the console.
accuracy(results_bowl)
# The line below is a web-page footer pasted together with the code; it is not
# Python and made the cell a SyntaxError, so it is kept only as a comment.
# view rawaccuracy.py hosted with ❤ by GitHub

# Exportation

Now we export the dataset of aggregated orders in a .csv file.