In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
import pandas as pd
import numpy as np
import re
from collections import Counter
import spacy
from sklearn.decomposition import PCA, LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from tensorflow.math import softplus
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
import time
import warnings
import cv2
import os
from keras.layers import Flatten
from keras import Model
from keras.applications.efficientnet import EfficientNetB0
from sklearn import set_config

In [2]:
# Sets the Ipython options

InteractiveShell.ast_node_interactivity = "all"

# Sets sklearn config for pipeline diagram

set_config(display="diagram")

In [3]:
def clean_doc_oov(text, embedder, pos_startegy=[], stop_words=[]):
    
    """Returns lemmatized and lowered document according to a given 
    POS filtering strategy and stop words list."""
    
    # Transforms the text in spacy's Doc object
    
    doc = nlp(text)
    
    # Lists the Doc's tokens lowered, lemmatized and POS filtered
    # Note : token.lemma_ doesn't returns lowered case tokens 
    # anymore hence the .lower()
    
    tokens = [token.lemma_.lower() for token in doc if token.pos_ in pos_startegy 
                                                    and not token.is_punct]
    
    # Removes OOV for spacy mean accuracy
    
    doc_for_oov = embedder(' '.join(tokens))
    
    tokens = [token.text for token in doc_for_oov if not token.is_oov 
                                                  and not token.is_punct]
    
    # Removes stop words from a given stop words list
    # Note : the Spacy's token.is_stop has some buggs regarding to 
    # it's customization at the time this function is written 
    # hence the below coprehension list
    
    tokens = [i for i in tokens if i not in stop_words]
    
    # Returns the filtered, lemmatized, and lowered tokens joined
    
    return ' '.join(tokens)


def return_stop_words_list_oov(serie, embedder, 
                           pos_startegy=[], 
                           most_common_threshold=30, 
                           rare_tokens_threshold=1):
    
    """Returns a list of stop words of a serie according to a given 
    POS filtering strategy, 
    most common words threshold and 
    rare occurences threshold.
    
    Stop words comes from :
    
    Default's Spacy stop words list
    Plus the most common words list
        (default threshold:30 most common words)
    Plus the rare words list
        (default threshold: 1 occurence)"""
    
    # Joins the texts of a serie to have one single text 
    
    serie = ' '.join(serie.values)
    
    # Transforms the text in spacy's Doc object
    
    doc = nlp(serie)
    
    # Lists the Doc's tokens lowered, lemmatized and POS filtered
    
    tokens = [token.lemma_.lower() for token in doc if token.pos_ in pos_startegy 
                                                    and not token.is_punct]
    
    # Removes OOV for spacy mean accuracy
    
    doc_for_oov = embedder(' '.join(tokens))
    
    tokens = [token.text for token in doc_for_oov if not token.is_oov 
                                                  and not token.is_punct]
    
    # Creates a counter of the word occurences
    
    tokens_frequency = Counter(tokens)
    
    # Retrieves a list of the most common occurences
    
    most_common_tokens = list(
                            dict(
                                tokens_frequency
                                .most_common(most_common_threshold))
                                .keys())
    
    # Retrieves a list of the rare words
    
    rare_tokens = [i for i, j in tokens_frequency.items() if j == rare_tokens_threshold]
    
    # Retrieves the default spacy's stop words
    
    default_stop_words = list(nlp.Defaults.stop_words)
    
    # Returns the tree above created lists into one : a stop word list 
    
    return default_stop_words + most_common_tokens + rare_tokens

def vectorize_doc(text, embedder):
    
    """Returns a specified text vectorized by a specified embedder"""
    
    doc = embedder(text)
    
    return doc.vector

def commit_result_in_ari_score_df(results):
    
    """Commits the specified results into ari_scores_df (initialized later on)"""
    
    for i in range(len(results)):     
        ari_scores_df.loc[len(ari_scores_df)] = results[i]

In [4]:
class CorpusEmbedder(BaseEstimator, TransformerMixin):
    
    """Cleans and Embbed (Context Free Embedder) a specified pd.Serie of texts"""
    
    def __init__(self,
                 embedder=None,
                 pos_startegy=['PROPN', 'NOUN', 'ADJ', 'VERB'],
                 most_common_threshold=30, 
                 rare_tokens_threshold=1):
        
        self.pos_startegy = pos_startegy
        self.embedder = embedder
        self.most_common_threshold = most_common_threshold
        self.rare_tokens_threshold = rare_tokens_threshold
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_ = X.copy()
        
        # Retrieves a stopword list
        
        stop_words = \
            return_stop_words_list_oov(X_,
                                       embedder=\
                                           self.embedder,
                                       pos_startegy=\
                                           self.pos_startegy, 
                                       most_common_threshold=\
                                           self.most_common_threshold, 
                                       rare_tokens_threshold=\
                                           self.rare_tokens_threshold)
        
        # Cleans the text
        
        X_cleaned = X_.apply(lambda text: clean_doc_oov(text,
                                                        embedder=self.embedder,
                                                        pos_startegy=self.pos_startegy, 
                                                        stop_words=stop_words))
        
        # Embbed the text
        
        X_embedded = X_cleaned.apply(lambda text: vectorize_doc(text, self.embedder))
        
        X_embedded_matrix = np.stack(X_embedded.values)
        
        return X_embedded_matrix
    
class CNNExtractor(BaseEstimator, TransformerMixin):
    
    """Vectorizes a given pd.Serie of images 
    with a given Convolutional neural network architecture"""
    
    def __init__(self, 
                 architecture=None,
                 pixel=None,
                 verbose=False):

        self.architecture = architecture
        self.pixel = pixel
        self.verbose = verbose
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_ = X.copy()
        
        # Checks for the presency of essential arguments
        
        if self.pixel is None:
            print('Please specify a pixel size')
            return None
        
        if self.architecture is None:
            print('Please specify an architecture')
            return None
        
        # Pixel Part
        
        images_resized_list = []

        for loc, image_path in enumerate(X_):
    
            image = cv2.imread(image_path)

            image_resized = cv2.resize(image, (self.pixel, self.pixel))
    
            images_resized_list.append(image_resized)


        images_resized_list = np.array(images_resized_list)
        
        # Inits an architecture
        
        if self.architecture == 'VGG16':
            
            architecture = VGG16(weights="imagenet", 
                                 include_top=False, 
                                 input_shape=(224, 224, 3))
            
        elif self.architecture == 'ResNet50V2':
            
            architecture = ResNet50V2(weights="imagenet", 
                                      include_top=False, 
                                      input_shape=(224, 224, 3))
            
        elif self.architecture == 'EfficientNetB0':
            
            architecture = EfficientNetB0(weights="imagenet", 
                                          include_top=False, 
                                          input_shape=(224, 224, 3))
            
        # Adds a flatten layer to the architecture
        
        output = Flatten()(architecture.layers[-1].output)
        
        model = Model(inputs=architecture.inputs, outputs=output)
        
        # Puts layers into a non trainable mode
        
        for layer in model.layers:
            layer.trainable = False
            
        # Displays the model architecture if verbose activated
        
        if self.verbose == True:
            model.summary()
            
        # Computes the feature matrix

        feature_matrix = model.predict(images_resized_list)
            
        return feature_matrix

In [5]:
# Inits a random seed

random_seed = 34

# Imports cleaned dataset

flipkart_df = \
    pd.read_csv(r'F:\Data\Projet 6\Cleaned\flipkart_cleaned')

# Imports Spacy's Pipeline

nlp = spacy.load("en_core_web_lg")

<h6>Set ups

In [6]:
# Defines a pos_strategy

pos_startegy = ['PROPN', 'NOUN', 'ADJ', 'VERB']

In [7]:
# Inits a Tsne for visualization

tSNE_visualizer = TSNE(n_components=2,
                       learning_rate=300,
                       perplexity=30,
                       init='random',
                       random_state=random_seed)

In [8]:
# Defines y

y = flipkart_df['product_category_node_1_numerical']

<h6>Feature Extraction

In [9]:
# Spacy Features Extractions

nlp_matrix = CorpusEmbedder(embedder=nlp)\
                    .fit_transform(flipkart_df['description'])

nlp_matrix.shape

(1050, 300)

In [10]:
# EfficientNetB0 Features Extractions

efficient_net_feature_matrix = CNNExtractor(architecture='EfficientNetB0', 
                                            pixel=224, 
                                            verbose=True)\
                               .fit_transform(flipkart_df['image_path'])

efficient_net_feature_matrix.shape

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling (Rescaling)          (None, 224, 224, 3)  0           ['input_1[0][0]']                
                                                                                                  
 normalization (Normalization)  (None, 224, 224, 3)  7           ['rescaling[0][0]']              
                                                                                                  
 stem_conv_pad (ZeroPadding2D)  (None, 225, 225, 3)  0           ['normalization[0][0]']      

 block2a_project_bn (BatchNorma  (None, 56, 56, 24)  96          ['block2a_project_conv[0][0]']   
 lization)                                                                                        
                                                                                                  
 block2b_expand_conv (Conv2D)   (None, 56, 56, 144)  3456        ['block2a_project_bn[0][0]']     
                                                                                                  
 block2b_expand_bn (BatchNormal  (None, 56, 56, 144)  576        ['block2b_expand_conv[0][0]']    
 ization)                                                                                         
                                                                                                  
 block2b_expand_activation (Act  (None, 56, 56, 144)  0          ['block2b_expand_bn[0][0]']      
 ivation)                                                                                         
          

                                                                                                  
 block3b_expand_activation (Act  (None, 28, 28, 240)  0          ['block3b_expand_bn[0][0]']      
 ivation)                                                                                         
                                                                                                  
 block3b_dwconv (DepthwiseConv2  (None, 28, 28, 240)  6000       ['block3b_expand_activation[0][0]
 D)                                                              ']                               
                                                                                                  
 block3b_bn (BatchNormalization  (None, 28, 28, 240)  960        ['block3b_dwconv[0][0]']         
 )                                                                                                
                                                                                                  
 block3b_a

 block4b_bn (BatchNormalization  (None, 14, 14, 480)  1920       ['block4b_dwconv[0][0]']         
 )                                                                                                
                                                                                                  
 block4b_activation (Activation  (None, 14, 14, 480)  0          ['block4b_bn[0][0]']             
 )                                                                                                
                                                                                                  
 block4b_se_squeeze (GlobalAver  (None, 480)         0           ['block4b_activation[0][0]']     
 agePooling2D)                                                                                    
                                                                                                  
 block4b_se_reshape (Reshape)   (None, 1, 1, 480)    0           ['block4b_se_squeeze[0][0]']     
          

                                                                                                  
 block5a_se_squeeze (GlobalAver  (None, 480)         0           ['block5a_activation[0][0]']     
 agePooling2D)                                                                                    
                                                                                                  
 block5a_se_reshape (Reshape)   (None, 1, 1, 480)    0           ['block5a_se_squeeze[0][0]']     
                                                                                                  
 block5a_se_reduce (Conv2D)     (None, 1, 1, 20)     9620        ['block5a_se_reshape[0][0]']     
                                                                                                  
 block5a_se_expand (Conv2D)     (None, 1, 1, 480)    10080       ['block5a_se_reduce[0][0]']      
                                                                                                  
 block5a_s

 block5c_se_excite (Multiply)   (None, 14, 14, 672)  0           ['block5c_activation[0][0]',     
                                                                  'block5c_se_expand[0][0]']      
                                                                                                  
 block5c_project_conv (Conv2D)  (None, 14, 14, 112)  75264       ['block5c_se_excite[0][0]']      
                                                                                                  
 block5c_project_bn (BatchNorma  (None, 14, 14, 112)  448        ['block5c_project_conv[0][0]']   
 lization)                                                                                        
                                                                                                  
 block5c_drop (Dropout)         (None, 14, 14, 112)  0           ['block5c_project_bn[0][0]']     
                                                                                                  
 block5c_a

                                                                                                  
 block6b_drop (Dropout)         (None, 7, 7, 192)    0           ['block6b_project_bn[0][0]']     
                                                                                                  
 block6b_add (Add)              (None, 7, 7, 192)    0           ['block6b_drop[0][0]',           
                                                                  'block6a_project_bn[0][0]']     
                                                                                                  
 block6c_expand_conv (Conv2D)   (None, 7, 7, 1152)   221184      ['block6b_add[0][0]']            
                                                                                                  
 block6c_expand_bn (BatchNormal  (None, 7, 7, 1152)  4608        ['block6c_expand_conv[0][0]']    
 ization)                                                                                         
          

                                                                                                  
 block7a_expand_conv (Conv2D)   (None, 7, 7, 1152)   221184      ['block6d_add[0][0]']            
                                                                                                  
 block7a_expand_bn (BatchNormal  (None, 7, 7, 1152)  4608        ['block7a_expand_conv[0][0]']    
 ization)                                                                                         
                                                                                                  
 block7a_expand_activation (Act  (None, 7, 7, 1152)  0           ['block7a_expand_bn[0][0]']      
 ivation)                                                                                         
                                                                                                  
 block7a_dwconv (DepthwiseConv2  (None, 7, 7, 1152)  10368       ['block7a_expand_activation[0][0]
 D)       

(1050, 62720)

<center><h3>Concatenation Strategies

<center><h4>Raw or separate dimentionality reduction

<h6>Raw concatenation

In [11]:
# Concatenates :

raw_feature_matrix = np.concatenate([nlp_matrix, efficient_net_feature_matrix], axis=1)

raw_feature_matrix.shape

(1050, 63020)

<h6>PCAs on imgae

PCA 0.99 on EfficientNetB0

In [12]:
# PCA 0.99 on EfficientNetB0 Strategy

efficient_net_feature_matrix_99_variance = \
    PCA(n_components=0.99).fit_transform(efficient_net_feature_matrix)

efficient_net_feature_matrix_99_variance.shape

(1050, 952)

In [13]:
# Concatenates

pca_99_feature_matrix = np.concatenate([nlp_matrix, 
                                        efficient_net_feature_matrix_99_variance], axis=1)

pca_99_feature_matrix.shape

(1050, 1252)

PCA 300 components on EfficientNetB0

In [14]:
# PCA 300 components on EfficientNetB0 strategy

efficient_net_feature_matrix_pca_300 = \
    PCA(n_components=300).fit_transform(efficient_net_feature_matrix)

efficient_net_feature_matrix_pca_300.shape

(1050, 300)

In [15]:
# Concatenates

pca_300_feature_matrix = np.concatenate([nlp_matrix, 
                                         efficient_net_feature_matrix_pca_300], axis=1)

pca_300_feature_matrix.shape

(1050, 600)

<h6>Best individual dimentionality reduction

In [16]:
# Reproduces the best found combinaison (nlp)

pca_best_nlp_matrix = PCA(n_components=0.8).fit_transform(nlp_matrix)
    
tSNE_best_nlp_matrix = TSNE(n_components=2, perplexity=30, learning_rate=300, 
                            init='random', random_state=random_seed)\
                      .fit_transform(pca_best_nlp_matrix)

tSNE_best_nlp_matrix.shape

(1050, 2)

In [17]:
# Reproduces the best found combinaison (computer vision)

pca_best_efficientnet = PCA(n_components=0.4).fit_transform(efficient_net_feature_matrix)

tSNE_best_efficientnet = TSNE(n_components=3, perplexity=40, learning_rate=300, 
                              init='random', random_state=random_seed)\
                        .fit_transform(pca_best_efficientnet)

tSNE_best_efficientnet.shape

(1050, 3)

In [18]:
# Concatenates

best_individual_dimentionality_reduction_feature_matrix = \
    np.concatenate([tSNE_best_nlp_matrix, tSNE_best_efficientnet], axis=1)

best_individual_dimentionality_reduction_feature_matrix.shape

(1050, 5)

<center><h1>Unsupervised Modelling

In [19]:
# Inits a dataframe to commit results

ari_scores_df = pd.DataFrame(columns=['Feature_Extraction',
                                      'ARI',
                                      'Preprocessing_name',
                                      'Preprocessing_params',
                                      'Fit_score'])

In [20]:
# Defines a dictionary with matrix names and matrixes 

matrix_dict = {'Raw_feature_matrix': raw_feature_matrix}

In [21]:
# Inits a Kmeans

kmeans = KMeans(n_clusters=7, random_state=random_seed)

In [22]:
# Computes ARI on raw_feature_matrix

ari_scores_df.loc[0, ['Feature_Extraction', 'ARI']] = \
    ('Raw_feature_matrix', adjusted_rand_score(y, kmeans.fit_predict(raw_feature_matrix)))

In [23]:
# Computes ARI on best_individual_dimentionality_reduction_feature_matrix

ari_scores_df.loc[1, ['Feature_Extraction', 'ARI']] = \
    ('best_individual_dimentionality_reduction_feature_matrix', 
     adjusted_rand_score(y, kmeans.fit_predict(
         best_individual_dimentionality_reduction_feature_matrix)))

In [24]:
# Computes ARI on pca_99_feature_matrix

ari_scores_df.loc[2, ['Feature_Extraction', 'ARI']] = \
    ('pca_99_feature_matrix', 
     adjusted_rand_score(y, kmeans.fit_predict(
         pca_99_feature_matrix)))

In [25]:
# Computes ARI on pca_300_feature_matrix

ari_scores_df.loc[3, ['Feature_Extraction', 'ARI']] = \
    ('pca_99_feature_matrix', 
     adjusted_rand_score(y, kmeans.fit_predict(
         pca_300_feature_matrix)))

<center><h4>Combined dimentionality reduction

<h6>PCA

In [26]:
def pca_grid_search(matrix, grid, feature_extraction_name):
    
    """Computes and return results of a 7-Means with a 
    PCA dimentionality reduction according to a specified 
    grid of params on a given matrix"""
    
    results = []

    for i in list(grid):
    
        t0 = time.time()
        
        # Dimentionality reduction
        with warnings.catch_warnings():
        
            warnings.simplefilter(action='ignore', category=FutureWarning)
        
            X_pca = PCA(n_components=i['pca__n_components'])\
                            .fit_transform(matrix)
    
        # Computes Kmeans predictions
    
        predictions = KMeans(n_clusters=7, random_state=random_seed)\
                        .fit_predict(X_pca)
    
        fit_time = time.time() - t0
        
        # Computes ARI score
    
        ari_score = adjusted_rand_score(y, predictions)
    
        
        # Appends ARI Score
    
        results.append((feature_extraction_name, ari_score, 'PCA', i, fit_time))
        
    return results

In [27]:
# Defines a grid of params

pca_pipeline_params_grid = \
    ParameterGrid({'pca__n_components': [0.99,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]})

In [28]:
# Computes for each matrixes the results provided by the above dimentionality
# reduction technique

for matrix_name, matrix in matrix_dict.items():
    
    print('Computing :', matrix_name)
    
    results = pca_grid_search(matrix, pca_pipeline_params_grid, matrix_name)

    commit_result_in_ari_score_df(results)
    
print('DONE')

Computing : Raw_feature_matrix
DONE


<h6>PCA + TSNE

In [29]:
def pca_tsne_grid_search(matrix, grid, feature_extraction_name, verbose=False):
    
    """Computes and return results of a 7-Means with a 
    PCA + TSNE dimentionality reduction according to a specified 
    grid of params on a given matrix"""

    results = []

    for i in list(grid):
        
        t0 = time.time()
        
        if verbose:
            print(i)

        # Dimentionality reduction
        with warnings.catch_warnings():
        
            warnings.simplefilter(action='ignore', category=FutureWarning)
            
            X_pca = PCA(n_components=i['pca__n_components'])\
                            .fit_transform(matrix)
    
            X_tSNE = TSNE(n_components=i['tsne__n_components'], 
                          perplexity=i['perplexity'], 
                          learning_rate=i['learning_rate'], 
                          init='random',
                          random_state=random_seed)\
                            .fit_transform(X_pca)
    
        # Computes Kmeans predictions
    
        predictions = KMeans(n_clusters=7, random_state=random_seed)\
                        .fit_predict(X_tSNE)
        
        fit_time = time.time() - t0
    
        # Computes ARI score
    
        ari_score = adjusted_rand_score(y, predictions)
    
        
        # Appends ARI Score
    
        results.append((feature_extraction_name, ari_score, 'PCA TSNE', i, fit_time))
    
    return results

In [30]:
# Defines a grid of params

pca_tsne_pipeline_params_grid = \
    ParameterGrid(
           [{'pca__n_components': [0.99,0.9,0.8,0.7,0.6,0.5,0.4],
             'tsne__n_components': [2,3], 
             'perplexity':[30,40],
             'learning_rate':[300],
             'method':['barnes_hut']}])

In [31]:
# Computes for each matrixes the results provided by the above dimentionality
# reduction technique

for matrix_name, matrix in matrix_dict.items():
    
    print('Computing :', matrix_name)
    
    results = pca_tsne_grid_search(matrix, pca_tsne_pipeline_params_grid, matrix_name)

    commit_result_in_ari_score_df(results)
    
print('DONE')

Computing : Raw_feature_matrix
DONE


<h6>LDA

In [32]:
def lda_grid_search(matrix, grid, feature_extraction_name, verbose=False):
    
    """Computes and return results of a 7-Means with a 
    LDA dimentionality reduction according to a specified 
    grid of params on a given matrix"""

    results = []

    for i in list(grid):
        
        if verbose:
            
            print(i)
            
        # Dimentionality reduction
        
        t0 = time.time()
        
        # Keeps 99% of variance for speedness 
        
        X_pca = PCA(n_components=0.99).fit_transform(matrix)
             
        matrix = softplus(X_pca) 
            
        X_lda = LatentDirichletAllocation(n_components=i['lda__n_components'], 
                                          random_state=random_seed).fit_transform(matrix)
    
        # Computes Kmeans predictions
    
        predictions = KMeans(n_clusters=7, random_state=random_seed)\
                        .fit_predict(X_lda)
        
        fit_time = time.time() - t0
    
        # Computes ARI score
    
        ari_score = adjusted_rand_score(y,predictions)
    
        
        # Appends ARI Score
    
        results.append((feature_extraction_name, ari_score, 'LDA', i, fit_time))
    
    return results

In [33]:
# Defines a grid of params

lda_pipeline_params_grid = \
    ParameterGrid(
           [{'lda__n_components': [7, 14, 21]}])

In [34]:
# Computes for each matrixes the results provided by the above dimentionality
# reduction technique

for matrix_name, matrix in matrix_dict.items():
    
    print('Computing :', matrix_name)
    
    results = lda_grid_search(matrix, lda_pipeline_params_grid, matrix_name)

    commit_result_in_ari_score_df(results)
    
print('DONE')

Computing : Raw_feature_matrix
DONE


<h6>SVD

In [35]:
def svd_grid_search(matrix, grid, feature_extraction_name, verbose=False):
    
    """Computes and return results of a 7-Means with a 
    SVD dimentionality reduction according to a specified 
    grid of params on a given matrix"""

    results = []

    for i in list(grid):
        
        if verbose:
            print(i)

        # Dimentionality reduction
        
        t0 = time.time()
            
        svd = TruncatedSVD(n_components=i['svd__n_components'], 
                    random_state=random_seed).fit_transform(matrix)
    
        # Computes Kmeans predictions
    
        predictions = KMeans(n_clusters=7, random_state=random_seed)\
                        .fit_predict(svd)
        
        fit_time = time.time() - t0
    
        # Computes ARI score
    
        ari_score = adjusted_rand_score(y, predictions)
    
        
        # Appends ARI Score
    
        results.append((feature_extraction_name, ari_score, 'SVD', i, fit_time))
    
    return results

In [36]:
# Defines a grid of params

svd_pipeline_params_grid = \
    ParameterGrid(
           [{'svd__n_components': [7, 14, 21]}])

In [37]:
# Computes for each matrixes the results provided by the above dimentionality
# reduction technique

for matrix_name, matrix in matrix_dict.items():
    
    print('Computing :', matrix_name)
    
    results = svd_grid_search(matrix, svd_pipeline_params_grid, matrix_name)

    commit_result_in_ari_score_df(results)
    
print('DONE')

Computing : Raw_feature_matrix
DONE


<h6>NMF

In [38]:
def nmf_grid_search(matrix, grid, feature_extraction_name, verbose=False):
    
    """Computes and return results of a 7-Means with a 
    NMF dimentionality reduction according to a specified 
    grid of params on a given matrix"""

    results = []

    for i in list(grid):
        
        if verbose:
            print(i)
            
        if feature_extraction_name not in ['count_vectorized_matrix', 'tf_idf_matrix']:
            
            matrix = softplus(matrix) 

        # Dimentionality reduction
        
        t0 = time.time()
        
        X_nmf = NMF(n_components=i['nmf__n_components'], 
                    random_state=random_seed,
                    max_iter=10000,
                    init='random').fit_transform(matrix)
    
        # Computes Kmeans predictions
    
        predictions = KMeans(n_clusters=7, random_state=random_seed)\
                        .fit_predict(X_nmf)
        
        fit_time = time.time() - t0
    
        # Computes ARI score
    
        ari_score = adjusted_rand_score(y, predictions)
    
        
        # Appends ARI Score
    
        results.append((feature_extraction_name, ari_score, 'NMF', i, fit_time))
    
    return results

In [39]:
# Defines a grid of params

nmf_pipeline_params_grid = \
    ParameterGrid(
           [{'nmf__n_components': [7, 14, 21]}])

In [40]:
# Computes for each matrixes the results provided by the above dimentionality
# reduction technique

for matrix_name, matrix in matrix_dict.items():
    
    print('Computing :', matrix_name)
    
    results = nmf_grid_search(matrix, nmf_pipeline_params_grid, matrix_name)

    commit_result_in_ari_score_df(results)
    
print('DONE')

Computing : Raw_feature_matrix
DONE


<center><h3>Results

In [41]:
# Displays top 5 of the best ARI Scores

ari_scores_df.sort_values(by='ARI', ascending=False).head(5)

Unnamed: 0,Feature_Extraction,ARI,Preprocessing_name,Preprocessing_params,Fit_score
1,best_individual_dimentionality_reduction_featu...,0.50116,,,
39,Raw_feature_matrix,0.432041,PCA TSNE,"{'learning_rate': 300, 'method': 'barnes_hut',...",22.31914
35,Raw_feature_matrix,0.418104,PCA TSNE,"{'learning_rate': 300, 'method': 'barnes_hut',...",22.348614
12,Raw_feature_matrix,0.415884,PCA,{'pca__n_components': 0.2},4.562649
8,Raw_feature_matrix,0.409017,PCA,{'pca__n_components': 0.6},4.581008


<center><h1>Supervised Modelling

In [42]:
# Creates a voting classifier with two pipelines (one for each non tabular data)

nlp_random_pipeline = \
    Pipeline([('column_nlp', ColumnTransformer([('nlp', 
                                    CorpusEmbedder(embedder=nlp), 
                                    'description')])),
              ('random_forest_clf_nlp', RandomForestClassifier(max_depth=10, 
                                                            random_state=random_seed))])

cnn_random_pipeline = \
    Pipeline([('column_cnn', ColumnTransformer([('cnn', 
                                    CNNExtractor(architecture='EfficientNetB0', pixel=224), 
                                    'image_path')])),
              ('random_forest_clf_cnn', RandomForestClassifier(max_depth=10, 
                                                               random_state=random_seed))])

voting_classifier = VotingClassifier(
                        estimators=[('nlp_random_pipeline', nlp_random_pipeline), 
                                    ('cnn_random_pipeline', cnn_random_pipeline)], 
                                    voting='soft')

voting_classifier

In [43]:
# Cross validate the voting classifier through a gridsearch

voting_classifier_params = [dict()]

voting_classifier_gs = GridSearchCV(
                                   voting_classifier, 
                                   voting_classifier_params, 
                                   cv=3, 
                                   scoring='accuracy',
                                   return_train_score=True,
                                   refit=False,
                                   verbose=3)

voting_classifier_gs.fit(flipkart_df[['description', 'image_path']], y);

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ..............., score=(train=1.000, test=0.911) total time= 2.7min
[CV 2/3] END ..............., score=(train=1.000, test=0.920) total time= 2.7min
[CV 3/3] END ..............., score=(train=1.000, test=0.763) total time= 2.7min


In [44]:
# Displays the mean test score

voting_classifier_gs.cv_results_['mean_test_score']

array([0.8647619])