In [1]:
import os
os.chdir("/home/jupyter/")
import pandas as pd
import numpy as np
from Icarusight.queries.get_queries import query_canape_product_df, query_product_properties_df
from Icarusight.Icarusight.utils import *
from Icarusight.Icarusight.clustering import get_cluster_labels, apply_clustering_algorithm, remove_meaningless_clusters
from Icarusight.Icarusight.multi_node_model.multi_node_model import add_product_node
from cdiscount import snowflake, config
from tqdm import tqdm
import nltk
os.chdir("Icarusight")

In [4]:
g = create_property_value_graph(product_df, product_property_df)
print("Graph creation done.", end=" ")
print(f"got {len(g.nodes)} nodes and {len(g.edges)} edges.")

print("Starting clustering...")
partitions = apply_clustering_algorithm(g)
print("Clustering done, removing the meaningless clusters...")
partitions = remove_meaningless_clusters(partitions, n=3)
print("Removing done.")

Adding nodes...


100%|██████████| 102877/102877 [01:54<00:00, 898.06it/s]
100%|██████████| 46/46 [00:00<00:00, 16396.53it/s]


Nodes added, adding edges...


100%|██████████| 46/46 [00:00<00:00, 82.83it/s] 


Graph creation done. got 102885 nodes and 204933 edges.
Starting clustering...
Clustering done, removing the meaningless clusters...
Removing done.


In [8]:
print(len(partitions))
from collections import Counter
print(Counter(partitions.values()))

102885
Counter({0: 57418, 2: 37347, 1: 8103, 3: 17})


In [None]:
product_df.set_index("product_id", inplace=True)

product_df['cluster_id'] = partitions
product_restrained_df = product_df[~product_df['cluster_id'].isnull()]
product_restrained_df.astype({'cluster_id': int})
product_restrained_df['name_descr'] = product_restrained_df.product_name +  ". " + product_restrained_df.product_long_description + ". "
cluster_serie = product_restrained_df.groupby("cluster_id").name_descr.sum()
print(cluster_serie)

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
# COUNT VECTORIZER METHOD
vectorizer = CountVectorizer(stop_words=nltk.corpus.stopwords.words('french'))
vector = vectorizer.fit_transform(clusters)
top_words_per_cluster_count = [get_top_tf_idf_words(item, 0.015, 5) for item in vector]
for top_words in top_words_per_cluster_count:
    print(top_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


NameError: name 'feature_array' is not defined

In [1]:
from cdiscount.snowflake import get_snowflake_connection, query_snowflake_to_df
# Retrieve data
query = """
SELECT
    CONCAT(
        MAX(PRODUCT_PROPERTY_NAME),
        '=',
        PRODUCT_PROPERTY_VALUE
    ) AS PRODUCT_PROPERTY_VALUE,
    PRODUCT_ID,
    PRODUCT_CATEGORY_LEVEL4_ID,
    PRODUCT_CATEGORY_LEVEL4_NAME,
    PRODUCT_LONG_DESCRIPTION
FROM QUALISCORE_LAB.LAB_STAGE_VINCENT_CANAPES_WITH_PROPERTIES
GROUP BY
    PRODUCT_PROPERTY_ID,
    PRODUCT_PROPERTY_VALUE,
    PRODUCT_ID,
    PRODUCT_CATEGORY_LEVEL4_ID,
    PRODUCT_CATEGORY_LEVEL4_NAME,
    PRODUCT_LONG_DESCRIPTION;
"""
secrets = config.load_secrets('queries/secrets.yml', key='snowflake')
with get_snowflake_connection(**secrets) as con:
    co_viewed_df = query_co_viewed_df(con)
    product_df = query_canape_product_df(con)
    edges = query_snowflake_to_df(sql_query_or_path=query, con=con)
print('-' * 100)
print(f'# of edges : {len(edges)}')
print(f'# of product nodes : {edges.product_id.nunique()}')
print(f'# of property nodes : {edges.product_property_value.nunique()}')
print(edges.columns)
print()

# Create graph
G = nx.Graph()
colors = {'product_property_value': 'green', 'product_id': 'red'}
for node_type, color in colors.items():
    G.add_nodes_from(edges[node_type].unique(), color=[color] * edges[node_type].nunique())
G.add_edges_from(edges[['product_property_value', 'product_id']].to_records(index=False).tolist())
G.add_edges_from(co_viewed_df[['ref_id', 'related_id']].to_records(index=False).tolist())
# Run community algorithm
clustering = nx.community.louvain_communities(G, seed=42)

# Format clustering results
clusters = []
for cluster_label, products in enumerate(clustering):
    clusters.append(
        pd.DataFrame({
            'cluster': [f'cluster_{cluster_label}'] * len(products),
            'product_id': list(products),
        })
    )
clusters = pd.concat(clusters)
print(clusters.columns)
print('-' * 100)
print(f'Number of clusters : {clusters.cluster.nunique()}')
print()

# Print cluster sizes
print('-' * 100)
print('Cluster sizes:')
print(clusters.groupby('cluster').size().sort_values(ascending=False))
print()

# Count property nodes within the clusters
n_property_nodes_by_cluster = (
    clusters
    .loc[lambda x: x.product_id.str.contains('=')]
    .groupby('cluster').size()
    .sort_values(ascending=False)
)
print('-' * 100)
print('Number of property nodes within the cluster :')
print(n_property_nodes_by_cluster)
print()

# Assign product categories to cluster's product nodes
product_categories = edges[['product_id', 'product_category_level4_id', 'product_category_level4_name', 'product_long_description']].drop_duplicates()
clusters = clusters.merge(right=product_categories, how='inner', on='product_id')
# print('-' * 100)
# print('')
# print(clusters.head(5))
# print()

# Print product categories present inside cluster 0
print('-' * 100)
print('Main categories inside cluster 0')
print(clusters.groupby('cluster').product_category_level4_name.value_counts()['cluster_0'])
print()

# Find property nodes connected to each cluster (edges with starting node being a product node belonging to a given cluster)
clusters_with_properties = clusters.merge(right=edges[['product_id', 'product_property_value']], how='inner', on='product_id')
# print(clusters_with_properties.head(5))


# Print top 3 property nodes connected to cluster 0 (Property "nature" of the cluster)
print('-' * 100)
print('Main properties connected to cluster 0')
print(clusters_with_properties.groupby('cluster').product_property_value.value_counts()['cluster_0'].head(3))
print()

NameError: name 'config' is not defined

In [11]:
product_restrained_df.head()

Unnamed: 0_level_0,product_name,product_long_description,brand_name,product_category_name,cluster,product_category_level4_id,product_category_level4_name,product_property_value
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AUC3094814788770,"Couvre pour Canapé,Housses de canapé avec Jupe...","Couvre pour Canapé,Housses de canapé avec Jupe...",,HOUSSE DE CANAPE,cluster_11,1000001706,HOUSSE DE CANAPE,Couleur principale=Bleu
AUC1694790220911,"Housse De Canapé Dangle Matelassé,4-3-2-1 Plac...",Housse de Canapé Dangle Matelassé : La housse ...,,HOUSSE DE CANAPE,cluster_2,1000001706,HOUSSE DE CANAPE,Couleur principale=Jaune
AUC3028235866830,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,,CANAPE - SOFA - DIVAN,cluster_4,1000002639,CANAPE - SOFA - DIVAN,Matière du revêtement=Tissu
AUC3028235866830,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,,CANAPE - SOFA - DIVAN,cluster_4,1000002639,CANAPE - SOFA - DIVAN,Type de canapé=Fixe
AUC3028235866830,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...,,CANAPE - SOFA - DIVAN,cluster_4,1000002639,CANAPE - SOFA - DIVAN,Couleur principale=Rouge


In [3]:
merged_product_df = product_df.merge(clusters_with_properties, on=['product_id', 'product_long_description'], how='inner').drop_duplicates(subset=['product_id'])
merged_product_df.set_index("product_id", inplace=True)

In [4]:
product_restrained_df = merged_product_df[~merged_product_df['cluster'].isnull()]

In [5]:
product_restrained_df['name_descr'] = product_restrained_df.product_name +  ". " + product_restrained_df.product_long_description + ". "

In [6]:
len(product_restrained_df)

102877

In [6]:
cluster_serie = product_restrained_df.groupby("cluster").name_descr.sum()

In [15]:
cluster_serie

cluster
cluster_0     Housse de canapé élastique Housse de canapé él...
cluster_1     Ensemble de canapés 2pcs 72x78x74cm - 1 fauteu...
cluster_10    Tbest housse de canapé New Hot 7 Solide Pure C...
cluster_11    Couvre pour Canapé,Housses de canapé avec Jupe...
cluster_12    JIM-7329026656651-Couch Sofa Cover, Cushion Co...
cluster_13    Atyhao Canapé pour enfants Gris clair Peluche ...
cluster_2     Housse De Canapé Dangle Matelassé,4-3-2-1 Plac...
cluster_3     Meilleurs Meubles Canapé-lit réglable avec 2 o...
cluster_4     ZHG- MODE&CHIC 3056621 2 Piece Sofa Set Fabric...
cluster_5     Canapé 3 places convertible clic-clac en tissu...
cluster_6     Housse de canapé Extensible Décoration du Mais...
cluster_7     Canapé d'angle RIO Convertible avec coffre en ...
cluster_8     Ensemble de canapés 2 pcs avec coussins Rotin ...
cluster_9     ESTINK Canapé pour enfants à 2 places Crème Pe...
Name: name_descr, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
import numpy as np
import scipy.sparse as sp


class ClassTfidfTransformer(TfidfTransformer):
    """
    A Class-based TF-IDF procedure using scikit-learns TfidfTransformer as a base.

    ![](../algorithm/c-TF-IDF.svg)

    c-TF-IDF can best be explained as a TF-IDF formula adopted for multiple classes
    by joining all documents per class. Thus, each class is converted to a single document
    instead of set of documents. The frequency of each word **x** is extracted
    for each class **c** and is **l1** normalized. This constitutes the term frequency.

    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus
    the average number of words per class **A** divided by the frequency of word **x**
    across all classes.

    Arguments:
        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure
                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:
                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
                               Helps to reduce the impact of words that appear too frequently.

    Examples:

    ```python
    transformer = ClassTfidfTransformer()
    ```
    """
    def __init__(self, bm25_weighting: bool = False, reduce_frequent_words: bool = False):
        self.bm25_weighting = bm25_weighting
        self.reduce_frequent_words = reduce_frequent_words
        super(ClassTfidfTransformer, self).__init__()

    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts.
            multiplier: A multiplier for increasing/decreasing certain IDF scores
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = np.float64

        if self.use_idf:
            _, n_features = X.shape

            # Calculate the frequency of words across all classes
            df = np.squeeze(np.asarray(X.sum(axis=0)))

            # Calculate the average number of samples as regularization
            avg_nr_samples = int(X.sum(axis=1).mean())

            # BM25-inspired weighting procedure
            if self.bm25_weighting:
                idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))

            # Divide the average number of samples by the word frequency
            # +1 is added to force values to be positive
            else:
                idf = np.log((avg_nr_samples / df)+1)

            # Multiplier to increase/decrease certain idf scores
            if multiplier is not None:
                idf = idf * multiplier

            self._idf_diag = sp.diags(idf, offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=dtype)

        return self

    def transform(self, X: sp.csr_matrix):
        """Transform a count-based matrix to c-TF-IDF

        Arguments:
            X (sparse matrix): A matrix of term/token counts.

        Returns:
            X (sparse matrix): A c-TF-IDF matrix
        """
        if self.use_idf:
            X = normalize(X, axis=1, norm='l1', copy=False)

            if self.reduce_frequent_words:
                X.data = np.sqrt(X.data)

            X = X * self._idf_diag

        return X
    

class CustomClassTfidfTransformer(ClassTfidfTransformer):
    def __init__(self, use_idf: bool = False, bm25_weighting: bool = False, reduce_frequent_words: bool = False):
        super(CustomClassTfidfTransformer, self).__init__(
            bm25_weighting=bm25_weighting,
            reduce_frequent_words=reduce_frequent_words
        )
        self.use_idf = use_idf
        
    def transform(self, X):
        X = normalize(X, axis=1, norm='l1', copy=False)

        if self.use_idf:

            if self.reduce_frequent_words:
                X.data = np.sqrt(X.data)

            X = X * self._idf_diag

        return X

In [22]:
def get_top_tf_idf_words_2(feature_array, response, threshold, top_n=2, print_associated_threshold=False):
    response_normalized = custom_norm(response)
    response_normalized.data[response_normalized.data < 0] = 0.0 # TODO: Replace 0 with threshold
    response_normalized.eliminate_zeros()
    
    sorted_nzs = np.argsort(response_normalized.data)[:-(top_n+1):-1]
    
    keywords = feature_array[response_normalized.indices[sorted_nzs[response_normalized.indices[sorted_nzs] > 0]]] # TODO: Replace 0 with threshold
    if not print_associated_threshold:
        return keywords

    tfidf_scores = response_normalized.data[sorted_nzs]
    res = list(zip(keywords, tfidf_scores))
    return res

def custom_norm(x):
    norm = x.sum(axis=1)
    return x / norm


# Vector is the string to apply tf-idf on.
def apply_tf_idf_2(vector, vectorizer, threshold, top_n=2, print_associated_threshold=False):
    v = CustomClassTfidfTransformer(use_idf=True)
    x = v.fit_transform(vector)
    feature_array = np.array(vectorizer.get_feature_names_out())
    tfidf_sorting = np.argsort(x.toarray()).flatten()[::-1]
    top_words_per_cluster_tfidf = [get_top_tf_idf_words_2(feature_array, item, threshold, top_n, print_associated_threshold) for item in x]
    return top_words_per_cluster_tfidf

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
print(os.getcwd())
from Icarusight.Icarusight.vectorizer.comunities_naming import apply_tf_idf
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=nltk.corpus.stopwords.words('french'), min_df=0.05, max_df=0.7)
vector = vectorizer.fit_transform(cluster_serie)
dominant_words = apply_tf_idf_2(vector, vectorizer, threshold=0.001, top_n=5, print_associated_threshold=True)
for dominant_word in dominant_words:
    print(dominant_word)

/home/jupyter/Icarusight
[('chrome', 0.012115407431163315), ('bouteille', 0.007353176579070996), ('115x60x67', 0.004530119928015576), ('émeraude', 0.003910951311076582), ('verte', 0.0039107190586954105)]
[('queen', 0.005982745590004781), ('personnalisable', 0.005819566959259014), ('polychlorure', 0.0037035824156580804), ('vinyle', 0.003701746269223654), ('positionnez', 0.0034098629792197236)]
[('monoplaces', 0.0157093318974846), ('complétez', 0.007154690428549345), ('a341796', 0.005625706951867143), ('sécuritaire', 0.00538946697069688), ('a341844', 0.004559713715820153)]
[('indigo', 0.0070768360130667755), ('chrome', 0.00478876175380354), ('collant', 0.0040036873904282155), ('illustrer', 0.0040036873904282155), ('vendues', 0.004000481418860051)]
[('résine', 0.007195334958122552), ('tressée', 0.007144034989381584), ('rotin', 0.005134699290375354), ('mengyyshop', 0.004271016315226769), ('jersey', 0.004028496464307161)]
[('étoiles', 0.008095981968049091), ('uyeoco', 0.006114437336541146),

In [61]:
from Icarusight.Icarusight.flatXcoviewed_model.comunities_naming import find_dominant_words, apply_tf_idf

cluster_list = clusters["cluster"].drop_duplicates().tolist()

for cluster_id in cluster_list:
    cluster_serie = clusters.loc[clusters['cluster'] == cluster_id, "product_long_description"]
    descriptions = cluster_serie.tolist()
    dominant_words = find_dominant_words(descriptions)
    print(cluster_id, dominant_words)

Main Theme Key Phrase: cm, totales, canapé, revêtement, cadre
cluster_0 cm, totales, canapé, revêtement, cadre
Main Theme Key Phrase: siège, cm, partir, sol, totales
cluster_1 siège, cm, partir, sol, totales
Main Theme Key Phrase: housse, canapé, extensible, taille, places
cluster_2 housse, canapé, extensible, taille, places
Main Theme Key Phrase: coussin, cm, canapé, dimensions, siège
cluster_3 coussin, cm, canapé, dimensions, siège
Main Theme Key Phrase: canapé, housse, places, coussin, tissu
cluster_4 canapé, housse, places, coussin, tissu
Main Theme Key Phrase: siège, partir, sol, cm, cadre
cluster_5 siège, partir, sol, cm, cadre
Main Theme Key Phrase: siège, coussin, partir, sol, cm
cluster_6 siège, coussin, partir, sol, cm
Main Theme Key Phrase: cm, canapé, mousse, tissu, dimensions
cluster_7 cm, canapé, mousse, tissu, dimensions
Main Theme Key Phrase: facile, revêtement, cadre, totales, pieds
cluster_8 facile, revêtement, cadre, totales, pieds
Main Theme Key Phrase: siège, cm, c