In [1]:
import pandas as pd
import numpy as np
from Icarusight.queries.get_queries import query_canape_product_df, query_product_properties_df
from Icarusight.Icarusight.utils import *
from Icarusight.Icarusight.property_value_model.property_value_model import create_property_value_graph
from Icarusight.Icarusight.clustering import get_cluster_labels, apply_clustering_algorithm, remove_meaningless_clusters
from Icarusight.Icarusight.vectorizer.custom_tfidf_transformer_class import ClassTfidfTransformer
from collections import Counter
from nltk.corpus import stopwords
from cdiscount import snowflake, config
import os

In [2]:
os.getcwd()

'/home/jupyter'

In [2]:
os.chdir("Icarusight")

In [3]:
secrets = config.load_secrets('queries/secrets.yml', key='snowflake')
con = snowflake.get_snowflake_connection(**secrets)
print("Starting queries.", end="")
product_df = query_canape_product_df(con)
print(".", end="")
product_property_df = query_product_properties_df(con)
print(". Done, length of product_df is:", len(product_df))

Starting queries... Done, length of product_df is: 102877


In [7]:
#print(refined_str(product_property_df['product_ids'][0]))

In [8]:
import networkx as nx

from itertools import combinations

def add_and_update_edges_from_list(g, input_list, property_id):
    print(combinations(input_list, 2))
    for product_1, product_2 in combinations(input_list, 2):
        weight = 1
        edge_data = g.get_edge_data(product_1, product_2)
        if edge_data is not None:
            weight = edge_data.get("weight", 1) + 1
        g.add_edge(product_1, product_2, weight=weight, relationship=property_id)


def add_nodes(g, product_df):
    for product_id in tqdm(product_df['product_id'], total=len(product_df['product_id'])):
        g.add_node(product_id)


def create_property_value_graph(product_df, product_property_df):
    g = nx.Graph()
    print('Adding nodes...')
    add_nodes(g, product_df)
    print('Nodes added, adding edges...')
    for i, row in tqdm(product_property_df.iterrows(), total=len(product_property_df)):
        add_and_update_edges_from_list(g, refined_str(row['product_ids']), row['property_id'])
    return g

In [9]:
from tqdm import tqdm
print("Starting graph creation")
g = create_property_value_graph(product_df, product_property_df)
print("Graph creation done.", f"got {len(g.nodes)} nodes and {len(g.edges)} edges.")
print("Starting clustering...")
partitions = apply_clustering_algorithm(g)
print("Clustering done, removing the meaningless clusters...")
partitions = remove_meaningless_clusters(partitions, n=3)
print("Removing done.")

Starting graph creation
Adding nodes...


100%|██████████| 102877/102877 [00:00<00:00, 739212.25it/s]


Nodes added, adding edges...


  0%|          | 0/46 [00:00<?, ?it/s]

<itertools.combinations object at 0x7fba24804e00>


  0%|          | 0/46 [00:13<?, ?it/s]


KeyboardInterrupt: 

In [34]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('product_df', 98424522),
 ('product_property_df', 4512254),
 ('ClassTfidfTransformer', 1072),
 ('Counter', 1072),
 ('Network', 1072),
 ('combinations', 408),
 ('defaultdict', 408),
 ('add_and_update_edges_from_list', 144),
 ('add_nodes', 144),
 ('apply_clustering_algorithm', 144),
 ('cleaning_strings', 144),
 ('create_property_value_graph', 144),
 ('df2features', 144),
 ('get_cluster_labels', 144),
 ('open', 144),
 ('query_canape_product_df', 144),
 ('query_product_properties_df', 144),
 ('refined_str', 144),
 ('remove_meaningless_clusters', 144),
 ('str2value_mapping', 144),
 ('write_html_from_nx', 144),
 ('config', 72),
 ('np', 72),
 ('nx', 72),
 ('pd', 72),
 ('plt', 72),
 ('brands_mapping', 64),
 ('categories_mapping', 64),
 ('colors_mapping', 64),
 ('name_mapping', 64),
 ('partitions', 64),
 ('sellers_mapping', 64),
 ('con', 48),
 ('g', 48),
 ('nlp', 48),
 ('stopwords', 48)]

In [8]:
# Estimating the total number of edges.
count = 0
for i, row in product_property_df.iterrows():
    n = len(refined_str(row['product_ids']))
    count += (n*(n-1))/2
print(count)

1775670678.0


In [23]:
def save_results(path_to_save, cluster_infos, cluster_top_words_count, cluster_top_words_tfidf):
    # Create a DataFrame from the dictionaries
    cluster_labels, cluster_max_values, cluster_total_len = cluster_infos[0], cluster_infos[1], cluster_infos[2]
    df = pd.DataFrame({
        'cluster_id': list(cluster_labels.keys()),
        'label': list(cluster_labels.values()),
        'max_values': list(cluster_max_values.values()),
        'total_len': list(cluster_total_len.values()),
        'top_words_count_method': list(cluster_top_words_count),
        'top_words_tfidf_method': list(cluster_top_words_tfidf)
    })

    # Write the DataFrame to a CSV file
    df.to_csv(path_to_save, index=False)

In [30]:
#product_df.set_index("product_id", inplace=True)

product_df['cluster_id'] = partitions
product_restrained_df = product_df[~product_df['cluster_id'].isnull()]
product_restrained_df.astype({'cluster_id': int})
product_restrained_df['name_descr'] = product_restrained_df.product_name +  ". " + product_restrained_df.product_long_description + ". "
cluster_serie = product_restrained_df.groupby("cluster_id").name_descr.sum()
print(cluster_serie)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_restrained_df['name_descr'] = product_restrained_df.product_name +  ". " + product_restrained_df.product_long_description + ". "


cluster_id
1.0        ZHI Canapé lit à 2 places Noir Tissu 738650263...
68.0       Canapé lit élégant Canapé moderne Intérieur mo...
87.0       Canapé convertible 3 places avec coffre de ran...
144.0      MOO Canapé lit à 2 places Vert foncé Velours 7...
162.0      Canapé lit à 2 places et deux oreillers Rouge ...
184.0      Canapé 3 places et fauteuil CHESTERFIELD Simil...
226.0      Canapé lit STAR scandinave Sofa convertible av...
715.0      Canapé Chesterfield en forme de L Cuir synthét...
1058.0     Banquette lit BZ matelas HR 140 cm YASMO n 2. ...
1357.0     Canapé d angle panoramique Dante en U en velou...
2264.0     Beliani Chaise longue rose poudré côté droit M...
2377.0     Canapé lit à 2 places Gris clair Velours DIOCH...
4303.0     Micadoni Home JUSTIN Canapé d angle 4 places e...
5138.0     Canapé lit à 2 places Marron Microfibre SALALI...
5407.0     YaJiaSheng Ensemble de canapés à 2 et à 3 plac...
5470.0     LVL MEUBLE SOFA Canapé lit à 2 places Rose Vel...
5505.0     AB

In [24]:
def get_top_tf_idf_words(response, threshold, top_n=2):
    #response_normalized = normalize(response, axis=1, norm='l2')
    response_normalized = custom_norm(response)
    response_normalized.data[response_normalized.data < threshold] = 0.0
    response_normalized.eliminate_zeros()
    #print(response_normalized.data)
    sorted_nzs = np.argsort(response_normalized.data)[:-(top_n+1):-1]
    res = feature_array[response_normalized.indices[sorted_nzs[ response_normalized.indices[sorted_nzs] > threshold]]]
    return np.apply_along_axis(' | '.join, 0, res)

def custom_norm(x):
    norm = x.sum(axis=1)
    return x / norm

from sklearn.preprocessing import normalize

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
 
# COUNT VECTORIZER METHOD
vectorizer = CountVectorizer(stop_words=nltk.corpus.stopwords.words('french'))
vector = vectorizer.fit_transform(cluster_serie)
top_words_per_cluster_count = [get_top_tf_idf_words(item, 0.015, 5) for item in vector]
for top_words in top_words_per_cluster_count:
    print(top_words)

NameError: name 'nltk' is not defined

In [26]:
class CustomClassTfidfTransformer(ClassTfidfTransformer):
    def __init__(self, use_idf: bool = False, bm25_weighting: bool = False, reduce_frequent_words: bool = False):
        super(CustomClassTfidfTransformer, self).__init__(
            bm25_weighting=bm25_weighting,
            reduce_frequent_words=reduce_frequent_words
        )
        self.use_idf = use_idf
        
    def transform(self, X):
        X = normalize(X, axis=1, norm='l1', copy=False)

        if self.use_idf:

            if self.reduce_frequent_words:
                X.data = np.sqrt(X.data)

            X = X * self._idf_diag

        return X

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TFIDF VECTORIZER METHOD
v = CustomClassTfidfTransformer(use_idf=True)
x = v.fit_transform(vector)
feature_array = np.array(vectorizer.get_feature_names_out())
tfidf_sorting = np.argsort(x.toarray()).flatten()[::-1]
top_words_per_cluster_tfidf = [get_top_tf_idf_words(item, 0.015, 5) for item in x]
for top_words in top_words_per_cluster_tfidf:
    print(top_words)

NameError: name 'vector' is not defined

In [37]:
cluster_infos = get_cluster_labels(g, partitions)
print("Saving results...")
save_results("results/results_threshold_at_0.015_cutoff_2.csv", cluster_infos, top_words_per_cluster_count, top_words_per_cluster_tfidf)
print(os.getcwd())
print("Results saved.")

100%|██████████| 10731/10731 [07:43<00:00, 23.16it/s]
100%|██████████| 31/31 [00:00<00:00, 302380.06it/s]

Saving results...
/home/jupyter/Icarusight
Results saved.



