In [1]:
from custom_tf_idf_transformer_class import CustomClassTfidfTransformer
from write_csv_and_push_data import cleaning_strings

from annoy import AnnoyIndex
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# Fetch the NLTK stop-word corpus; the French list is read later through
# nltk.corpus.stopwords.words('french').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

GET TABLES

In [2]:
from cdiscount import config
from cdiscount import snowflake

# Load the Snowflake credentials stored under the 'snowflake' key of secrets.yml.
secrets = config.load_secrets('secrets.yml', key='snowflake')
with snowflake.get_snowflake_connection(**secrets) as con:
    # Two local CSV exports; they are read while the connection is open but do not use `con`.
    # NOTE(review): julien_product_seller_df is not referenced again in the visible notebook.
    julien_product_seller_df = pd.read_csv("/home/jupyter/20231123_stage_vincent_product_seller_edges.csv", sep=",", engine="python")
    julien_products_df = pd.read_csv("/home/jupyter/20231123_stage_vincent_products.csv", sep=",", engine="python")
    # Query summary:
    #   * table_julien: search ids whose category filter path matches a level-3 category of the
    #     perimeter table, restricted to snapshot 2023-11-24, site 100, A/B group 'A', and to
    #     traffic bin 9 (FLOOR(cumulative view share * 10) = 9, rows ordered by views descending).
    #   * FULL_DATA: distinct product ids aggregated per (search_id, page_number) for those searches.
    #   * Final SELECT keeps the rows whose product list is not empty.
    # NOTE(review): TRAFFIC groups by SEARCH_ID *and* SNAPSHOT_DATE while selecting only SEARCH_ID,
    # so one search id can yield one row per day in the cumulative ranking - confirm this is intended.
    # NOTE(review): FULL_DATA filters on snapshot_date >= '2023-11-24' with no upper bound, so the
    # result depends on the day the query is run - confirm.
    lr_df = snowflake.query_snowflake_to_df(
    """
   WITH table_julien AS (
    WITH
        CATEGORY_LEVEL4_PERIMETER AS (
            SELECT DISTINCT PRODUCT_CATEGORY_LEVEL4_ID AS PRODUCT_CATEGORY_LEVEL4_ID
            FROM QUALISCORE_LAB.LAB_STAGE_VINCENT_CANAPES_WITH_PROPERTIES
        ),
        CATEGORY_LEVEL3_PERIMETER AS (
            SELECT DISTINCT
                C.PRODUCT_CATEGORY_LEVEL3_ID,
                C.PRODUCT_CATEGORY_LEVEL4_ID
            FROM CATEGORY_LEVEL4_PERIMETER AS P
            INNER JOIN REFERENTIEL_SMT.SMT_RFL_DIM_PRODUCT_CATEGORY AS C
                ON P.PRODUCT_CATEGORY_LEVEL4_ID = C.PRODUCT_CATEGORY_LEVEL4_ID
                AND PRODUCT_CATEGORY_DEPTH = 4
        ),
        CATEGORY_LEVEL3_PATH_PERIMETER AS (
            SELECT
                P.PRODUCT_CATEGORY_LEVEL3_ID,
                P.PRODUCT_CATEGORY_LEVEL4_ID,
                C.PRODUCT_CATEGORY_CODE_PATH
            FROM CATEGORY_LEVEL3_PERIMETER AS P
            INNER JOIN REFERENTIEL_SMT.SMT_RFL_DIM_PRODUCT_CATEGORY AS C
                ON P.PRODUCT_CATEGORY_LEVEL3_ID = C.PRODUCT_CATEGORY_LEVEL3_ID
                AND PRODUCT_CATEGORY_DEPTH = 3
        ),
        TRAFFIC AS (
            SELECT
                SEARCH_ID,
                SUM(VIEW_COUNT::FLOAT) AS VIEW_COUNT,
                SUM(CLICK_COUNT) AS CLICK_COUNT,
                SUM(TURNOVER) AS TURNOVER
            FROM SEARCH_SMT.SMT_SCH_AGG_SEARCH_LIST_TRACKING_KPI
            WHERE SNAPSHOT_DATE BETWEEN DATE('2023-11-24') - 119 AND DATE('2023-11-24')
                AND SITE_ID = 100
                AND AB_TESTING_GROUP = 'A'
                AND SEARCH_ID <> ''
                AND SEARCH_ID IS NOT NULL
            GROUP BY SEARCH_ID, SNAPSHOT_DATE
        ),
        CUMULATIVE_TRAFFIC AS (
            SELECT
                SEARCH_ID,
                (
                    SUM(VIEW_COUNT)
                    OVER(
                        ORDER BY VIEW_COUNT DESC, TURNOVER DESC, CLICK_COUNT DESC
                        ROWS UNBOUNDED PRECEDING
                    )
                ) / (
                    SUM(VIEW_COUNT)
                    OVER()
                ) AS CUM_QP_TRAFFIC
            FROM TRAFFIC
        ),
        TRAFFIC_BINS AS (
            SELECT
                SEARCH_ID,
                FLOOR(
                    IFF(
                        CUM_QP_TRAFFIC = 1,
                        CUM_QP_TRAFFIC - 1e-5,
                        CUM_QP_TRAFFIC
                    ) * 10
                ) AS SEARCH_ID_GROUP
            FROM CUMULATIVE_TRAFFIC
        )
    SELECT
        LR.SEARCH_ID,
        P.PRODUCT_CATEGORY_CODE_PATH
    FROM CATEGORY_LEVEL3_PATH_PERIMETER AS P
    INNER JOIN SEARCH_SMT.SMT_SCH_DIM_SEARCH_LIST_CATEGORY_FILTER AS LR
        ON P.PRODUCT_CATEGORY_CODE_PATH = LR.CATEGORY_FILTER_PATH_CODE
        AND LR.SNAPSHOT_DATE = '2023-11-24'
        AND LR.SITE_ID = 100
        AND LR.AB_TESTING_SVC_COOKIE_GROUP = 'A'
    INNER JOIN TRAFFIC_BINS AS T
        ON LR.SEARCH_ID = T.SEARCH_ID
    WHERE SEARCH_ID_GROUP = 9),
    FULL_DATA AS (
    SELECT     rerank.search_id,
               ARRAY_AGG(DISTINCT rerank.product_id) AS product_id_list,
               page_number
            FROM SEARCH_SMT.SMT_SCH_AGG_SEARCH_LIST_PRODUCT_VIEW AS rerank
            JOIN table_julien
                ON table_julien.search_id = rerank.search_id
                WHERE rerank.site_id = 100
                AND rerank.snapshot_date >= DATE('2023-11-24')
            GROUP BY (rerank.search_id, page_number))
    SELECT * FROM full_data WHERE ARRAY_SIZE(product_id_list) >=1;
    """, con=con)

In [4]:
import ast

# product_id_list is parsed from its textual form. ast.literal_eval only accepts Python
# literals, so arbitrary code coming from the data cannot run (eval would execute it).
# Values that are already parsed are left untouched, which makes the cell safe to re-run.
lr_df["product_id_list"] = lr_df.product_id_list.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# One row per (search list row, product); the reset_index() keeps the originating lr_df
# row label in an 'index' column. Only the first occurrence of each product is kept.
exploded_lr_df = lr_df.explode("product_id_list").reset_index()
exploded_lr_df.drop_duplicates(subset=["product_id_list"], inplace=True)

# Attach the search-list information to the product catalogue.
full_df = julien_products_df.merge(exploded_lr_df, left_on="product_id", right_on="product_id_list")
full_df.rename(columns={'fp_product_name':'product_name', 'product_long_description':'description'}, inplace=True)

# Assign the filled column back explicitly: an in-place fillna on `full_df.description`
# acts on a temporary object and is not propagated to full_df under pandas Copy-on-Write.
full_df["description"] = full_df["description"].fillna("")
full_df["description"] = cleaning_strings(full_df["description"])
full_df["product_name"] = cleaning_strings(full_df["product_name"])
full_df.drop(columns=['product_id_list',
                      'product_short_description',
                      'product_properties',
                      'preprocessed_product_properties',
                      'product_marketing_description',
                      'total_token_fp_product_name',
                      'total_token_product_long_description',
                      'total_token_fp_content',
                      #'community_id'
                     ], inplace=True)
#exploded_lr_df['community_id'] = full_df.community_id

ANNOY-ING NEIGHBORS

In [None]:
def id2product_id(index):
    """Return the product_id of the row sitting at position `index` in full_df."""
    row = full_df.iloc[index]
    return row.product_id

def get_neighbors(u, i, k):
    """Return up to `k` nearest neighbours of item `i` as a ranked DataFrame.

    Parameters
    ----------
    u : object exposing ``get_nns_by_item(i=..., n=..., include_distances=True)``
        (an Annoy index in this notebook).
    i : int
        Item id to query.
    k : int
        Maximum number of neighbours to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (always `i`), ``neighbor_id``, ``distance`` and ``rank``
        (1 = first neighbour returned by the index). The query item itself is removed.
    """
    # Ask for one extra result because the query item is usually returned as well.
    neighbors, distances = u.get_nns_by_item(i=i, n=k+1, include_distances=True)
    ranked = (
        pd.DataFrame({
            'id': i,
            'neighbor_id': neighbors,
            'distance': distances
        })
        .loc[lambda x: x.neighbor_id.ne(i)]
        # The query item is not guaranteed to be among the results (nor are k+1 results
        # guaranteed), so cap at k rows and size the rank from what is actually left
        # instead of assuming exactly k rows remain.
        .head(k)
    )
    return ranked.assign(rank=list(range(1, len(ranked) + 1)))

In [None]:
# Build a smaller Annoy index `u` holding only the vectors of the products kept in full_df.
# NOTE(review): full_df["index"] is used as the item id in the original Annoy index - confirm
# that this column really carries those ids.
index_list = full_df["index"]
vec_dim = 1536
original_annoy_index = AnnoyIndex(vec_dim, 'dot')
original_annoy_index.load('/home/jupyter/20231123_stage_vincent_products_fp_content.ann')
u = AnnoyIndex(vec_dim, 'dot')
# Items are renumbered 0..n-1 in full_df row order, so a new id is a row position in full_df.
for new_idx, i in enumerate(tqdm(index_list, total=len(index_list))):
    u.add_item(new_idx, original_annoy_index.get_item_vector(i))
u.build(100)

In [None]:
saving_csv=True

# 10 nearest neighbours of every item of the sub-index, stacked in one frame.
all_neighbors = pd.concat(
    [get_neighbors(u=u, i=i, k=10) for i in tqdm(range(0, u.get_n_items()))]
)

# Translate positional item ids into product ids with a single array lookup instead of one
# full_df.iloc[...] row materialisation per row (same mapping as id2product_id).
product_ids = full_df["product_id"].to_numpy()
all_neighbors['id'] = product_ids[all_neighbors['id'].to_numpy()]
all_neighbors['neighbor_id'] = product_ids[all_neighbors['neighbor_id'].to_numpy()]

# Rescale the distances to [0, 1] over the whole table.
scaler = MinMaxScaler()
all_neighbors['distance'] = scaler.fit_transform(all_neighbors[['distance']])
if saving_csv:
    all_neighbors.to_csv("csv_files/lr_product_product.csv", index=False, sep='\u0001')

# all_neighbors[["id", "neighbor_id"]].apply(lambda x: " | ".join(np.sort(x)), axis=1).nunique()

In [None]:
# Sanity check: print one product name followed by the names of its neighbours.
product_id = 'EJL1694120625827'

query_rows = full_df[full_df.product_id == product_id]
print(query_rows.product_name.item())
neighbor_ids = all_neighbors[all_neighbors.id == product_id].neighbor_id.tolist()
for neighbor in neighbor_ids:
    neighbor_rows = full_df[full_df.product_id == neighbor]
    print("    ", neighbor_rows.product_name.item())

PUSH TO NEO4J

In [None]:
import os
from getpass import getpass

from neo4j import GraphDatabase, basic_auth

# Credentials are no longer stored in the notebook: the password is read from the
# NEO4J_PASSWORD environment variable, or asked for interactively when it is not set.
# The password that used to be hard-coded here should be treated as leaked and rotated.
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri="bolt://a08datasc002.cdbdx.biz:7687", auth=(username, password))

# Deletes every node (and its relationships) of the database.
cleaning_all = '''
MATCH (n)
DETACH DELETE n
'''
products_idx = '''
CREATE INDEX products_constraint IF NOT EXISTS FOR (n:Product) ON n.product_id;
'''
# product_property_value,product_id,product_category_level4_id,product_category_level4_name,product_long_description
create_products_nodes = '''
LOAD CSV WITH HEADERS FROM 'file:///var/lib/neo4j/import/lr_products.csv' AS product_line FIELDTERMINATOR ','
// Create products
MERGE (p:Product
            {product_id: product_line.product_id,
             product_name: COALESCE(product_line.fp_product_name, "empty"),
             description: COALESCE(product_line.product_long_description, "empty")
             })
'''
product_product_relationship = '''
// Create a relationship between the products
CALL apoc.periodic.iterate(
  '
  LOAD CSV WITH HEADERS FROM "file:///var/lib/neo4j/import/lr_product_product.csv" AS line FIELDTERMINATOR "\u0001"
  RETURN line
  ',
  '
  MATCH (p:Product {product_id: line.id})
  MATCH (neighbor:Product {product_id: line.neighbor_id})
  MERGE (p)-[:Neighbors]->(neighbor)
  MERGE (neighbor)-[:Neighbors]->(p)
  ',
  {batchSize: 5000, iterateList: true}
)
'''

product_product_relationship_weighted = '''
// Create a weighted relationship between the products
CALL apoc.periodic.iterate(
  '
  LOAD CSV WITH HEADERS FROM "file:///var/lib/neo4j/import/lr_product_product.csv" AS line FIELDTERMINATOR "\u0001"
  RETURN line
  ',
  '
  MATCH (p:Product {product_id: line.id})
  MATCH (neighbor:Product {product_id: line.neighbor_id})
  MERGE (p)-[r:Neighbors {weight: toFloat(line.distance)}]->(neighbor)
  MERGE (neighbor)-[:Neighbors {weight: toFloat(line.distance)}]->(p)
  ',
  {batchSize: 5000, iterateList: true}
)
'''

# cleaning_before_pushing=True wipes the whole database before loading.
cleaning_before_pushing=True
weighted_relationship=True


def _run_write(session, query):
    """Run one Cypher statement in a managed write transaction and return its records."""
    return session.execute_write(lambda tx: tx.run(query).data())


# try/finally guarantees the driver is closed even when one of the statements fails.
try:
    with driver.session(database="neo4j") as session:
        if cleaning_before_pushing:
            _run_write(session, cleaning_all)
            print("Cleaning done")
        _run_write(session, products_idx)
        print("Indexes done")
        _run_write(session, create_products_nodes)
        print("Product nodes added")
        if weighted_relationship:
            _run_write(session, product_product_relationship_weighted)
        else:
            _run_write(session, product_product_relationship)
        print("product_product relationships added")
finally:
    driver.close()


ANALYSE DE COMMUNAUTES

In [None]:
#from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer

def extract_vocabulary(documents, stop_words, min_len: int = 3, min_df: int = 2):
    """Build the vocabulary of a corpus, most frequent words first.

    Parameters
    ----------
    documents : iterable of str
        Texts; they are concatenated and split on whitespace.
    stop_words : iterable of str
        Tokens to ignore.
    min_len : int
        Minimum number of characters a token needs to be kept.
    min_df : int
        Minimum number of occurrences over the whole corpus (total count across
        documents, not the number of documents containing the token).

    Returns
    -------
    list of str
        Kept tokens by decreasing count; ties keep first-appearance order.
    """
    # Set membership is O(1); the stop-word list has several hundred entries and is
    # checked once per token of the corpus.
    excluded = frozenset(stop_words)
    counts = Counter(token for token in ' '.join(documents).split() if token not in excluded)
    return [
        token
        for token, occurrences in counts.most_common()
        if occurrences >= min_df and len(token) >= min_len
    ]

def merge_columns(row):
    """Build one text from a row: its product names joined by spaces, a space, then its description."""
    joined_names = ' '.join(row['product_name_list'])
    return "{} {}".format(joined_names, row['description'])

def get_top_words(response, threshold, feature_array):
    """Return the features whose normalised weight exceeds `threshold`, joined by ' | '.

    `response` must expose ``toarray()`` (one sparse row in this notebook); its dense form
    is normalised with custom_norm and flattened. `feature_array` is a NumPy array of
    feature names aligned with the columns of `response`. Features are ordered by
    decreasing weight. The result comes from ``np.apply_along_axis`` (a NumPy value
    holding the joined string).
    """
    #response_normalized = normalize(response, axis=1, norm='l2')
    weights = custom_norm(response.toarray()).reshape(-1)
    # Indices of the weights from largest to smallest.
    descending = np.argsort(weights.data)[::-1]
    ranked_features = feature_array[descending]
    above_threshold = weights[descending] > threshold
    return np.apply_along_axis(' | '.join, 0, ranked_features[above_threshold])

def custom_norm(x):
    """Divide every row of the 2-D array `x` by its own sum (L1 row normalisation).

    Parameters
    ----------
    x : numpy.ndarray of shape (n_rows, n_cols)

    Returns
    -------
    numpy.ndarray
        Same shape as `x`. A row whose sum is 0 yields NaN/inf values (plain NumPy
        division semantics).
    """
    # keepdims=True keeps the sums as a column (n_rows, 1) so that each row is divided by
    # its own total; a flat (n_rows,) vector would broadcast along the columns instead and
    # is only correct when x has a single row.
    norm = x.sum(axis=1, keepdims=True)
    return x / norm

# TF-IDF METHOD
def add_tf_idf_words(document_list, stopwords, threshold):
    """Return, for each document, its top words according to the custom TF-IDF weights.

    The vocabulary comes from extract_vocabulary, raw counts from CountVectorizer, and
    the weights from CustomClassTfidfTransformer(use_idf=True). Each weighted row is
    passed to get_top_words with `threshold`. Returns one entry per document.
    """
    vocabulary = extract_vocabulary(documents=document_list, stop_words=stopwords)
    counter = CountVectorizer(vocabulary=vocabulary)
    counts = counter.transform(document_list)
    weighted = CustomClassTfidfTransformer(use_idf=True).fit_transform(counts)
    features = np.array(counter.get_feature_names_out())
    top_words = []
    for row in weighted:
        top_words.append(get_top_words(response=row, threshold=threshold, feature_array=features))
    return top_words

# COUNT VECTORIZER METHOD
def add_count_vectorizer_words(document_list, stopwords, threshold):
    """Return, for each document, its top words according to raw term counts.

    The vocabulary comes from extract_vocabulary and the counts from CountVectorizer;
    each count row is passed to get_top_words with `threshold`. Returns one entry per
    document.
    """
    vocabulary = extract_vocabulary(documents=document_list, stop_words=stopwords)
    counter = CountVectorizer(vocabulary=vocabulary)
    counts = counter.transform(document_list)
    features = np.array(counter.get_feature_names_out())
    top_words = []
    for row in counts:
        top_words.append(get_top_words(response=row, threshold=threshold, feature_array=features))
    return top_words

In [None]:
# Clean-up for re-runs: community_id_x / community_id_y only exist after the community
# merge has been executed more than once. errors="ignore" keeps this cell from raising
# KeyError on a fresh kernel where the columns are absent.
full_df = full_df.drop(columns=["community_id_x", "community_id_y"], errors="ignore")

In [None]:
# Creating communities_df

# Stop words: hand-written French function words and pronouns, plus NLTK's French list.
stop_words = ["sous", "sur", "en", "a", "d", "l", "de", "du", "des", "le", "la", "les", "un", "une", "mais", "ou", "et", "donc", "or", "ni", "car", "ce", "se", "ces", "ses", "ne", "pas", "tout", "tous", "toute", "toutes"]
stop_words += ["je", "tu", "il", "elle", "nous", "vous", "ils", "elles"]
stop_words += nltk.corpus.stopwords.words('french')

product_community_df = pd.read_csv("csv_files/communities_analysis/lr_communities_products.csv", sep=",", engine="python")
# Adding the column 'community_id' to full_df. Community columns already present (from a
# previous run of this cell) are dropped first, otherwise the merge would produce
# community_id_x / community_id_y and the group-bys below would fail.
stale_columns = [c for c in ("community_id", "community_id_x", "community_id_y") if c in full_df.columns]
full_df = full_df.drop(columns=stale_columns).merge(product_community_df, on='product_id')

by_community = full_df.groupby('community_id')
communities_df = by_community['description'].apply(lambda x: " | ".join(x)).to_frame()

# Community size (communities with a single product are discarded)
communities_df["community_size"] = by_community.product_id.count()
len_before = len(communities_df)
communities_df = communities_df[communities_df.community_size >= 2].copy()
len_after = len(communities_df)
print(f"len before is:{len_before} | len after is:{len_after}")


# Community Lists
communities_df["product_id_list"] = by_community.product_id.apply(list)
communities_df["product_name_list"] = by_community.product_name.apply(list)

# full_df already carries the search_id and page_number of each product (they come from the
# merge with exploded_lr_df), so the per-community lists are derived from it directly.
# The previous version went through exploded_lr_df['community_id'], a column that is never
# created in this notebook, and failed on a fresh kernel.
searched_products_df = full_df.dropna(subset=["search_id", "page_number"])
communities_df["page_number_list"] = searched_products_df.groupby("community_id").page_number.apply(list)

#communities_df["category_name_list"] = full_df.groupby("community_id").category_name.apply(list)
#communities_df['category_counts'] = communities_df['category_name_list'].apply(lambda x: dict(Counter(x)))


# TF-IDF and CountVectorizer
communities_df['document'] = communities_df.apply(merge_columns, axis=1)
communities_df['count_vectorizer_top_words'] = add_count_vectorizer_words(document_list=communities_df.document, stopwords=stop_words, threshold=0.025)
communities_df['tf_idf_top_words'] = add_tf_idf_words(document_list=communities_df.document, stopwords=stop_words, threshold=0.015)

communities_df = communities_df.sort_values("community_size", ascending=False)


# Get the top 5 search ids
def _top_search_ids(search_ids, n=5):
    """Return the `n` most frequent search ids of a community as 'search_id: share' strings."""
    shares = search_ids.value_counts(normalize=True).head(n)
    return [f"{search_id}: {share}" for search_id, share in shares.items()]


# Building the strings inside the group function avoids relying on the auto-generated
# 'level_1' column name, which depends on the pandas version.
top_search_df = (
    searched_products_df
    .groupby('community_id').search_id
    .apply(_top_search_ids)
    .rename("top_search_id_list")
    .reset_index()
)
# Inner merge: communities without any search id are dropped, as before.
communities_df = communities_df.merge(top_search_df, on='community_id')
communities_df.set_index('community_id', inplace=True)
del searched_products_df, top_search_df

In [None]:
# Display the per-community summary table.
communities_df

In [None]:
# Inspect the products of full_df attached to one search id.
full_df[full_df.search_id == 'protegecanape']

In [None]:
# All neighbors df buffed with communities (product_id/communities/distances mapping)

# Only product_id -> community_id is needed from full_df. Merging the whole frame twice
# copied every text column (names, descriptions, ...) onto each neighbour edge, with
# _x/_y suffixes, for no downstream use.
product_communities = full_df[["product_id", "community_id"]]

communities_distances_df = (
    all_neighbors
    # community of the neighbour
    .merge(
        product_communities.rename(columns={"product_id": "neighbor_id", "community_id": "neighbor_community_id"}),
        on="neighbor_id",
        how="inner",
    )
    # community of the source product
    .merge(
        product_communities.rename(columns={"product_id": "id"}),
        on="id",
        how="inner",
    )
)

In [None]:
# Statistics on intra-community edges, i.e. edges whose two ends are in the same community.
same_community = communities_distances_df.community_id == communities_distances_df.neighbor_community_id
intra_edges = communities_distances_df[same_community].groupby('community_id')

infos = intra_edges.agg(
    dmean=("distance", "mean"),
    dmedian=("distance", lambda d: np.quantile(d, 0.5)),
    quantile_025=("distance", lambda d: np.quantile(d, 0.25)),
    quantile_095=("distance", lambda d: np.quantile(d, 0.95)),
    quantile_005=("distance", lambda d: np.quantile(d, 0.05)),
)
infos['community_size'] = communities_df['community_size']
infos.sort_values('community_size', ascending=True, inplace=True)
infos['len_edges'] = intra_edges['id'].count()
# Intra-community edges divided by community_size * 10 (10 is the k used for the neighbours).
infos['density'] = infos['len_edges'] / (infos['community_size'] * 10)

infos.sort_values('dmean', ascending=False, inplace=True)
print(len(infos))
display(infos)

In [None]:
def look4one_community(idx):
    """Print a report for community `idx`.

    Reads the row labelled `idx` in both communities_df and infos, then prints the
    distance statistics, the top search ids and up to 25 product names.
    """
    row = communities_df.loc[idx]
    row_infos = infos.loc[idx]

    print("\n\t------INFOS------")
    headline = f"Mean: {row_infos.dmean} | Median: {row_infos.dmedian} | Density: {row_infos.density} | Community_size: {row.community_size}\n"
    quantiles = f"| Quantile 25%: {row_infos.quantile_025} | Quantile 95%: {row_infos.quantile_095} | Quantile 5%: {row_infos.quantile_005}"
    print(headline, quantiles)

    print("\t------TOP_SEARCH_IDs------")
    for s_id in row.top_search_id_list:
        print(f"{s_id}")

    print("\t------PRODUCTS_NAME_SAMPLE------")
    for product in row.product_name_list[:25]:
        print(product)

    # Blank line, 150-dash separator, newline.
    print("\n\n" + "-" * 150)

In [None]:
# Print the report of a single community (17295 is a label of the communities_df / infos index).
look4one_community(17295)

In [None]:
# For every search id, the (up to) 5 communities holding most of its products, sorted by
# decreasing product count. After reset_index() the community label is in 'level_1' and the
# count in 'community_id'.
#temp_df.set_index('search_id', inplace=True)
#temp_df.rename(columns={'community_id':'com_id'}, inplace=True)
temp_df = (
    full_df
    .groupby('search_id').community_id
    .apply(lambda x: x.value_counts().head(5))
    .reset_index()
    .sort_values('community_id', ascending=False)
)
display(temp_df)

In [None]:
# community_ids is computed here so that this cell does not depend on the following cell
# having been executed first (it failed with NameError on a fresh, top-to-bottom run).
community_ids = communities_df[communities_df.community_size < 40].index
restrained_df = temp_df[temp_df.level_1.isin(community_ids)]
restrained_df

In [None]:
# Report every community with fewer than 40 products.
print(infos.columns)
infos.sort_values('density', inplace=True, ascending=False)
is_small = communities_df.community_size < 40
community_ids = communities_df[is_small].index
print(f"Number of total community is: {len(community_ids)}")
for community_idx in community_ids:
    look4one_community(community_idx)
    