In [1]:
from neo4j import GraphDatabase

def get_neo4j_driver() -> GraphDatabase.driver:
    """Establishes and returns a Neo4j session for AuraDB."""
    neo4j_uri = 'neo4j+s://9d1381c2.databases.neo4j.io:7687'
    neo4j_user = 'Shirley'
    neo4j_password = 'Sxl19950312'
    return GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Call the function to get the driver
driver = get_neo4j_driver()

# Open a session
with driver.session() as session:
    # Verify connectivity within the session
    session.run("RETURN 1")

print("Connectivity verified successfully.")


from dash import Dash, dcc, html
import plotly.express as px
import pandas as pd


# Define a Cypher query to retrieve the required data
query = """
MATCH (n)
RETURN n
"""

# Execute the query and process the results
with driver.session() as session:
    result = session.run(query)
    neo4j_data = [record.data() for record in result]

# Converting Neo4j Data to Pandas DataFrame Format
df = pd.DataFrame(neo4j_data)

#print(df)

Connectivity verified successfully.


In [3]:
# DELET :GROUPS_TO RELATIONSHIP
with driver.session() as session:
    session.run("MATCH (:Product)-[r:GROUPS_TO]->(:Group) DELETE r")

# DELECT Group NODE
with driver.session() as session:
    session.run("MATCH (g:Group) DELETE g")


In [7]:
from neo4j import GraphDatabase
import pandas as pd
import networkx as nx
from community import community_louvain
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
import community as community_louvain



def get_data(tx):
    query = """
        MATCH (p:Product)-[:STOCKED_IN]->(c:Category),
              (p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch)
        RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories
    """
    result = tx.run(query)
    data = [(record["product"], record["ingredientResearch"], record["categories"]) for record in result if record["product"] is not None and record["ingredientResearch"] is not None and record["categories"] is not None]
    return data

with driver.session() as session:
    data = session.read_transaction(get_data)

# Constructing a bipartite graph
bipartite_graph = nx.Graph()
products = set()
ingredients = set()

for product, ingredientResearch, categories in data:
    products.add(product)
    ingredients.add(ingredientResearch)
    for category in categories:
        ingredients.add(category)
    bipartite_graph.add_node(product, bipartite=0)
    bipartite_graph.add_node(ingredientResearch, bipartite=1)
    for category in categories:
        bipartite_graph.add_node(category, bipartite=1)
        bipartite_graph.add_edge(product, category)
    bipartite_graph.add_edge(product, ingredientResearch)

# Generate a projection of the product
product_nodes = {node for node, data in bipartite_graph.nodes(data=True) if data['bipartite'] == 0}
product_graph = nx.bipartite.projected_graph(bipartite_graph, product_nodes, multigraph=True)

# Run Louvain's algorithm for community detection
communities = community_louvain.best_partition(product_graph)

# Store the results in a DataFrame
product_groups = pd.DataFrame({'Product': list(products), 'Group': [communities[node] for node in product_graph.nodes()]})

# Create a list using Plotly
table_data = go.Table(
    header=dict(values=['Product', 'Group']),
    cells=dict(values=[product_groups['Product'], product_groups['Group']])
)

# Layout for the list
layout = go.Layout(title='Product Groups')

# Create figure
fig = go.Figure(data=table_data, layout=layout)

# Show the figure
fig.show()

# Grouping products by cluster and linking product names
grouped_products = product_groups.groupby('Group')['Product'].apply(lambda x: ', '.join(x)).reset_index()

# Sort products according to the group's serial number
grouped_products.sort_values(by='Group', inplace=True)

# Create a new DataFrame with each row containing a group and the corresponding products
new_product_groups = pd.DataFrame({'Group': grouped_products['Group'], 'Products': grouped_products['Product']})

# Create the table
table_data = go.Table(
    header=dict(values=['Group', 'Products']),
    cells=dict(values=[new_product_groups['Group'], new_product_groups['Products']])
)


layout = go.Layout(title='Product Groups (Products Sorted by Group Number)')


fig = go.Figure(data=table_data, layout=layout)
fig.show()


read_transaction has been renamed to execute_read



In [23]:
import pandas as pd
import networkx as nx
from community import community_louvain
import plotly.graph_objs as go


from neo4j import GraphDatabase

with driver.session() as session:
    # Run the Cypher query and get the results
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Build a product network based on raw material research
    product_net = nx.Graph()

    for _, row in df.iterrows():
        product = row['product']
        ingredient = row['ingredientResearch']
        categories = row['categories']
        
        # Add node
        product_net.add_node(product, ingredient=ingredient, categories=categories)
        
        # Add edge
        for other_product in product_net.nodes():
            if other_product != product and ingredient in product_net.nodes[other_product]['ingredient']:
                product_net.add_edge(product, other_product, weight=1)

    # Community testing based on raw material research
    ingredient_partition = community_louvain.best_partition(product_net)

    # Visual communities
    ingredient_communities = sorted(set(ingredient_partition.values()))

    trace = go.Pie(labels=[f'Community {i}' for i in ingredient_communities],
                   values=[len([k for k, v in ingredient_partition.items() if v == c]) for c in ingredient_communities])

    fig = go.Figure(data=[trace])
    fig.update_layout(title='Product Communities Based on Ingredient Research')
    fig.show()



In [60]:
import pandas as pd
import networkx as nx
from community import community_louvain
import plotly.graph_objs as go


from neo4j import GraphDatabase



with driver.session() as session:
    # Run the Cypher query and get the results
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Build a product network based on raw material research
    product_net = nx.Graph()

    for _, row in df.iterrows():
        product = row['product']
        ingredient = row['ingredientResearch']
        categories = row['categories']
        
        # Add node
        product_net.add_node(product, ingredient=ingredient, categories=categories)
        
        # Add edge
        for other_product in product_net.nodes():
            if other_product != product and ingredient in product_net.nodes[other_product]['ingredient']:
                product_net.add_edge(product, other_product, weight=1)

    # Community testing based on raw material research
    ingredient_partition = community_louvain.best_partition(product_net)

    # Build community lists
    communities = {}
    for node, community in ingredient_partition.items():
        if community not in communities:
            communities[community] = []
        communities[community].append(node)

    # Create the table data
    table_data = []
    for community, products in sorted(communities.items()):
        row = [f"<b>Community {community}</b>"] + ["<br>".join(products)]
        table_data.append(row)

    # Create table
    table = go.Table(
        header=dict(values=["Community", "Products"]),
        cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
                   fill_color=[['#f2f2f2', 'white'] * len(table_data)],
                   align=['center', 'left'],
                   font=dict(size=10),
                   height=50)
    )

    # Create layout
    layout = go.Layout(title="Product Communities Based on Ingredient Research",
                       width=1000, height=600)

    # Create the figure
    fig = go.Figure(data=[table], layout=layout)
    fig.show()


In [51]:
import pandas as pd
import networkx as nx
from community import community_louvain
import plotly.graph_objs as go


from neo4j import GraphDatabase


with driver.session() as session:
    # Run the Cypher query and get the results
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Build product networks based on raw material research and product categories
    product_net = nx.Graph()

    for _, row in df.iterrows():
        product = row['product']
        ingredient = row['ingredientResearch']
        categories = row['categories']
        
        # Add node
        product_net.add_node(product, ingredient=ingredient, categories=categories)
        
        # Add edge
        for other_product in product_net.nodes():
            if other_product != product and ingredient in product_net.nodes[other_product]['ingredient'] and any(c in product_net.nodes[other_product]['categories'] for c in categories):
                product_net.add_edge(product, other_product, weight=1)

    # Community testing based on ingredient research and product categories
    ingredient_partition = community_louvain.best_partition(product_net)

    # Build community lists
    communities = {}
    for node, community in ingredient_partition.items():
        if community not in communities:
            communities[community] = []
        communities[community].append(node)

    # Create the table data
    table_data = []
    for community, products in sorted(communities.items()):
        row = [f"<b>Community {community}</b>"] + ["<br>".join(products)]
        table_data.append(row)

    # Create the table
    table = go.Table(
        header=dict(values=["Community", "Products"]),
        cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
                   fill_color=[['#f2f2f2', 'white'] * len(table_data)],
                   align=['center', 'left'],
                   font=dict(size=11),
                   height=80)
    )

    # Create the layout
    layout = go.Layout(title="Product Communities Based on Ingredient Research and Category",
                       width=1000, height=600)

    # Create the figure
    fig = go.Figure(data=[table], layout=layout)
    fig.show()



In [49]:

import pandas as pd
import networkx as nx
from networkx.algorithms.community import label_propagation_communities
import plotly.graph_objs as go




with driver.session() as session:
    
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

   # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Build product networks based on raw material research and product categories
    product_net = nx.Graph()

    for _, row in df.iterrows():
        product = row['product']
        ingredient = row['ingredientResearch']
        categories = row['categories']
        
        # Add node
        product_net.add_node(product, ingredient=ingredient, categories=categories)
        
        # Add edge
        for other_product in product_net.nodes():
            if other_product != product and ingredient in product_net.nodes[other_product]['ingredient'] and any(c in product_net.nodes[other_product]['categories'] for c in categories):
                product_net.add_edge(product, other_product, weight=1)

    # Community Detection Using the Label Propagation Algorithm
    communities = label_propagation_communities(product_net)

    # Convert communities to dictionary format
    community_dict = {}
    for i, community in enumerate(communities):
        community_dict[i] = list(community)

    # Create the table data
    table_data = []
    for community, products in sorted(community_dict.items()):
        row = [f"<b>Community {community}</b>"] + ["<br>".join(products)]
        table_data.append(row)

    # Create the table
    table = go.Table(
        header=dict(values=["Community", "Products"]),
        cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
                   fill_color=[['#f2f2f2', 'white'] * len(table_data)],
                   align=['center', 'left'],
                   font=dict(size=10),
                   height=50)
    )

    # Create the layout
    layout = go.Layout(title="Product Communities Based on Ingredient Research and Category (Label Propagation)",
                       width=1000, height=600)

    # Create the figure
    fig = go.Figure(data=[table], layout=layout)
    fig.show()



In [13]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.graph_objs as go


from neo4j import GraphDatabase


with driver.session() as session:
   
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Solo heat coding of raw material research and product categories
    mlb = MultiLabelBinarizer()
    ingredient_encoded = pd.DataFrame(mlb.fit_transform(df['ingredientResearch'].apply(lambda x: [x])),
                                      columns=mlb.classes_, index=df.index)
    category_encoded = pd.DataFrame(mlb.fit_transform(df['categories']),
                                    columns=mlb.classes_, index=df.index)

    # Splicing of coded raw material studies and product categories
    features = pd.concat([ingredient_encoded, category_encoded], axis=1)

    # Clustering using the K-means algorithm
    kmeans = KMeans(n_clusters=6, random_state=50)
    clusters = kmeans.fit_predict(features)

    # Add the clustering results to the DataFrame
    df['community'] = clusters

    # Create table data
    table_data = []
    for community in sorted(df['community'].unique()):
        products = df[df['community'] == community]['product'].unique().tolist()
        row = [f"<b>community {community}</b>"] + ["<br>".join(products)]
        table_data.append(row)

    
    table = go.Table(
        header=dict(values=["community", "Products"]),
        cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
                   fill_color=[['#f2f2f2', 'white'] * len(table_data)],
                   align=['center', 'left'],
                   font=dict(size=12),
                   height=30)
    )

    
    layout = go.Layout(title="Product communities Based on Ingredient Research and Category (K-means)",
                       width=1000, height=600)

   
    fig = go.Figure(data=[table], layout=layout)
    fig.show()







In [16]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.graph_objects as go

# ... (data loading and preprocessing code remains the same)

# Clustering using the K-means algorithm
kmeans = KMeans(n_clusters=6, random_state=50)
clusters = kmeans.fit_predict(features)

# Add the clustering results to the DataFrame
df['community'] = clusters

# Filter products in community 3
community_3_df = df[df['community'] == 3]

if len(community_3_df) > 0:  # Check if there are products in community 3
    # One-hot encoding of ingredient research and product categories for community 3
    community_3_ingredient_encoded = pd.DataFrame(mlb.transform(community_3_df['ingredientResearch'].apply(lambda x: [x])), 
                                                  columns=mlb.classes_, index=community_3_df.index)
    community_3_category_encoded = pd.DataFrame(mlb.transform(community_3_df['categories']), 
                                                columns=mlb.classes_, index=community_3_df.index)

    # Concatenate encoded ingredient research and product categories for community 3
    community_3_features = pd.concat([community_3_ingredient_encoded, community_3_category_encoded], axis=1)

    # Clustering using the K-means algorithm for community 3
    sub_kmeans = KMeans(n_clusters=3, random_state=50)  # Adjust the number of clusters as needed
    sub_clusters = sub_kmeans.fit_predict(community_3_features)

    # Update the community labels for the subdivided community 3
    df.loc[df['community'] == 3, 'community'] = sub_clusters + 6  # 6, 7, 8

# Create table data
table_data = []
for community in sorted(df['community'].unique()):
    products = df[df['community'] == community]['product'].unique().tolist()
    row = [f"<b>Community {community}</b>"] + ["<br>".join(products)]
    table_data.append(row)

# Create the table
table = go.Table(
    header=dict(values=["Community", "Products"]),
    cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
               fill_color=[['#f2f2f2', 'white'] * len(table_data)],
               align=['center', 'left'],
               font=dict(size=12),
               height=30)
)

# Create the layout
layout = go.Layout(title="Product Communities Based on Ingredient Research and Category (K-means)", 
                   width=1000, height=600)

# Create the figure
fig = go.Figure(data=[table], layout=layout)

# Show the figure
fig.show()




unknown class(es) ['', 'Acesulfame Potassium', 'Acidity Regulator 339', 'Anthocyanins', 'Antioxidant 385', 'Apple', 'Ascorbic Acid', 'Aspartame', 'Australian Spring Water', 'Bis-Diglyceryl Polyacyladipate-2', 'Caffeine', 'Calcium Lactate', 'Carbonated Mineral Water', 'Carbonated Water', 'Carboxymethyl Cellulose', 'Carmine', 'Carminic Acid', 'Carotene', 'Carrageenan', 'Citric Acid', 'Clarified Pineapple Juice Concentrate', 'Cochineal', 'Coconut Extract', 'Colouring Agent 150d', 'Cream', 'Diced Peaches', 'Diced Pears', 'Ferric Ferrocyanide', 'Flavour', 'Folate', 'Fruit Peach', 'Full Cream Milk', 'Guar Gum', 'Guava Puree', 'Homogenised Full Cream Milk', 'Iron Oxides', 'Jelly Natural Pineapple Flavour', 'Locust Bean Gum', 'Lutein', 'Malic Acid', 'Mica', 'Milk', 'Natural Flavour', 'Natural Flavour Botanical Extracts', 'Natural Flavour Citrus Oils', 'Natural Flavour Lemon Peel Extract', 'Natural Flavour Lime Oils', 'Natural Flavour Mango', 'Natural Flavour Orange Peel Extract', 'Natural F

In [7]:
from sklearn.cluster import AgglomerativeClustering
import pandas as pd


n_clusters = 9 
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)


clusters = hierarchical_clustering.fit_predict(features)


df['community'] = clusters


table_data = []
for community in sorted(df['community'].unique()):
    products = df[df['community'] == community]['product'].unique().tolist()
    row = [f"<b>community {community}</b>"] + ["<br>".join(products)]
    table_data.append(row)


table = go.Table(
    header=dict(values=["community", "Products"]),
    cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
               fill_color=[['#f2f2f2', 'white'] * len(table_data)],
               align=['center', 'left'],
               font=dict(size=12),
               height=30)
)


layout = go.Layout(title="Product communities Based on Ingredient Research and Category (Hierarchical Clustering)",
                   width=1000, height=600)


fig = go.Figure(data=[table], layout=layout)
fig.show()


In [58]:
import pandas as pd
import networkx as nx
from community import community_louvain
from networkx.algorithms.community import label_propagation_communities
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import silhouette_score


from neo4j import GraphDatabase



with driver.session() as session:
    # Run a Cypher query and get the results
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Build product networks based on raw material research and product categories
    product_net = nx.Graph()

    for _, row in df.iterrows():
        product = row['product']
        ingredient = row['ingredientResearch']
        categories = row['categories']
        
        # Add node
        product_net.add_node(product, ingredient=ingredient, categories=categories)
        
        # Add an edge
        for other_product in product_net.nodes():
            if other_product != product and ingredient in product_net.nodes[other_product]['ingredient'] and any(c in product_net.nodes[other_product]['categories'] for c in categories):
                product_net.add_edge(product, other_product, weight=1)

    # Louvain algorithm
    louvain_partition = community_louvain.best_partition(product_net)
    louvain_modularity = community_louvain.modularity(louvain_partition, product_net)
    print(f"Louvain Modularity: {louvain_modularity}")

    # Label Propagation algorithm
    lp_communities = label_propagation_communities(product_net)
    lp_partition = {node: i for i, community in enumerate(lp_communities) for node in community}
    lp_modularity = nx.algorithms.community.modularity(product_net, lp_communities)
    print(f"Label Propagation Modularity: {lp_modularity}")

    # K-means algorithm
    # Unique heat coding for raw material studies and product categories
    mlb = MultiLabelBinarizer()
    ingredient_encoded = pd.DataFrame(mlb.fit_transform(df['ingredientResearch'].apply(lambda x: [x])),
                                      columns=mlb.classes_, index=df.index)
    category_encoded = pd.DataFrame(mlb.fit_transform(df['categories']),
                                    columns=mlb.classes_, index=df.index)

    # Splicing of coded raw material studies and product categories
    features = pd.concat([ingredient_encoded, category_encoded], axis=1)

    # Clustering using the K-means algorithm
    kmeans = KMeans(n_clusters=6, random_state=42)
    clusters = kmeans.fit_predict(features)

    # Add the clustering results to the DataFrame
    df['cluster'] = clusters

    # Calculate the modularity of K-means
    kmeans_partition = {product: cluster for product, cluster in zip(df['product'], df['cluster'])}
    kmeans_modularity = nx.algorithms.community.modularity(product_net, [set(df[df['cluster'] == c]['product']) for c in df['cluster'].unique()])
    print(f"K-means Modularity: {kmeans_modularity}")



Louvain Modularity: 0.8628892733564012
Label Propagation Modularity: 0.8447231833910034






K-means Modularity: 0.79465830449827


In [62]:
#import pandas as pd
#from sklearn.cluster import KMeans
#from sklearn.preprocessing import MultiLabelBinarizer
#from neo4j import GraphDatabase

#with driver.session() as session:
    
#    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
#                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
#                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    
#    data = []
#    for record in result:
#        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
#    df = pd.DataFrame(data)

    
#   mlb = MultiLabelBinarizer()
#    ingredient_encoded = pd.DataFrame(mlb.fit_transform(df['ingredientResearch'].apply(lambda x: [x])),
#                                      columns=mlb.classes_, index=df.index)
#    category_encoded = pd.DataFrame(mlb.fit_transform(df['categories']),
#                                    columns=mlb.classes_, index=df.index)

    
#    features = pd.concat([ingredient_encoded, category_encoded], axis=1)

    
#    kmeans = KMeans(n_clusters=6, random_state=42)
#    clusters = kmeans.fit_predict(features)

#    # Add the clustering results to the DataFrame
#    df['cluster'] = clusters

#    # Create dictionaries of community names and descriptions
#    community_info = {
#        0: {'name': 'Community1', 'description': 'This community is related to drinks.'},
#        1: {'name': 'Community2', 'description': 'This community is related to snacks.'},
#        2: {'name': 'Community3', 'description': 'This community is related to dairy products.'},
#        3: {'name': 'Community4', 'description': 'This community is related to baked goods.'},
#        4: {'name': 'Community5', 'description': 'This community is related to condiments.'},
#        5: {'name': 'Community6', 'description': 'This community is related to frozen foods.'}
#    }

#    # Creating a Community node in Neo4j
#    for cluster_id, info in community_info.items():
#        query = "MERGE (c:Community {id: $cluster_id, name: $name, description: $description})"
#        session.run(query, cluster_id=cluster_id, name=info['name'], description=info['description'])

#    # Create a relationship between the Product node and the Community node.
#    for _, row in df.iterrows():
#        product = row['product']
#        cluster_id = row['cluster']
#        query = "MATCH (p:Product {name: $product}), (c:Community {id: $cluster_id}) MERGE (p)-[:COMMUNITY_TO]->(c)"
#        session.run(query, product=product, cluster_id=cluster_id)







In [65]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer

from neo4j import GraphDatabase

with driver.session() as session:

    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)


    mlb = MultiLabelBinarizer()
    ingredient_encoded = pd.DataFrame(mlb.fit_transform(df['ingredientResearch'].apply(lambda x: [x])),
                                      columns=mlb.classes_, index=df.index)
    category_encoded = pd.DataFrame(mlb.fit_transform(df['categories']),
                                    columns=mlb.classes_, index=df.index)

   
    features = pd.concat([ingredient_encoded, category_encoded], axis=1)

    
    kmeans = KMeans(n_clusters=7, random_state=42)
    clusters = kmeans.fit_predict(features)

    df['cluster'] = clusters

     # Create table data
    table_data = []
    for community in sorted(df['community'].unique()):
        products = df[df['community'] == community]['product'].unique().tolist()
        row = [f"<b>community {community}</b>"] + ["<br>".join(products)]
        table_data.append(row)

    
    table = go.Table(
        header=dict(values=["community", "Products"]),
        cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1]],
                   fill_color=[['#f2f2f2', 'white'] * len(table_data)],
                   align=['center', 'left'],
                   font=dict(size=12),
                   height=30)
    )

    
    layout = go.Layout(title="Product communities Based on Ingredient Research and Category (K-means)",
                       width=1000, height=600)

   
    fig = go.Figure(data=[table], layout=layout)
    fig.show()

    community_info = {
        0: {'name': 'Community1', 'description': 'This community is related to drinks.'},
        1: {'name': 'Community2', 'description': 'This community is related to snacks.'},
        2: {'name': 'Community3', 'description': 'This community is related to dairy products.'},
        3: {'name': 'Community4', 'description': 'This community is related to baked goods.'},
        4: {'name': 'Community5', 'description': 'This community is related to condiments.'},
        5: {'name': 'Community6', 'description': 'This community is related to frozen foods.'},
        6: {'name': 'Community7', 'description': 'This community is related to frozen food.'}
    }

    for cluster_id, info in community_info.items():
        query = "MERGE (c:Community {name: $name, description: $description})"
        session.run(query, name=info['name'], description=info['description'])

    for _, row in df.iterrows():
        product = row['product']
        cluster_name = community_info[row['cluster']]['name']
        query = "MATCH (p:Product {name: $product}), (c:Community {name: $cluster_name}) MERGE (p)-[:BELONGS_TO]->(c)"
        session.run(query, product=product, cluster_name=cluster_name)







In [22]:
#The final one

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer


from neo4j import GraphDatabase



with driver.session() as session:
    
    session.run("MATCH (c:Community) DETACH DELETE c")
    session.run("MATCH (p:Product)-[r:BELONGS_TO]->() DELETE r")
    session.run("MATCH (p:Product)-[r:COMMUNITY_TO]->() DELETE r")

    # Run the Cypher query and get the results
    result = session.run("MATCH (p:Product)-[:STOCKED_IN]->(c:Category), "
                         "(p)-[:HAS_RECIPE]->(r:Recipe)-[:USES_INGREDIENT]->(i:Ingredient)-[:HAS_RESEARCH]->(res:IngredientResearch) "
                         "RETURN p.name AS product, res.ingredient_name AS ingredientResearch, COLLECT(c.name) AS categories")

    # Convert results to Pandas DataFrame
    data = []
    for record in result:
        data.append({'product': record['product'], 'ingredientResearch': record['ingredientResearch'], 'categories': record['categories']})
    df = pd.DataFrame(data)

    # Solo heat coding of raw material research and product categories
    mlb = MultiLabelBinarizer()
    ingredient_encoded = pd.DataFrame(mlb.fit_transform(df['ingredientResearch'].apply(lambda x: [x])),
                                      columns=mlb.classes_, index=df.index)
    category_encoded = pd.DataFrame(mlb.fit_transform(df['categories']),
                                    columns=mlb.classes_, index=df.index)

    # Splicing of coded raw material studies and product categories
    features = pd.concat([ingredient_encoded, category_encoded], axis=1)

    # Use K-means algorithm for clustering, change the number of clusters to 8.
    kmeans = KMeans(n_clusters=8, random_state=42)
    clusters = kmeans.fit_predict(features)

    # Add the clustering results to the DataFrame
    df['cluster'] = clusters

    # Update the dictionary of community names and descriptions
    community_info = {
        0: {'name': 'Fruit juice drinks', 'description': 'This community is related to Fruit juice drinks.'},
        1: {'name': 'Cookies', 'description': 'This community is related to Cookies.'},
        2: {'name': 'Sauce', 'description': 'This community is related to Sauce.'},
        3: {'name': 'Bar', 'description': 'This community is related to Bar.'},
        4: {'name': 'Noodles', 'description': 'This community is related to Noodles.'},
        5: {'name': 'Chips', 'description': 'This community is related to Chips.'},
        6: {'name': 'Carbonated and Bottle drinks', 'description': 'This community is related to Can and Bottle.'},
        7: {'name': 'Others drinks', 'description': 'This community is related to Others drinks.'}
    }

    # Creating a Community node in Neo4j
    for cluster_id, info in community_info.items():
        query = "MERGE (c:Community {name: $name, description: $description})"
        session.run(query, name=info['name'], description=info['description'])

    # Create a relationship between the Product node and the Community node.
    for _, row in df.iterrows():
        product = row['product']
        cluster_name = community_info[row['cluster']]['name']
        query = "MATCH (p:Product {name: $product}), (c:Community {name: $cluster_name}) MERGE (p)-[:COMMUNITY_TO]->(c)"
        session.run(query, product=product, cluster_name=cluster_name)





