<a href="https://colab.research.google.com/github/SushmitalKhan/Dissertation/blob/main/sankey_prep_ipynb_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import openai


def run_prompt(prompt):
    """Send `prompt` as a single user message and return the reply text.

    The request goes to the `gpt-4o-mini` chat model through
    `openai.chat.completions.create`, asking for exactly one completion of at
    most 800 tokens at temperature 0.7, with no stop sequence.

    Args:
        prompt: Text placed in the `content` field of the only (user) message.

    Returns:
        The `message.content` of the first choice in the response.
    """
    request = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 800,
        "temperature": 0.7,
        "n": 1,        # a single completion is requested
        "stop": None,  # no explicit stop sequence
    }
    completion = openai.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
import pandas as pd
import numpy as np
import re
from google.colab import drive
drive.mount('/content/drive')
import plotly.graph_objects as go
import json

Mounted at /content/drive


In [None]:
# Google Drive paths of the inference JSON files (c0, c2, c3, c4) stored in the
# 01MAY2025 folder.
c0 = '/content/drive/MyDrive/Dissertation/Study 1/Inference_data/01MAY2025/chrome_map_ytWatch_misc_data_c0_i3.json'
c2 = '/content/drive/MyDrive/Dissertation/Study 1/Inference_data/01MAY2025/chrome_map_ytWatch_misc_data_c2_i3.json'
c3 = '/content/drive/MyDrive/Dissertation/Study 1/Inference_data/01MAY2025/chrome_map_ytWatch_misc_data_c3_i3.json'
c4 = '/content/drive/MyDrive/Dissertation/Study 1/Inference_data/01MAY2025/chrome_map_ytWatch_misc_data_c4_i3.json'

In [None]:
# Input files in the order they will be read: c0, c2, c3, c4 (no c1 path is defined above).
file_paths = [c0, c2, c3, c4]

In [None]:
# Flatten every inference from every input file into one list of dicts.
# Each output record carries the parent entry's "columns" value under the key
# "source", followed by all fields of the inference itself (an inference field
# named "source" would override the parent value).
combined_data = []

for path in file_paths:
    # Decode explicitly as UTF-8 so the result never depends on the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file is fully parsed at this point, so the handle is already closed
    # while the entries are flattened.
    for entry in data:
        source = entry.get("columns", "")
        for inf in entry.get("inferences", []):
            flattened_entry = {
                "source": source,
                **inf  # merge inference fields
            }
            combined_data.append(flattened_entry)

# Write to a new JSON file
# with open('/content/drive/MyDrive/Dissertation/Study 1/Inference_data/01MAY2025/combined_output.json', 'w') as f_out:
#     json.dump(combined_data, f_out, indent=2)

In [None]:
# Show the flattened records (this renders the entire list as the cell output).
combined_data

In [None]:
# Every source identifier that can appear in a record's "source" field.
# The order here fixes the order of the binary flag columns built later.
source_types = [
    "takeout1_YT_watch-history_Search Title",
    "takeout1_chrome_MyActivity_Search Title",
    "takeout1_maps_MyActivity_Search Title",
    "takeout1_misc_MyActivity_Search Title",
]

In [None]:
# Turn each flattened record into one row: a 0/1 flag per known source type
# followed by the inference text, the recommendation and the two scores.
rows = []
for item in combined_data:
    # The "source" field may name several sources joined by 'AND'
    active = {part.strip() for part in item['source'].split('AND')}

    record = {name: int(name in active) for name in source_types}
    record.update({
        'Inference': item['inference'],
        'Recommendation': item['recommendation'],
        'Sensitivity Score': item['sensitivity'],
        'Commonness Score': item['commonness'],
    })
    rows.append(record)

df = pd.DataFrame(rows)

# Fix the column order, then give the flag columns short display names
ordered_columns = source_types + ['Inference', 'Recommendation', 'Sensitivity Score', 'Commonness Score']
binary_df = df[ordered_columns].rename(columns={
    'takeout1_YT_watch-history_Search Title': 'YouTube',
    'takeout1_chrome_MyActivity_Search Title': 'Browsing',
    'takeout1_maps_MyActivity_Search Title': 'Location',
    'takeout1_misc_MyActivity_Search Title': 'Other'
})

# Remove every occurrence of the "Interested in " / "interested in " phrase
pattern = 'Interested in |interested in '
binary_df['Inference'] = binary_df['Inference'].str.replace(pattern, '', regex=True)
# df['Recommendation'] = df['Recommendation'].str.slice(0, 30)

In [None]:
# Show the flag/inference table built in the previous cell.
binary_df

In [None]:
def generate_interest_labels(df):
    """Add a 'Grouped Inference' column that merges similar inferences under one label.

    The unique, non-null values of `df['Inference']` are sent to the language
    model via `run_prompt`, which is asked to answer with a pipe-delimited
    `|interest|label|` table. The reply is parsed into a case-insensitive
    interest -> label mapping and applied to `df['Inference']`; values without
    a mapping (and non-string values) are kept unchanged.

    Args:
        df: DataFrame with an 'Inference' column. It is modified in place.

    Returns:
        The same `df` object, now carrying the 'Grouped Inference' column.
    """
    # Use the frame that was passed in, not a module-level one
    interest_list = df['Inference'].dropna().unique().tolist()

    # Construct the prompt
    prompt = f"""
      1. Relabel duplicates AND similar values in {interest_list} under the same label.
      2. If no similar values are found, return them as-is in both columns (i.e., same value for interest and label).
      3. Ensure that the assigned label are meaningful, concise, and representative of the grouped interests.
      4. Follow EXACT output format

      |interest|label|
      |interest 1|label 1|
      |interest 2|label 1|
      ...
      |interest n|label n|
      |interest n+1|label n|

      DO NOT include any extra text or explanation.
      Return in the EXACT format
    """

    # Send the prompt; treat a missing reply as an empty one
    assistant_reply = run_prompt(prompt) or ""

    # Parse the reply into a mapping, one table row per line
    mapping = {}
    for raw_line in assistant_reply.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        parts = line.split("|")
        # A valid row starts with '|' and contains exactly three pipes
        if not (line.startswith("|") and len(parts) == 4):
            print(f"Skipping malformed line: {line}")
            continue

        interest, label = parts[1].strip(), parts[2].strip()

        # Skip the header by content rather than by position, so a reply
        # without a header does not lose its first mapping
        if interest.lower() == "interest" and label.lower() == "label":
            continue
        # Skip a markdown-style separator row such as |---|---|
        if all(cell and set(cell) <= set("-:") for cell in (interest, label)):
            continue

        mapping[interest.lower()] = label

    # Apply the mapping; keys were stripped and lower-cased, so normalise the
    # lookup the same way and fall back to the original value
    df["Grouped Inference"] = df["Inference"].apply(
        lambda x: mapping.get(x.strip().lower(), x) if isinstance(x, str) else x
    )

    return df

In [None]:
# Adds the 'Grouped Inference' column to binary_df in place; the returned frame is shown as the cell output.
generate_interest_labels(binary_df)

In [None]:
# Make both score columns numeric; values that cannot be parsed become NaN
for score_col in ('Sensitivity Score', 'Commonness Score'):
    binary_df[score_col] = pd.to_numeric(binary_df[score_col], errors='coerce')

# Rows that are highly sensitive (> 7) or uncommon (< 4)
sensitive_rows = binary_df['Sensitivity Score'].gt(7)
uncommon_rows = binary_df['Commonness Score'].lt(4)

# For those rows, use the original inference text instead of the grouped label
for mask in (sensitive_rows, uncommon_rows):
    binary_df.loc[mask, 'Grouped Inference'] = binary_df.loc[mask, 'Inference']

In [None]:
# Show the table after the score-based overrides of 'Grouped Inference'.
binary_df

In [None]:
# binary_df.to_csv('/content/drive/MyDrive/Dissertation/Study 1/sankey/binary_df_july2.csv', index=False)

In [None]:
!pip install dash
import dash
from dash import dcc, html, Input, Output
import pandas as pd
import plotly.graph_objects as go
from google.colab import output # Import output from google.colab
from itertools import combinations # Needed for generating combinations in the dropdown

In [None]:
# Build the unfiltered Sankey link table from binary_df, which must already
# contain the 'Grouped Inference' and 'Recommendation' columns.

# Number of active sources per row
source_cols = ['YouTube', 'Browsing', 'Location', 'Other']
binary_df['source_count'] = binary_df[source_cols].sum(axis=1)

# Source -> inference links: every active source of a row gets the weight
# 1 / source_count (0 when the row has no active source)
source_inference_links = [
    {
        'Source': col,
        'Target': row['Grouped Inference'],
        'Weight': 1 / row['source_count'] if row['source_count'] > 0 else 0,
    }
    for _, row in binary_df.iterrows()
    for col in source_cols
    if row[col] == 1
]

# Inference -> recommendation links: one link of weight 1 per row
inference_rec_links = []
for _, row in binary_df.iterrows():
    inference_rec_links.append({
        'Source': row['Grouped Inference'],
        'Target': row['Recommendation'],
        'Weight': 1,
    })

# Both link types in one table, source links first
sankey_table = pd.DataFrame(source_inference_links + inference_rec_links)

# Number the node labels in order of first appearance
all_nodes = pd.unique(sankey_table[['Source', 'Target']].to_numpy().ravel())
node_map = dict(zip(all_nodes, range(len(all_nodes))))

# Attach the numeric node ids to every link
for id_col, label_col in (('source_id', 'Source'), ('target_id', 'Target')):
    sankey_table[id_col] = sankey_table[label_col].map(node_map)

# Nodes that start links but never receive one
true_sources = sorted(set(sankey_table['Source']).difference(sankey_table['Target']))


# Helper to build a filtered Sankey link dictionary with stricter filtering
def make_sankey_df_strictly_isolated(filtered_sources=None):
    """Build Sankey data for rows whose active sources are exactly `filtered_sources`.

    Reads the module-level `binary_df` and `source_cols`. A row is kept only
    when every selected source column equals 1 and every other column listed
    in `source_cols` equals 0. For the kept rows the function emits one
    source -> 'Grouped Inference' link per selected source, followed by one
    'Grouped Inference' -> 'Recommendation' link per row, all with weight 1.
    Node ids are numbered in order of first appearance within this filtered
    link set only.

    Args:
        filtered_sources: A source name or an iterable of source names.
            Names that are not in `source_cols` trigger a printed warning and
            are ignored. `None` or an empty value selects nothing.

    Returns:
        A dict with keys `source`, `target`, `value` (link endpoints as node
        ids and link weights) and `label` (list of node labels indexed by node
        id). All four are empty lists when nothing is selected or no row
        matches. Note that `label` holds node labels, not link labels.
    """
    empty_result = dict(source=[], target=[], value=[], label=[])

    if filtered_sources is None or not filtered_sources:
        return empty_result

    # Normalise the argument to a list without repeated names
    if isinstance(filtered_sources, str):
        requested = [filtered_sources]
    else:
        requested = list(dict.fromkeys(filtered_sources))

    # Only recognised columns can take part in the filter or produce links
    selected_sources = []
    for source in requested:
        if source in source_cols:
            selected_sources.append(source)
        else:
            print(f"Warning: '{source}' is not a recognized source column.")
    if not selected_sources:
        return empty_result

    # Build the mask on binary_df's own index so the comparisons below stay
    # aligned even when that index is not 0..n-1
    combined_condition = pd.Series(True, index=binary_df.index)
    for source in source_cols:
        required_flag = 1 if source in selected_sources else 0
        combined_condition = combined_condition & (binary_df[source] == required_flag)

    filtered_binary_df = binary_df[combined_condition]
    if filtered_binary_df.empty:
        return empty_result

    # Source -> inference links first, then inference -> recommendation links;
    # this order determines the node numbering below
    links = []
    for _, row in filtered_binary_df.iterrows():
        for col in selected_sources:
            links.append({
                'Source': col,
                'Target': row['Grouped Inference'],
                'Weight': 1,
            })
    for _, row in filtered_binary_df.iterrows():
        links.append({
            'Source': row['Grouped Inference'],
            'Target': row['Recommendation'],
            'Weight': 1,
        })
    strict_sankey_data = pd.DataFrame(links)

    # Map labels to node ids using only the nodes present in the filtered data
    filtered_nodes = pd.unique(strict_sankey_data[['Source', 'Target']].values.ravel())
    filtered_node_map = {label: i for i, label in enumerate(filtered_nodes)}

    strict_sankey_data['source_id'] = strict_sankey_data['Source'].map(filtered_node_map)
    strict_sankey_data['target_id'] = strict_sankey_data['Target'].map(filtered_node_map)

    return dict(
        source=strict_sankey_data['source_id'],
        target=strict_sankey_data['target_id'],
        value=strict_sankey_data['Weight'],
        label=filtered_nodes.tolist()
    )


# Build the figure. The initial trace and the "All (Unfiltered)" button both
# show the complete sankey_table; every other button shows the rows whose
# active sources are exactly the named combination.
all_links = dict(
    source=sankey_table['source_id'],
    target=sankey_table['target_id'],
    value=sankey_table['Weight']
)
all_labels = all_nodes.tolist()

fig = go.Figure()

initial_sankey_trace = go.Sankey(
    arrangement='snap',
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_labels
    ),
    link=all_links
)

fig.add_trace(initial_sankey_trace)


def _filter_button(sources):
    """Return the dropdown button for one exact combination of sources.

    The filtered data is computed once. Only the `source`, `target` and
    `value` arrays go into the trace's `link`; the helper's `label` list holds
    node labels and is therefore sent to `node.label` only.
    """
    payload = make_sankey_df_strictly_isolated(list(sources))
    link = {key: payload[key] for key in ('source', 'target', 'value')}
    return dict(
        label=" & ".join(sources),
        method="update",  # nodes and links both change for filtered views
        args=[{
            "link": link,
            "node": {"label": payload['label']}
        }]
    )


# Dropdown entries: the unfiltered view, then single sources, then every
# larger combination up to and including all sources together
buttons = [
    dict(label="All (Unfiltered)",
         method="restyle",
         args=[{
             "link": all_links,
             "node": {"label": all_labels}
         }]),
]
for r in range(1, len(true_sources) + 1):
    for combo in combinations(true_sources, r):
        buttons.append(_filter_button(combo))

updatemenus = [
    dict(
        buttons=buttons,
        direction="down",
        showactive=True,
        x=0.1,
        y=1.1
    )
]

fig.update_layout(
    title_text="Strictly Isolated Source-Filtered Sankey Diagram",
    font_size=12,
    updatemenus=updatemenus
)

fig.show()

In [None]:
import plotly.graph_objects as go
from itertools import combinations

# NOTE(review): this cell repeats the pipeline of the previous cell.
# Build the unfiltered Sankey link table from binary_df, which must already
# contain the 'Grouped Inference' and 'Recommendation' columns.

# Number of active sources per row
source_cols = ['YouTube', 'Browsing', 'Location', 'Other']
binary_df['source_count'] = binary_df[source_cols].sum(axis=1)

# Source -> inference links: every active source of a row gets the weight
# 1 / source_count (0 when the row has no active source)
source_inference_links = [
    {
        'Source': col,
        'Target': row['Grouped Inference'],
        'Weight': 1 / row['source_count'] if row['source_count'] > 0 else 0,
    }
    for _, row in binary_df.iterrows()
    for col in source_cols
    if row[col] == 1
]

# Inference -> recommendation links: one link of weight 1 per row
inference_rec_links = []
for _, row in binary_df.iterrows():
    inference_rec_links.append({
        'Source': row['Grouped Inference'],
        'Target': row['Recommendation'],
        'Weight': 1,
    })

# Both link types in one table, source links first
sankey_table = pd.DataFrame(source_inference_links + inference_rec_links)

# Number the node labels in order of first appearance
all_nodes = pd.unique(sankey_table[['Source', 'Target']].to_numpy().ravel())
node_map = dict(zip(all_nodes, range(len(all_nodes))))

# Attach the numeric node ids to every link
for id_col, label_col in (('source_id', 'Source'), ('target_id', 'Target')):
    sankey_table[id_col] = sankey_table[label_col].map(node_map)

# Nodes that start links but never receive one
true_sources = sorted(set(sankey_table['Source']).difference(sankey_table['Target']))


# Helper to build a filtered Sankey link dictionary with stricter filtering
def make_sankey_df_strictly_isolated(filtered_sources=None):
    """Build Sankey data for rows whose active sources are exactly `filtered_sources`.

    Reads the module-level `binary_df` and `source_cols`. A row is kept only
    when every selected source column equals 1 and every other column listed
    in `source_cols` equals 0. For the kept rows the function emits one
    source -> 'Grouped Inference' link per selected source, followed by one
    'Grouped Inference' -> 'Recommendation' link per row, all with weight 1.
    Node ids are numbered in order of first appearance within this filtered
    link set only.

    NOTE(review): a function with this name is also defined in the previous
    cell; this later definition replaces it when the notebook runs top to
    bottom.

    Args:
        filtered_sources: A source name or an iterable of source names.
            Names that are not in `source_cols` trigger a printed warning and
            are ignored. `None` or an empty value selects nothing.

    Returns:
        A dict with keys `source`, `target`, `value` (link endpoints as node
        ids and link weights) and `label` (list of node labels indexed by node
        id). All four are empty lists when nothing is selected or no row
        matches. Note that `label` holds node labels, not link labels.
    """
    empty_result = dict(source=[], target=[], value=[], label=[])

    if filtered_sources is None or not filtered_sources:
        return empty_result

    # Normalise the argument to a list without repeated names
    if isinstance(filtered_sources, str):
        requested = [filtered_sources]
    else:
        requested = list(dict.fromkeys(filtered_sources))

    # Only recognised columns can take part in the filter or produce links
    selected_sources = []
    for source in requested:
        if source in source_cols:
            selected_sources.append(source)
        else:
            print(f"Warning: '{source}' is not a recognized source column.")
    if not selected_sources:
        return empty_result

    # Build the mask on binary_df's own index so the comparisons below stay
    # aligned even when that index is not 0..n-1
    combined_condition = pd.Series(True, index=binary_df.index)
    for source in source_cols:
        required_flag = 1 if source in selected_sources else 0
        combined_condition = combined_condition & (binary_df[source] == required_flag)

    filtered_binary_df = binary_df[combined_condition]
    if filtered_binary_df.empty:
        return empty_result

    # Source -> inference links first, then inference -> recommendation links;
    # this order determines the node numbering below
    links = []
    for _, row in filtered_binary_df.iterrows():
        for col in selected_sources:
            links.append({
                'Source': col,
                'Target': row['Grouped Inference'],
                'Weight': 1,
            })
    for _, row in filtered_binary_df.iterrows():
        links.append({
            'Source': row['Grouped Inference'],
            'Target': row['Recommendation'],
            'Weight': 1,
        })
    strict_sankey_data = pd.DataFrame(links)

    # Map labels to node ids using only the nodes present in the filtered data
    filtered_nodes = pd.unique(strict_sankey_data[['Source', 'Target']].values.ravel())
    filtered_node_map = {label: i for i, label in enumerate(filtered_nodes)}

    strict_sankey_data['source_id'] = strict_sankey_data['Source'].map(filtered_node_map)
    strict_sankey_data['target_id'] = strict_sankey_data['Target'].map(filtered_node_map)

    return dict(
        source=strict_sankey_data['source_id'],
        target=strict_sankey_data['target_id'],
        value=strict_sankey_data['Weight'],
        label=filtered_nodes.tolist()
    )


# Build the figure. The initial trace and the "All (Unfiltered)" button both
# show the complete sankey_table; every other button shows the rows whose
# active sources are exactly the named combination.
all_links = dict(
    source=sankey_table['source_id'],
    target=sankey_table['target_id'],
    value=sankey_table['Weight']
)
all_labels = all_nodes.tolist()

fig = go.Figure()

initial_sankey_trace = go.Sankey(
    arrangement='snap',
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_labels
    ),
    link=all_links
)

fig.add_trace(initial_sankey_trace)


def _filter_button(sources):
    """Return the dropdown button for one exact combination of sources.

    The filtered data is computed once. Only the `source`, `target` and
    `value` arrays go into the trace's `link`; the helper's `label` list holds
    node labels and is therefore sent to `node.label` only.
    """
    payload = make_sankey_df_strictly_isolated(list(sources))
    link = {key: payload[key] for key in ('source', 'target', 'value')}
    return dict(
        label=" & ".join(sources),
        method="update",  # nodes and links both change for filtered views
        args=[{
            "link": link,
            "node": {"label": payload['label']}
        }]
    )


# Dropdown entries: the unfiltered view, then single sources, then every
# larger combination up to and including all sources together
buttons = [
    dict(label="All (Unfiltered)",
         method="restyle",
         args=[{
             "link": all_links,
             "node": {"label": all_labels}
         }]),
]
for r in range(1, len(true_sources) + 1):
    for combo in combinations(true_sources, r):
        buttons.append(_filter_button(combo))

updatemenus = [
    dict(
        buttons=buttons,
        direction="down",
        showactive=True,
        x=0.1,
        y=1.1
    )
]

fig.update_layout(
    title_text="Strictly Isolated Source-Filtered Sankey Diagram",
    font_size=12,
    updatemenus=updatemenus
)

fig.show()

Assuming that I will get many more recommendations after the combinations: for each interest, and for each set of three recommendations, ask for multiple final product recommendations, requesting specific products rather than generic ones. Then take the entire list of products and cluster them. The Sankey plot then becomes: dataset → inference → product → product cluster.

Sankey → connections between data and inference, and between inference and recommendation.

*Prompt: based on these three inferences, recommend one thing.*

