<a href="https://colab.research.google.com/github/SushmitalKhan/Dissertation/blob/main/infer_w_Cat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import openai

# It's recommended to use environment variables or Colab secrets for API keys
# For demonstration, keeping it here, but be cautious with sharing notebooks with keys

def run_prompt(prompt):
    """Send ``prompt`` as a single user message to ``gpt-4o-mini`` and return the reply text.

    The request asks for one completion (``n=1``) with ``temperature=0.7``,
    ``max_tokens=2000`` and no stop sequence; the content of the first choice
    is returned.
    """
    request = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 2000,  # raised to leave room for longer replies
        "temperature": 0.7,
        "n": 1,
        "stop": None,
    }
    completion = openai.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
import pandas as pd
import numpy as np
import re
from google.colab import drive
drive.mount('/content/drive')
import plotly.graph_objects as go
import json
import os

Mounted at /content/drive


In [None]:
# Path of the inference JSON that the loading cell opens with open(FILE_PATH).
# NOTE(review): absolute path under /content/drive — presumably only valid after
# drive.mount('/content/drive') has run in this Colab session; confirm before reuse.
FILE_PATH = "/content/drive/MyDrive/Dissertation/Study 1/Inference_data/columns_[('Image_Search', 'Browser_history', 'Location_history', 'YT_search_history')]_inferenceNo_[3]_combinations_[1, 2, 3, 4]_20251021_005136.json"

In [None]:
import json
import re
# Assuming run_prompt function is defined and available from a previous cell

# --- Load the inference records (FILE_PATH is set in an earlier cell) ---
with open(FILE_PATH, "r") as f:
    data = json.load(f)

# --- Collect the text of every inference that carries an "inference" field ---
all_inferences = [
    item["inference"]
    for record in data
    for item in record.get("gpt_output", {}).get("inferences", [])
    if "inference" in item
]

# --- Prepare prompt for thematic labeling ---
# The prompt asks the model to return a JSON object that maps each inference
# text to a short theme label. Doubled braces ({{ }}) are literal braces inside
# the f-string.
# NOTE(review): all_inferences is interpolated twice (pretty-printed near the top
# and compact at the end), which roughly doubles the prompt size — confirm that
# the repetition is intentional.

prompt = f"""
You are an assistant that groups user inferences into short, descriptive thematic labels.
- Here are all the inferences:

{json.dumps(all_inferences, indent=2)}

Task:
1. Thematically analyze inferences and label the themes. Themes should be conceptually similar.
2. Labels should be short, descriptive, and reflect the main topic of the inference.
3. Return a valid JSON mapping in the following format:
   {{
     "inference text 1": "Label 1",
     "inference text 2": "Label 2",
     ...
   }}
4. Include all inferences. Do not include any text outside the JSON.

{json.dumps(all_inferences)}

"""
# --- 5. Parse JSON safely ---
def parse_llm_json(output):
    """Extract the first JSON object embedded in an LLM reply.

    The reply may carry text before and after the object (code fences, a
    closing remark, ...). Decoding starts at the first ``{`` and stops at the
    end of that object, so trailing text is ignored even when it contains
    braces of its own.

    Args:
        output: Raw reply text returned by the model.

    Returns:
        The decoded JSON object (a ``dict``).

    Raises:
        ValueError: If ``output`` is not a string, contains no ``{``, or the
            text starting at the first ``{`` is not a valid JSON object.
    """
    if not isinstance(output, str):
        raise ValueError(f"No JSON found in LLM output:\n{output}")

    start = output.find("{")
    if start == -1:
        raise ValueError(f"No JSON found in LLM output:\n{output}")

    try:
        # raw_decode parses one complete value and reports where it ended,
        # which avoids the greedy "first { to last }" span of a regex.
        parsed, _end = json.JSONDecoder().raw_decode(output, start)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON:\n{output[start:]}") from e
    return parsed

assistant_reply = run_prompt(prompt)
print("LLM Assistant Reply (for debugging):")
print(assistant_reply)

inference_label_map = parse_llm_json(assistant_reply)

# --- 6. Add label inside gpt_output ---
# Use the same tolerant lookups as the extraction loop: records without
# "gpt_output" are skipped, and an inference whose text is missing or is not a
# key of the returned map is labeled "Other" instead of raising KeyError.
for entry in data:
    for inf in entry.get("gpt_output", {}).get("inferences", []):
        inf["label"] = inference_label_map.get(inf.get("inference"), "Other")

# --- 7. Save updated JSON ---
OUTPUT_PATH = "labeled_inferences.json"
with open(OUTPUT_PATH, 'w') as f:
    json.dump(data, f, indent=2)

print(f"Labeled JSON saved to {OUTPUT_PATH}")

In [None]:
import json

# Re-read the labeled records from disk.
LABELED_FILE_PATH = "labeled_inferences.json"
with open(LABELED_FILE_PATH, "r") as f:
    labeled_data = json.load(f)

# Distinct labels across every inference that carries a "label" field.
all_labels = {
    item["label"]
    for record in labeled_data
    for item in record.get("gpt_output", {}).get("inferences", [])
    if "label" in item
}

# List them alphabetically, one per line.
print("Unique Labels:")
for label in sorted(all_labels):
    print(f"- {label}")

In [None]:
# --- 1. DATA FLATTENING ---
# Read the labeled file through its own constant. Reassigning FILE_PATH here
# would make a re-run of the loading cell read the labeled output instead of
# the raw inference file.
LABELED_FILE_PATH = "labeled_inferences.json"

# Load JSON
with open(LABELED_FILE_PATH, 'r') as f:
    json_data = json.load(f)

flattened_data = []   # one dict per inference, with its record-level metadata
inferences_list = []  # non-empty inference texts only

for entry in json_data:
    combined_cols = entry.get('combined_cols', [])
    combo_size = entry.get('combo_size', 0)
    inferences = entry.get('gpt_output', {}).get('inferences', [])

    for inference_item in inferences:
        inference_text = inference_item.get('inference')
        inference_label = inference_item.get('label')
        # The product is looked up under either key; empty string when neither is set.
        product = inference_item.get('recommended_product') or inference_item.get('product_recommendation') or ""

        # Store full metadata
        flattened_data.append({
            'Inference': inference_text,
            'Interested Category': inference_label,
            'uncommonness': inference_item.get('uncommonness'),
            'sensitivity': inference_item.get('sensitivity'),
            'Recommended_Product': product,
            'Combined_Columns': ', '.join(combined_cols),
            'Source_Count': combo_size
        })

        if inference_text:
            inferences_list.append(inference_text)

# Create the DataFrame once after the loop
df_labeled = pd.DataFrame(flattened_data)
# df_inferences = pd.DataFrame(inferences_list, columns=['Inference'])

# print(df_full.head())
# print(df_inferences.head())


In [None]:
import json
import re

def extract_inference_details(inference_text):
    """
    Use GPT to split an inference into category, activity, and reason.

    The reply is searched for its first JSON object. Only the keys
    ``category``, ``activity`` and ``reason`` are taken from it, so a caller
    that merges the result into an existing record cannot have other fields
    overwritten by extra keys in the reply. Any key the reply does not
    provide keeps its default.

    Args:
        inference_text: The inference sentence to analyze.

    Returns:
        dict with exactly the keys ``category``, ``activity`` and ``reason``.
        Defaults: category ``"Other"``, activity = the text before the first
        comma with "The user" replaced by "User", reason = the input itself.
    """

    prompt = f"""
    You are given an inference about a user's behavior or interest.
    Extract and return a JSON object with the following keys:
    - category: a short theme (e.g., 'Health & Fitness', 'Travel', 'Cooking')
    - activity: what the user is doing or interested in
    - reason: the evidence or rationale given in the sentence

    Example:
    Input: "The user is interested in health and fitness, as indicated by their search for exercises like 'leg raises' and 'calf raises', as well as their queries about tendonitis and Pilates."
    Output: {{
      "category": "Health & Fitness",
      "activity": "Exercise Routine like Pilates",
      "reason": "User searched for exercises such as leg raises and calf raises, and looked up information on tendonitis and Pilates."
    }}

    Now extract for this input:
    "{inference_text}"
    Return ONLY valid JSON.
    """

    # Default structure, used wholesale when parsing fails and per key when the
    # reply omits a field.
    text = inference_text if isinstance(inference_text, str) else ""
    details = {
        "category": "Other",
        "activity": text.split(",")[0].replace("The user", "User").strip(),
        "reason": inference_text
    }

    reply = run_prompt(prompt)  # Your existing GPT call
    start = reply.find("{") if isinstance(reply, str) else -1
    if start != -1:
        try:
            # Decode just the first object; text after it is ignored.
            parsed, _end = json.JSONDecoder().raw_decode(reply, start)
        except json.JSONDecodeError:
            parsed = None  # deliberate best-effort: keep the defaults
        if isinstance(parsed, dict):
            for key in ("category", "activity", "reason"):
                if parsed.get(key) is not None:
                    details[key] = parsed[key]

    return details

def add_inference_details(data):
    """
    Add category, activity, and reason for each inference in the dataset.

    Each inference dict that has a non-empty "inference" text is updated in
    place with the keys returned by ``extract_inference_details``. Records
    without "gpt_output" and inferences without text are left untouched
    instead of raising ``KeyError``.

    Args:
        data: Iterable of record dicts; inferences are read from
            ``record["gpt_output"]["inferences"]`` when present.

    Returns:
        The same ``data`` object that was passed in (mutated in place).
    """
    for entry in data:
        for inf in entry.get("gpt_output", {}).get("inferences", []):
            text = inf.get("inference")
            if not text:
                # Nothing to analyze; avoid a model call on an empty prompt.
                continue
            inf.update(extract_inference_details(text))  # Add new keys to each inference

    return data


# ---- Run on your input file ----

# Single source of truth for the destination; the final message reads the same
# name, which was previously undefined and raised NameError on a fresh kernel.
output_path = "output_with_details.json"

# add_inference_details updates the inference dicts of `data` in place and
# returns that same object.
updated_data = add_inference_details(data)

with open(output_path, "w") as f:
    json.dump(updated_data, f, indent=2)

print(f"✅ Done! Added category, activity, and reason to all inferences. Saved to {output_path}")


✅ Done! Added category, activity, and reason to all inferences. Saved to output_with_details.json


In [None]:
# Define all known source types.
# The first four names also appear in the FILE_PATH file name; 'Misc_history' does not.
# NOTE(review): the next cell assigns this same list again — presumably one of the two is redundant.
source_types = ['Image_Search', 'Browser_history', 'Location_history', 'YT_search_history', 'Misc_history']

In [None]:
# Prepare the transformed data: one row per inference with a 0/1 flag per source type.
rows = []
source_types = ['Image_Search', 'Browser_history', 'Location_history', 'YT_search_history', 'Misc_history']

# Iterate the flattened notebook-level frame (df_labeled). The name df_full
# only exists inside run_dynamic_profiling, so using it here raised NameError
# on a fresh kernel.
for _, item in df_labeled.iterrows():
    # Parse multiple sources from the "Combined_Columns" field
    sources = [s.strip() for s in item['Combined_Columns'].split(',')]

    # Create binary flags for each source
    row = {src: 1 if src in sources else 0 for src in source_types}

    # Add inference, recommendation and scores
    row['Inference'] = item['Inference']
    row['Recommendation'] = item.get('Recommended_Product', '')
    row['Sensitivity Score'] = item.get('sensitivity', '')
    # NOTE(review): this column is filled from 'uncommonness' but is named
    # 'Commonness Score' — confirm which direction the scale is meant to read.
    row['Commonness Score'] = item.get('uncommonness', '')

    rows.append(row)

# Convert to DataFrame with a fixed column order; passing `columns` also keeps
# the expected columns when there are no rows at all.
ordered_columns = source_types + ['Inference', 'Recommendation', 'Sensitivity Score', 'Commonness Score']
df = pd.DataFrame(rows, columns=ordered_columns)
binary_df = df[ordered_columns]

# Rename columns
binary_df = binary_df.rename(columns={
    'Image_Search': 'Search',
    'Browser_history': 'Browsing',
    'Location_history': 'Location',
    'YT_search_history': 'YouTube',
    'Misc_history': 'Misc'
})

# Remove leading phrases like 'Interested in '
# binary_df['Inference'] = binary_df['Inference'].str.replace(pattern, '', regex=True)


In [None]:
# Display the table of 0/1 source flags with the renamed source columns.
binary_df

Unnamed: 0,Search,Browsing,Location,YouTube,Misc,Inference,Recommendation,Sensitivity Score,Commonness Score
0,1,0,0,0,0,"The user is interested in health and fitness, ...",Resistance Bands for Home Workouts,3,4
1,1,0,0,0,0,The user appears to have an interest in garden...,Gardening Tool Set,2,5
2,1,0,0,0,0,The user has a creative inclination towards ho...,DIY Home Decor Craft Kit,4,6
3,0,1,0,0,0,The user is involved in academic research or e...,Research Methods in HCI (Book),6,4
4,0,1,0,0,0,The user has a strong interest in data privacy...,Privacy and Data Protection in Education (Onli...,7,5
5,0,1,0,0,0,The user is planning a trip or seeking accommo...,Travel Guide to Myrtle Beach (Book),2,3
6,0,0,1,0,0,"The user enjoys outdoor activities and nature,...",National Parks Annual Pass,3,4
7,0,0,1,0,0,The user has a strong interest in travel and e...,Travel Guidebook for the Southeastern United S...,2,5
8,0,0,1,0,0,The user appears to have connections or intere...,Clemson University Merchandise,4,6
9,0,0,0,1,0,The user is interested in cooking and trying o...,Cooking Class Subscription,3,4


In [None]:
def _parse_label_mapping(reply):
    """Build a ``{lower-cased inference: label}`` dict from the model's reply text."""
    mapping = {}
    for raw_line in (reply or "").strip().split("\n"):
        line = raw_line.strip()

        # Table row. The prompt asks for '|inference|label|', which has outer
        # pipes, so split on the inner ones after stripping the outer ones
        # rather than requiring an exact pipe count.
        if line.startswith("|"):
            cells = [cell.strip() for cell in line.strip("|").split("|")]
            if len(cells) == 2:
                key, label = cells
                is_rule = all(cell and set(cell) <= set("-: ") for cell in cells)  # e.g. |---|---|
                is_header = key.lower() == "inference" and label.lower() == "label"
                if not (is_rule or is_header):
                    mapping[key.lower()] = label
                continue

        # Fallback separators for replies that are not formatted as a table.
        if ":" in line:
            parts = line.split(":", 1)
            mapping[parts[0].strip().lower()] = parts[1].strip()
        elif "->" in line:
            parts = line.split("->", 1)
            mapping[parts[0].strip().lower()] = parts[1].strip()
    return mapping


def generate_interest_labels(df):
    """
    Generate grouped labels for inferences using an LLM.
    Works on a copy of the DataFrame to avoid modifying the original.

    Args:
        df: DataFrame with an 'Inference' column.

    Returns:
        A copy of ``df`` with an added 'Label' column. An inference that the
        reply does not cover keeps its own text as the label; non-string
        values are passed through unchanged.
    """

    df_copy = df.copy()
    interest_list = df_copy['Inference'].dropna().unique().tolist()

    # Construct the prompt
    prompt = f"""
1. Analyze {interest_list} and label conceptually similar inferences.
2. Label should summarize the shared theme of similar items. Labels should be short, descriptive, and representative of the inference.
3. If no similar values exist, use the same value for both columns (i.e., identical interest and label).
4. Return Labels as a new column with header 'label' in the format '|inference|label|'.
"""

    # Send the prompt
    assistant_reply = run_prompt(prompt)

    # Parse LLM output into mapping
    mapping = _parse_label_mapping(assistant_reply)

    # Map inferences to labels on the copy (keys are stripped and lower-cased)
    df_copy["Label"] = df_copy["Inference"].apply(
        lambda x: mapping.get(x.strip().lower(), x) if isinstance(x, str) else x
    )

    return df_copy


In [None]:
# Label the inferences and display the result. The returned copy is not
# assigned, so binary_df itself is left without a 'Label' column.
generate_interest_labels(binary_df)
# binary_df[['Inference', 'Grouped Inference']].head()

Unnamed: 0,Search,Browsing,Location,YouTube,Misc,Inference,Recommendation,Sensitivity Score,Commonness Score,Grouped Inference,Label
0,1,0,0,0,0,"The user is interested in health and fitness, ...",Resistance Bands for Home Workouts,3,4,"The user is interested in health and fitness, ...","The user is interested in health and fitness, ..."
1,1,0,0,0,0,The user appears to have an interest in garden...,Gardening Tool Set,2,5,The user appears to have an interest in garden...,The user appears to have an interest in garden...
2,1,0,0,0,0,The user has a creative inclination towards ho...,DIY Home Decor Craft Kit,4,6,The user has a creative inclination towards ho...,The user has a creative inclination towards ho...
3,0,1,0,0,0,The user is involved in academic research or e...,Research Methods in HCI (Book),6,4,The user is involved in academic research or e...,The user is involved in academic research or e...
4,0,1,0,0,0,The user has a strong interest in data privacy...,Privacy and Data Protection in Education (Onli...,7,5,The user has a strong interest in data privacy...,The user has a strong interest in data privacy...
5,0,1,0,0,0,The user is planning a trip or seeking accommo...,Travel Guide to Myrtle Beach (Book),2,3,The user is planning a trip or seeking accommo...,The user is planning a trip or seeking accommo...
6,0,0,1,0,0,"The user enjoys outdoor activities and nature,...",National Parks Annual Pass,3,4,"The user enjoys outdoor activities and nature,...","The user enjoys outdoor activities and nature,..."
7,0,0,1,0,0,The user has a strong interest in travel and e...,Travel Guidebook for the Southeastern United S...,2,5,The user has a strong interest in travel and e...,The user has a strong interest in travel and e...
8,0,0,1,0,0,The user appears to have connections or intere...,Clemson University Merchandise,4,6,The user appears to have connections or intere...,The user appears to have connections or intere...
9,0,0,0,1,0,The user is interested in cooking and trying o...,Cooking Class Subscription,3,4,The user is interested in cooking and trying o...,The user is interested in cooking and trying o...


In [None]:
# Display binary_df again; the labeling call above did not assign its result back to this name.
binary_df

Unnamed: 0,Inference,Grouped Inference
0,"The user is interested in health and fitness, ...","The user is interested in health and fitness, ..."
1,The user appears to have an interest in garden...,The user appears to have an interest in garden...
2,The user has a creative inclination towards ho...,The user has a creative inclination towards ho...
3,The user is involved in academic research or e...,The user is involved in academic research or e...
4,The user has a strong interest in data privacy...,The user has a strong interest in data privacy...
5,The user is planning a trip or seeking accommo...,The user is planning a trip or seeking accommo...
6,"The user enjoys outdoor activities and nature,...","The user enjoys outdoor activities and nature,..."
7,The user has a strong interest in travel and e...,The user has a strong interest in travel and e...
8,The user appears to have connections or intere...,The user appears to have connections or intere...
9,The user is interested in cooking and trying o...,The user is interested in cooking and trying o...


In [None]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def run_dynamic_profiling(json_file_path: str, n_clusters: int = 5) -> pd.DataFrame:
    """
    Flatten an inference JSON file, cluster the inference texts into topics with
    TF-IDF + K-Means, and categorize every inference by sensitivity,
    uncommonness, ethical risk, and data-source type.

    NOTE(review): a later cell defines ``run_dynamic_profiling`` again; when both
    cells are run, calls resolve to that later definition.

    Args:
        json_file_path (str): Path to the input JSON file (a list of records
            with 'combined_cols', 'combo_size' and 'gpt_output' -> 'inferences').
        n_clusters (int): Requested number of topics (K). It is capped at the
            number of inferences that have text, because K-Means cannot fit
            more clusters than samples.

    Returns:
        pd.DataFrame: One row per inference with the columns 'Inference',
        'Topic Category', 'Sensitivity Level', 'Uncommonness Level',
        'Ethical Risk', 'Data Source Type', 'Recommended Products' and
        'Combined Columns'. An input without inferences yields an empty frame
        with those columns.
    """

    # --- 1. DATA FLATTENING ---

    with open(json_file_path, 'r') as f:
        json_data = json.load(f)

    flattened_data = []
    for entry in json_data:
        combined_cols = entry.get('combined_cols', [])
        combo_size = entry.get('combo_size', 0)

        for inference_item in entry.get('gpt_output', {}).get('inferences', []):
            # Store full metadata for later use
            flattened_data.append({
                'Inference': inference_item.get('inference'),
                'uncommonness': inference_item.get('uncommonness'),
                'sensitivity': inference_item.get('sensitivity'),
                'Recommended_Product': inference_item.get('recommended_product'),
                'Combined_Columns': ', '.join(combined_cols),
                'Source_Count': combo_size
            })

    source_columns = [
        'Inference', 'uncommonness', 'sensitivity',
        'Recommended_Product', 'Combined_Columns', 'Source_Count'
    ]
    final_columns = [
        'Inference',
        'Topic Category',
        'Sensitivity Level',
        'Uncommonness Level',
        'Ethical Risk',
        'Data Source Type',
        'Recommended_Product',
        'Combined_Columns'
    ]
    presentation_names = {
        'Recommended_Product': 'Recommended Products',
        'Combined_Columns': 'Combined Columns'
    }

    df = pd.DataFrame(flattened_data, columns=source_columns)
    if df.empty:
        # Nothing to cluster or categorize; keep the output schema stable.
        return pd.DataFrame(columns=final_columns).rename(columns=presentation_names)

    # --- 2. TOPIC EXTRACTION (CLUSTERING) ---

    def preprocess(text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text

    fallback_label = "Topic: Undefined / Low Variety"
    cluster_labels = {}
    # Cluster ids are stored by row position rather than merged back on the
    # inference text: a text-based merge multiplies rows when the same sentence
    # occurs more than once. -1 marks rows that were not clustered.
    clusters = pd.Series(-1, index=df.index)

    has_text = df['Inference'].map(lambda text: isinstance(text, str) and text != "").astype(bool)
    text_index = df.index[has_text]

    if len(text_index) > 0:
        processed_data = df.loc[text_index, 'Inference'].apply(preprocess)

        custom_stop_words = list(ENGLISH_STOP_WORDS) + [
            'user', 'likely', 'appears', 'interested', 'indicated',
            'strong', 'shows', 'evidence', 'specific', 'types'
        ]
        tfidf_vectorizer = TfidfVectorizer(
            stop_words=custom_stop_words,
            max_features=50
        )
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform(processed_data)
        except ValueError:
            # fit_transform raises ValueError when every token was filtered out
            # (empty vocabulary), so the check has to happen here.
            tfidf_matrix = None
            print("WARNING: Vectorizer resulted in an empty vocabulary. Clustering skipped.")

        if tfidf_matrix is not None:
            n_clusters_actual = min(n_clusters, len(text_index))
            kmeans = KMeans(n_clusters=n_clusters_actual, random_state=42, n_init=10)
            kmeans.fit(tfidf_matrix)
            clusters.loc[text_index] = kmeans.labels_

            # Label each cluster with the three highest-weighted centroid terms.
            feature_names = tfidf_vectorizer.get_feature_names_out()
            order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
            for i in range(n_clusters_actual):
                top_words = [feature_names[ind] for ind in order_centroids[i, :3]]
                cluster_labels[i] = "Topic: " + " / ".join(top_words).title()

    df['NLP_Cluster'] = clusters
    df['Topic Category'] = clusters.map(lambda c: cluster_labels.get(c, fallback_label))

    # --- 3. STRUCTURED CATEGORIZATION ---
    # NOTE(review): a missing score stored as NaN compares False against both
    # thresholds and therefore lands in the lowest bucket — confirm that is wanted.

    # 3a. Sensitivity Level Mapping (Thresholds: Low [1-3], Medium [4-6], High [7+])
    def categorize_sensitivity(score):
        if score >= 7:
            return 'High'
        elif score >= 4:
            return 'Medium'
        else:
            return 'Low'

    df['Sensitivity Level'] = df['sensitivity'].apply(categorize_sensitivity)

    # 3b. Uncommonness Level Mapping (Thresholds: Common [1-3], Moderate [4-6], Rare [7+])
    def categorize_uncommonness(score):
        if score >= 7:
            return 'Rare'
        elif score >= 4:
            return 'Moderate'
        else:
            return 'Common'

    df['Uncommonness Level'] = df['uncommonness'].apply(categorize_uncommonness)

    # 3c. Ethical Risk Assessment Logic
    def assess_ethical_risk(sensitivity_level, uncommonness_level):
        if sensitivity_level == 'High':
            return 'High Risk'
        elif sensitivity_level == 'Medium' and uncommonness_level in ['Moderate', 'Rare']:
            return 'Medium Risk'
        elif sensitivity_level == 'Low' and uncommonness_level == 'Rare':
            return 'Medium Risk'
        else:
            return 'Low Risk'

    df['Ethical Risk'] = df.apply(
        lambda row: assess_ethical_risk(row['Sensitivity Level'], row['Uncommonness Level']), axis=1
    )

    # 3d. Data Source Type Categorization
    def categorize_source_type(source_count):
        if source_count == 1:
            return 'Single-source'
        elif source_count == 2:
            return 'Cross-source'
        elif source_count >= 3:
            return 'Multi-context'
        else:
            return 'Error'

    df['Data Source Type'] = df['Source_Count'].apply(categorize_source_type)

    # --- 4. FINAL OUTPUT STRUCTURE ---

    # Select the presentation columns and rename them for display.
    df_final = df[final_columns].rename(columns=presentation_names)

    return df_final

In [None]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Required for comprehensive stop word removal
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def run_dynamic_profiling(json_file_path: str, n_clusters: int = 5) -> pd.DataFrame:
    """
    Analyzes a user profile JSON dataset, dynamically extracts conceptually relevant topics
    using NLP clustering, and categorizes all inferences across structured dimensions.

    Args:
        json_file_path (str): Path to the input JSON file (a list of records
            with 'combined_cols', 'combo_size' and 'gpt_output' -> 'inferences').
        n_clusters (int): Requested number of topics (K). It is capped at the
            number of inferences that have text, because K-Means cannot fit
            more clusters than samples.

    Returns:
        pd.DataFrame: One row per inference with the columns 'Inference',
        'Topic Category', 'Sensitivity Level', 'Uncommonness Level',
        'Ethical Risk', 'Data Source Type', 'Recommended Products' and
        'Combined Columns'. Rows that could not be clustered (no text, or an
        empty vocabulary) get the topic "Topic: Undefined / Low Variety". An
        input without inferences yields an empty frame with those columns.
    """

    with open(json_file_path, 'r') as f:
        json_data = json.load(f)

    # --- 1. DATA FLATTENING ---
    flattened_data = []
    for entry in json_data:
        combined_cols = entry.get('combined_cols', [])
        combo_size = entry.get('combo_size', 0)
        for inference_item in entry.get('gpt_output', {}).get('inferences', []):
            flattened_data.append({
                'Inference': inference_item.get('inference'),
                'uncommonness': inference_item.get('uncommonness'),
                'sensitivity': inference_item.get('sensitivity'),
                'Recommended_Product': inference_item.get('recommended_product'),
                'Combined_Columns': ', '.join(combined_cols),
                'Source_Count': combo_size
            })

    source_columns = [
        'Inference', 'uncommonness', 'sensitivity',
        'Recommended_Product', 'Combined_Columns', 'Source_Count'
    ]
    final_columns = [
        'Inference', 'Topic Category', 'Sensitivity Level', 'Uncommonness Level',
        'Ethical Risk', 'Data Source Type', 'Recommended_Product', 'Combined_Columns'
    ]
    presentation_names = {
        'Recommended_Product': 'Recommended Products',
        'Combined_Columns': 'Combined Columns'
    }

    df = pd.DataFrame(flattened_data, columns=source_columns)
    if df.empty:
        # Nothing to cluster or categorize; keep the output schema stable.
        return pd.DataFrame(columns=final_columns).rename(columns=presentation_names)

    # 1b. Text Preprocessing
    def preprocess(text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text

    # --- 2. DYNAMIC TOPIC EXTRACTION (CLUSTERING) ---

    fallback_label = "Topic: Undefined / Low Variety"
    cluster_labels = {}
    # Cluster ids are stored by row position rather than merged back on the
    # inference text: a text-based merge multiplies rows when the same sentence
    # occurs more than once. -1 marks rows that were not clustered.
    clusters = pd.Series(-1, index=df.index)

    has_text = df['Inference'].map(lambda text: isinstance(text, str) and text != "").astype(bool)
    text_index = df.index[has_text]

    if len(text_index) > 0:
        processed_data = df.loc[text_index, 'Inference'].apply(preprocess)

        # Define a comprehensive stop word list to filter out structural/vague words.
        custom_stop_words = list(ENGLISH_STOP_WORDS) + [
            'user', 'likely', 'appears', 'interested', 'indicated',
            'strong', 'shows', 'evidence', 'specific', 'types',
            'various', 'related', 'focus', 'content', 'searches'  # more generic filter terms
        ]

        tfidf_vectorizer = TfidfVectorizer(
            stop_words=custom_stop_words,
            max_features=50
        )
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform(processed_data)
        except ValueError:
            # fit_transform raises ValueError when every token was filtered out
            # (empty vocabulary), so the check has to happen here.
            tfidf_matrix = None
            print("WARNING: Vectorizer vocabulary is empty. Clustering skipped.")

        if tfidf_matrix is not None:
            n_clusters_actual = min(n_clusters, len(text_index))
            kmeans = KMeans(n_clusters=n_clusters_actual, random_state=42, n_init=10)
            kmeans.fit(tfidf_matrix)
            clusters.loc[text_index] = kmeans.labels_

            # --- 3. CONCEPTUAL LABELING ---
            feature_names = tfidf_vectorizer.get_feature_names_out()
            order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

            # Words not wanted in the final label itself
            filter_words_for_label = {'user', 'likely', 'topic', 'related', 'content'}

            for i in range(n_clusters_actual):
                # Take the 8 highest-weighted terms and keep the first 5 that pass the filter.
                top_words_all = [feature_names[ind] for ind in order_centroids[i, :8]]
                top_words_filtered = [
                    word for word in top_words_all if word not in filter_words_for_label
                ][:5]
                cluster_labels[i] = "Topic: " + " / ".join(top_words_filtered).title()

    # Apply the dynamic topic labels
    df['NLP_Cluster'] = clusters
    df['Topic Category'] = clusters.map(lambda c: cluster_labels.get(c, fallback_label))

    # --- 4. STRUCTURED CATEGORIZATION (Sensitivity, Risk, etc.) ---
    # NOTE(review): a missing score stored as NaN compares False against both
    # thresholds and therefore lands in the lowest bucket — confirm that is wanted.

    # 4a. Sensitivity Level Mapping (Thresholds: Low [1-3], Medium [4-6], High [7+])
    def categorize_sensitivity(score):
        if score >= 7: return 'High'
        elif score >= 4: return 'Medium'
        else: return 'Low'
    df['Sensitivity Level'] = df['sensitivity'].apply(categorize_sensitivity)

    # 4b. Uncommonness Level Mapping (Thresholds: Common [1-3], Moderate [4-6], Rare [7+])
    def categorize_uncommonness(score):
        if score >= 7: return 'Rare'
        elif score >= 4: return 'Moderate'
        else: return 'Common'
    df['Uncommonness Level'] = df['uncommonness'].apply(categorize_uncommonness)

    # 4c. Ethical Risk Assessment Logic
    def assess_ethical_risk(sensitivity_level, uncommonness_level):
        if sensitivity_level == 'High': return 'High Risk'
        elif sensitivity_level == 'Medium' and uncommonness_level in ['Moderate', 'Rare']: return 'Medium Risk'
        elif sensitivity_level == 'Low' and uncommonness_level == 'Rare': return 'Medium Risk'
        else: return 'Low Risk'
    df['Ethical Risk'] = df.apply(
        lambda row: assess_ethical_risk(row['Sensitivity Level'], row['Uncommonness Level']), axis=1
    )

    # 4d. Data Source Type Categorization
    def categorize_source_type(source_count):
        if source_count == 1: return 'Single-source'
        elif source_count == 2: return 'Cross-source'
        elif source_count >= 3: return 'Multi-context'
        else: return 'Error'
    df['Data Source Type'] = df['Source_Count'].apply(categorize_source_type)

    # --- 5. FINAL OUTPUT STRUCTURE ---
    df_final = df[final_columns].rename(columns=presentation_names)

    return df_final

In [None]:
# Cell 3: Run the analysis

# Path of the inference JSON on the mounted Drive. It is assigned again here
# because an earlier cell reassigns FILE_PATH to "labeled_inferences.json".
FILE_PATH = "/content/drive/MyDrive/Dissertation/Study 1/Inference_data/columns_[('Image_Search', 'Browser_history', 'Location_history', 'YT_search_history')]_inferenceNo_[3]_combinations_[1, 2, 3, 4]_20251021_005136.json"

# Call the function, requesting 5 topics
profile_analysis_df = run_dynamic_profiling(FILE_PATH, n_clusters=5)

# Report the table's shape, then show the distinct topic labels as the cell output
print(f"Analysis complete. Shape: {profile_analysis_df.shape}")
profile_analysis_df['Topic Category'].unique()