In [None]:
# %%
# 🧠 LLM Reasoning & Personality Knowledge Graph Challenge
# Notebook: 01_pipeline.ipynb
# Author: Onyekachukwu Ekesi
# Description: End-to-end pipeline to extract knowledge graph and model personalities from text.

# %%
# 📦 Import libraries

import json
import os
import random
import re
from pprint import pprint

import networkx as nx
import pandas as pd
import spacy

In [None]:
# Load the spaCy pipeline named "en_core_web_sm" once; extract_entities() calls `nlp` for NER.
# NOTE(review): presumably the en_core_web_sm model package must be installed beforehand — confirm in setup docs.
nlp = spacy.load("en_core_web_sm")

# %%
# 🧩 Step 1: Generate synthetic dataset
# In a real scenario, an LLM (e.g. ChatGPT via the OpenAI API) could generate the bios automatically.
# For simplicity, we define a few synthetic examples here.

# Bio texts in id order; ids are assigned sequentially starting at 1.
_bio_texts = (
    "John is a dedicated software engineer at Intellumia. He loves solving problems, mentoring juniors, and spends weekends volunteering. He is known for being very organized and dependable.",
    "Mary is an artist who enjoys exploring new ideas and meeting people. She works independently, often collaborates with international designers, and has a cheerful personality.",
    "David, a financial analyst at NovaCorp, prefers working alone on complex models. Though introverted, he is analytical, precise, and sometimes anxious about deadlines.",
)

synthetic_bios = [
    {"id": bio_id, "text": bio_text}
    for bio_id, bio_text in enumerate(_bio_texts, start=1)
]

# One row per bio, with columns "id" and "text".
df = pd.DataFrame(synthetic_bios)
df

In [None]:
# %%
# 💬 Step 2: Named Entity Recognition (NER) using spaCy

def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and list its named entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline reports them.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Store each bio's (text, label) entity pairs, then show them next to the ids.
df["entities"] = [extract_entities(bio_text) for bio_text in df["text"]]
df.loc[:, ["id", "entities"]]

In [None]:
# %%
# 🔗 Step 3: Relation Extraction (Rule-based mockup)
# Here, we simulate relationship triples: (subject, relation, object)

def extract_relations(text, subject=None):
    """Extract (subject, relation, object) triples from a bio using keyword rules.

    This is still a rule-based mockup: each rule fires when its keyword occurs
    in ``text`` (case-sensitive substring match) and always emits the same
    relation/object pair. The subject is no longer hard-coded per rule, so a
    bio about someone else that mentions "engineer" is not attributed to John.

    Args:
        text: Biography text to scan.
        subject: Name to use as the subject of every triple. When omitted, the
            first whitespace-separated token of ``text`` is used with
            surrounding punctuation stripped (e.g. "David," -> "David"). This
            only works for bios that start with the person's name.

    Returns:
        A list of ``(subject, relation, object)`` tuples in rule order; empty
        when no keyword matches.
    """
    if subject is None:
        tokens = text.split()
        subject = tokens[0].strip(",.;:!?\"'()") if tokens else ""

    # (keyword, relation, object) — order determines the order of the output.
    rules = (
        ("engineer", "works_as", "Software Engineer"),
        ("artist", "works_as", "Artist"),
        ("analyst", "works_as", "Financial Analyst"),
        ("Intellumia", "works_at", "Intellumia"),
        ("NovaCorp", "works_at", "NovaCorp"),
    )
    return [(subject, relation, obj) for keyword, relation, obj in rules if keyword in text]

# Store each bio's relation triples, then show them next to the ids.
df["relations"] = [extract_relations(bio_text) for bio_text in df["text"]]
df.loc[:, ["id", "relations"]]

In [None]:
# %%
# 🧬 Step 4: Simulated LLM Personality Inference
# This step would normally use a chain of LLM prompts to infer Big Five traits.
# For now, we'll simulate it using keyword cues.

def infer_personality(text):
    """Simulate Big Five trait inference from keyword cues in a bio.

    Every trait starts at a neutral 0.5. Each cue found in ``text``
    (case-insensitive) shifts one trait by a fixed amount, and the final
    scores are clamped to the range [0.0, 1.0].

    A cue must begin at a word boundary, so inflections such as "anxiously"
    still count while negated forms such as "disorganized" do not.

    Args:
        text: Biography text to scan.

    Returns:
        A dict mapping each of the five trait names to a float in [0.0, 1.0].
    """
    traits = {
        "Openness": 0.5,
        "Conscientiousness": 0.5,
        "Extraversion": 0.5,
        "Agreeableness": 0.5,
        "Neuroticism": 0.5,
    }

    # cue word -> (trait it affects, signed adjustment)
    cues = {
        "organized": ("Conscientiousness", +0.3),
        "dependable": ("Conscientiousness", +0.2),
        "cheerful": ("Extraversion", +0.3),
        "introverted": ("Extraversion", -0.3),
        "analytical": ("Openness", +0.3),
        "anxious": ("Neuroticism", +0.4),
        "volunteering": ("Agreeableness", +0.3),
        "exploring": ("Openness", +0.2),
    }

    lowered = text.lower()  # lower-case once instead of once per cue
    for word, (trait, effect) in cues.items():
        # \b anchors the cue to the start of a word; a plain substring test
        # would also match inside "disorganized" or "unorganized".
        if re.search(r"\b" + re.escape(word), lowered):
            traits[trait] += effect

    # Clamp with float bounds so every score stays a float (int bounds can
    # hand back the int 1 when a score reaches exactly 1.0).
    return {trait: min(1.0, max(0.0, score)) for trait, score in traits.items()}

In [None]:
df["personality"] = df["text"].apply(infer_personality)

# Pretty-print each bio's trait dict. A plain loop is used because
# DataFrame.apply is meant for computing values: using it only for the
# print side effect also renders a Series of None as the cell output.
for bio_id, traits in zip(df["id"], df["personality"]):
    print(f"Bio {bio_id}:")
    pprint(traits)

In [None]:
# %%
# 🌐 Step 5: Build Knowledge Graph

# Undirected graph holding three kinds of nodes: Person, Entity and Trait.
G = nx.Graph()

for _, row in df.iterrows():
    words = row["text"].split()
    if not words:
        # An empty bio has no leading name to anchor a Person node on.
        continue

    # Crude name extraction: first token of the bio, with surrounding
    # punctuation removed so "David," and the relation subject "David"
    # refer to the same node.
    person_name = words[0].strip(",.;:!?\"'()")
    G.add_node(person_name, type="Person")

    # Add profession / employer relations
    for subj, rel, obj in row["relations"]:
        if subj not in G:
            # add_edge would create this node implicitly, but without a type.
            G.add_node(subj, type="Person")
        G.add_node(obj, type="Entity")
        G.add_edge(subj, obj, relation=rel)

    # Add one trait node per (person, trait) pair; the score is stored both
    # on the node and as the weight of the has_trait edge.
    for trait, score in row["personality"].items():
        trait_node = f"{person_name}_{trait}"
        G.add_node(trait_node, type="Trait", score=score)
        G.add_edge(person_name, trait_node, relation="has_trait", weight=score)

# Export to GraphML, creating the output directory first so a fresh checkout
# does not fail on a missing folder.
os.makedirs("data/output", exist_ok=True)
nx.write_graphml(G, "data/output/knowledge_graph.graphml")

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

In [None]:
# %%
# 🧭 Step 6: Inspect sample triples
# Print at most the first ten edges, one "node -[relation]-> node" line each.
for position, (source, target, attrs) in enumerate(G.edges(data=True)):
    if position == 10:
        break
    print(f"{source} -[{attrs['relation']}]-> {target}")

In [None]:
# %%
# 📊 Step 7: Evaluate (Simple Mock Evaluation)

# Assume ground truth personality scores (for demonstration)
# Reference scores per person; only the traits listed here are evaluated.
ground_truth = dict(
    John=dict(Conscientiousness=0.8, Agreeableness=0.7),
    Mary=dict(Extraversion=0.8, Openness=0.7),
    David=dict(Neuroticism=0.7, Openness=0.6),
)

def simple_eval(pred, truth):
    """Compute the absolute error between predicted and reference trait scores.

    For each person in ``truth``, the first row of ``pred`` whose "text"
    contains the person's name supplies the predicted trait dict. A trait
    missing from the prediction is treated as the neutral score 0.5.

    Args:
        pred: DataFrame with a "text" column and a "personality" column
            holding one trait -> score dict per row.
        truth: dict mapping person name -> {trait: reference score}.

    Returns:
        DataFrame with columns "Person", "Trait" and "Error" (absolute
        difference rounded to 2 decimals), one row per reference trait.
        The columns are present even when ``truth`` is empty.

    Raises:
        IndexError: if no row of ``pred`` mentions a person from ``truth``.
    """
    results = []
    for person, traits in truth.items():
        # Build the mask from `pred` itself rather than a global frame, match
        # the name literally (not as a regex) and treat missing text as no match.
        mentions = pred["text"].str.contains(person, regex=False, na=False)
        if not mentions.any():
            raise IndexError(f"no row in pred mentions {person!r}")
        pred_traits = pred.loc[mentions, "personality"].iloc[0]
        for t, val in traits.items():
            diff = abs(pred_traits.get(t, 0.5) - val)
            results.append({"Person": person, "Trait": t, "Error": round(diff, 2)})
    return pd.DataFrame(results, columns=["Person", "Trait", "Error"])

# Score the predicted traits against the reference scores and show the error table.
eval_df = simple_eval(pred=df, truth=ground_truth)
eval_df

In [None]:
# %%
# 💾 Step 8: Save results
# Create the output directory first so a fresh checkout does not fail on a
# missing folder, then write the per-bio results and the evaluation table.
os.makedirs("data/output", exist_ok=True)
df.to_json("data/output/results.json", orient="records", indent=2)
eval_df.to_csv("data/output/evaluation.csv", index=False)

print("✅ Pipeline completed and results saved.")