In [1]:
import json
import random
import uuid
from copy import deepcopy
from datetime import datetime, timezone
from pathlib import Path


In [2]:
# Parse the raw export as JSON Lines: one investor record per non-blank line.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped so json.loads is never handed an empty string.
with open("investors_raw.txt", "r", encoding="utf-8") as f:
    real_investors = [json.loads(line) for line in f if line.strip()]

len(real_investors)


53

In [3]:
# Reference schema: the field names of the first real investor record.
# Iterating a dict yields its keys, so set(record) is the key set.
BASE_SCHEMA_KEYS = set(real_investors[0])
BASE_SCHEMA_KEYS


{'confidence_score',
 'created_at',
 'firm_name',
 'id',
 'investment_stage_pref',
 'investment_thesis',
 'investor_type',
 'is_active_investing',
 'linkedin_url',
 'location',
 'name',
 'notable_investments',
 'past_investments',
 'primary_domain',
 'profile_url',
 'secondary_domains',
 'short_bio',
 'source',
 'tags',
 'total_investments_count',
 'twitter_url',
 'updated_at',
 'website'}

In [4]:
# NOTE(review): this cell repeats the preceding cell verbatim; it rebuilds the
# same key set from the first real investor record.
BASE_SCHEMA_KEYS = set(real_investors[0].keys())
BASE_SCHEMA_KEYS


{'confidence_score',
 'created_at',
 'firm_name',
 'id',
 'investment_stage_pref',
 'investment_thesis',
 'investor_type',
 'is_active_investing',
 'linkedin_url',
 'location',
 'name',
 'notable_investments',
 'past_investments',
 'primary_domain',
 'profile_url',
 'secondary_domains',
 'short_bio',
 'source',
 'tags',
 'total_investments_count',
 'twitter_url',
 'updated_at',
 'website'}

In [8]:
# Pool of values for a synthetic investor's "location" field; one entry is
# chosen at random for each generated record.
LOCATIONS = [
    "San Francisco, California",
    "New York, New York",
    "Bangalore, India",
    "London, UK",
    "Singapore",
    "Berlin, Germany"
]

# Pools from which synthetic investor names are assembled.
FIRST_NAMES = ["Alex", "Chris", "Jordan", "Taylor", "Sam", "Ryan"]
LAST_NAMES = ["Kapoor", "Smith", "Chen", "Patel", "Müller", "Brown"]

def mutate_name():
    """Return a random "<first> <last>" name drawn from the two name pools."""
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    return " ".join((first, last))

def mutate_firm(firm):
    """Build a synthetic firm name of the form "<base> <suffix>".

    Args:
        firm: The source firm name. Any value is accepted; non-string values
            are converted with ``str()`` before being split into words.

    Returns:
        A string made of the first whitespace-separated word of ``firm``
        followed by a random suffix. When ``firm`` is falsy or contains only
        whitespace, a random placeholder word is used as the base instead.
    """
    suffix = random.choice(["Capital", "Ventures", "Partners", "Fund"])

    # Convert once so the blank check and the split operate on the same text;
    # previously the check used str(firm) but the split called firm.strip(),
    # which raised for truthy non-string values.
    words = str(firm).split() if firm else []

    if words:
        base = words[0]
    else:
        base = random.choice([
            "Alpha", "Nova", "Pioneer", "Summit", "Vertex", "Catalyst"
        ])

    return f"{base} {suffix}"


def mutate_past_investments(past):
    """Return a random subset of ``past`` with each entry's "year" randomised.

    Args:
        past: A list of mapping entries, or a falsy value (``None``/empty).

    Returns:
        A new list holding between 3 and 6 entries (fewer when ``past`` is
        shorter), each a shallow copy of a sampled entry whose "year" key is
        set to a random integer in 2015..2025. Returns ``[]`` for falsy input.

    The entries in ``past`` are never modified: assigning "year" on the
    sampled objects directly would rewrite the caller's records, because
    ``random.sample`` returns the original objects rather than copies.
    """
    if not past:
        return []

    count = min(len(past), random.randint(3, 6))
    return [
        {**entry, "year": random.randint(2015, 2025)}
        for entry in random.sample(past, k=count)
    ]


In [9]:
def mutate_investor(base, idx):
    """Create one synthetic investor record derived from a real one.

    Args:
        base: The source investor record (a dict). It is deep-copied and is
            not modified by this function.
        idx: Position of the record in the generation run. Not used by the
            body; kept so existing call sites continue to work.

    Returns:
        A new dict with the same keys as ``base`` in which the id, name,
        firm name, location, sampled list fields, investment counts,
        confidence score and timestamps have been replaced.
    """
    inv = deepcopy(base)

    inv["id"] = str(uuid.uuid4())
    inv["name"] = mutate_name()
    inv["firm_name"] = mutate_firm(base.get("firm_name", "Generic VC"))
    inv["location"] = random.choice(LOCATIONS)

    # `or []` covers both a missing key and an explicit null value.
    secondary = inv.get("secondary_domains") or []
    inv["secondary_domains"] = random.sample(secondary, k=min(4, len(secondary)))

    stages = inv.get("investment_stage_pref") or []
    inv["investment_stage_pref"] = random.sample(stages, k=min(3, len(stages)))

    # Pass the deep-copied entries, not the ones owned by `base`, so that any
    # in-place edits made while mutating them cannot reach the source record.
    inv["past_investments"] = mutate_past_investments(
        inv.get("past_investments") or []
    )

    inv["total_investments_count"] = len(inv["past_investments"]) + random.randint(5, 20)
    inv["confidence_score"] = round(random.uniform(0.75, 0.95), 2)

    # Same naive-UTC ISO text plus "Z" as before, without the deprecated
    # datetime.utcnow().
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    inv["created_at"] = now
    inv["updated_at"] = now

    return inv


In [10]:
# Generate 200 synthetic investors, each derived from a randomly chosen real
# record; the loop index is handed to mutate_investor as its second argument.
synthetic = []

for i in range(200):
    base = random.choice(real_investors)
    synthetic.append(mutate_investor(base, i))

len(synthetic)


200

In [11]:
# Every synthetic record must expose exactly the reference key set; abort on
# the first record that does not.
if any(set(inv.keys()) != BASE_SCHEMA_KEYS for inv in synthetic):
    raise ValueError("❌ Schema mismatch detected")

print("✅ All synthetic investors match schema")


✅ All synthetic investors match schema


In [12]:
final_investors = real_investors + synthetic

# Write into data/: the Excel export cell later in this notebook reads
# "data/investors_expanded_200.json", so writing next to the notebook would
# leave that cell without its input on a fresh Restart & Run All.
output_path = Path("data") / "investors_expanded_200.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    json.dump(final_investors, f, indent=2)

len(final_investors)


253

In [13]:
import json
import random
import uuid
from copy import deepcopy
from datetime import datetime


In [14]:
# Parse the raw export as JSON Lines: one founder record per non-blank line.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped so json.loads is never handed an empty string.
with open("founders_raw.txt", "r", encoding="utf-8") as f:
    real_founders = [json.loads(line) for line in f if line.strip()]

len(real_founders)


264

In [15]:
# Rebind the reference schema to the founder records: the field names of the
# first real founder. This replaces the investor key set held under the same
# name earlier in the notebook.
BASE_SCHEMA_KEYS = set(real_founders[0])
BASE_SCHEMA_KEYS


{'company',
 'competitors',
 'created_at',
 'domain',
 'id',
 'name',
 'past_funding',
 'umbrella_companies',
 'valuation'}

In [19]:
# Suffixes appended to a company base word, and stems used when a replacement
# domain has to be invented.
COMPANY_SUFFIX = ["Health", "Labs", "Systems", "Tech", "AI", "Solutions"]
DOMAINS = ["health", "ai", "med", "care", "bio"]

def mutate_company(name):
    """Build a synthetic company name of the form "<base><suffix>".

    Args:
        name: The source name. Any value is accepted; non-string values are
            converted with ``str()`` before being split into words.

    Returns:
        The first whitespace-separated word of ``name`` immediately followed
        by a random entry of ``COMPANY_SUFFIX``. When ``name`` is falsy or
        contains only whitespace, a random placeholder word is the base.
    """
    # Convert once so the blank check and the split operate on the same text;
    # previously the check used str(name) but the split called name.split(),
    # which raised for truthy non-string values.
    words = str(name).split() if name else []

    if words:
        base = words[0]
    else:
        base = random.choice(["Nova", "Alpha", "Pulse", "Core", "Vertex"])

    return f"{base}{random.choice(COMPANY_SUFFIX)}"

def mutate_domain(domain):
    """Return a synthetic ".com" domain derived from ``domain``.

    When ``domain`` is truthy and contains a dot, the text before the first
    dot is kept and a random number from 1 to 99 is appended. Otherwise a
    "<stem>tech.com" name is built from a random entry of ``DOMAINS``.
    """
    has_dot = bool(domain) and "." in domain
    if has_dot:
        label, _, _ = domain.partition(".")
        return f"{label}{random.randint(1,99)}.com"
    return f"{random.choice(DOMAINS)}tech.com"

def mutate_funding(pf):
    """Return a funding dict with a random year and the source round/amount.

    The result always has the keys "year", "round" and "amount", in that
    order. "year" is a random integer in 2018..2025. "round" and "amount" are
    read from ``pf`` and fall back to "seed" and "$1M" when ``pf`` is falsy or
    lacks the key.
    """
    source = pf if pf else {}
    year = random.randint(2018, 2025)
    funding_round = source.get("round", "seed")
    amount = source.get("amount", "$1M")
    return {"year": year, "round": funding_round, "amount": amount}


In [20]:
def mutate_founder(base):
    """Create one synthetic founder record derived from a real one.

    Args:
        base: The source founder record (a dict). It is deep-copied and is
            not modified by this function.

    Returns:
        A new dict with the same keys as ``base`` in which the id, name,
        company, domain, past funding, valuation, competitors, umbrella
        companies and creation timestamp have been replaced. "name" and
        "company" receive the same generated value.
    """
    f = deepcopy(base)

    f["id"] = str(uuid.uuid4())
    f["name"] = mutate_company(base.get("name"))
    f["company"] = f["name"]
    f["domain"] = mutate_domain(base.get("domain"))

    f["past_funding"] = mutate_funding(base.get("past_funding"))

    # Treat a missing or null valuation as 0, and never let the random shift
    # (which can be as low as -2,000,000) push the result below zero.
    shifted = (base.get("valuation") or 0) + random.randint(-2_000_000, 5_000_000)
    f["valuation"] = max(0, shifted)

    # `or []` covers both a missing key and an explicit null value.
    competitors = base.get("competitors") or []
    f["competitors"] = random.sample(
        competitors,
        k=min(len(competitors), random.randint(1, 2))
    )

    umbrellas = base.get("umbrella_companies") or []
    f["umbrella_companies"] = random.sample(
        umbrellas,
        k=min(len(umbrellas), 1)
    )

    # Same naive-UTC ISO text as before, without the deprecated
    # datetime.utcnow().
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    f["created_at"] = now

    return f


In [21]:
# Top the founder dataset up to 500 records in total: generate one synthetic
# founder per missing record, each derived from a randomly chosen real one.
synthetic = []

for _ in range(max(0, 500 - len(real_founders))):
    base = random.choice(real_founders)
    synthetic.append(mutate_founder(base))

len(synthetic)


236

In [22]:
# Every synthetic record must expose exactly the reference key set; abort on
# the first record that does not.
if any(set(record.keys()) != BASE_SCHEMA_KEYS for record in synthetic):
    raise ValueError("❌ Schema mismatch")

print("✅ Schema validated for all synthetic founders")


✅ Schema validated for all synthetic founders


In [23]:
final_founders = real_founders + synthetic

# Write into data/: the Excel export cell later in this notebook reads
# "data/founders_expanded_500.json", so writing next to the notebook would
# leave that cell without its input on a fresh Restart & Run All.
output_path = Path("data") / "founders_expanded_500.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    json.dump(final_founders, f, indent=2)

len(final_founders)


500

In [24]:
import json
import pandas as pd


In [25]:
def flatten_json(record):
    """Flatten a JSON-style record into a single-level dict of cell values.

    Args:
        record: A dict whose values may be scalars, lists or nested dicts.

    Returns:
        A new dict in which:
        - nested dicts are expanded at any depth, joining the key path with
          "_" (``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``);
        - lists are stored as JSON strings, wherever they occur;
        - every other value is copied as-is.
        Key order follows the traversal order of ``record``. An empty nested
        dict contributes no keys.
    """
    flat = {}

    def _walk(prefix, mapping):
        # Recursively copy `mapping` into `flat`, prefixing keys with the path.
        for key, value in mapping.items():
            column = f"{prefix}_{key}" if prefix else key

            if isinstance(value, dict):
                _walk(column, value)
            elif isinstance(value, list):
                # store lists as JSON string (Excel-safe)
                flat[column] = json.dumps(value, ensure_ascii=False)
            else:
                flat[column] = value

    _walk("", record)
    return flat


In [26]:
# Read the expanded founders dataset back from disk.
with open("data/founders_expanded_500.json", "r", encoding="utf-8") as source:
    founders = json.load(source)

# One flat dict per record, then a single DataFrame built from all of them.
founders_flat = list(map(flatten_json, founders))
df_founders = pd.DataFrame(founders_flat)

# Export to Excel without the index column, using openpyxl as the engine.
df_founders.to_excel("data/founders_expanded_500.xlsx", index=False, engine="openpyxl")

df_founders.shape


(500, 11)

In [27]:
# Read the expanded investors dataset back from disk.
with open("data/investors_expanded_200.json", "r", encoding="utf-8") as source:
    investors = json.load(source)

# One flat dict per record, then a single DataFrame built from all of them.
investors_flat = list(map(flatten_json, investors))
df_investors = pd.DataFrame(investors_flat)

# Export to Excel without the index column, using openpyxl as the engine.
df_investors.to_excel("data/investors_expanded_200.xlsx", index=False, engine="openpyxl")

df_investors.shape


(253, 23)