In [None]:
from rdflib import Graph, Namespace, RDF, OWL, XSD

# Namespace and empty graph that will hold only the ESG schema declarations.
EX = Namespace("http://example.org/esg#")
g = Graph()
for prefix, namespace in (("ex", EX), ("xsd", XSD), ("owl", OWL)):
    g.bind(prefix, namespace)

# Vocabulary terms, grouped by the OWL construct each one is declared as.
classes = [
    "Company", "Industry", "ESGMetric", "Category",
    "Pillar", "CalculationModel", "ESGObservation",
]
object_properties = [
    "hasCompany", "hasMetric", "hasCategory",
    "hasPillar", "belongsToIndustry", "relatedToMetric",
]
data_properties = ["hasYear", "hasValue", "hasUnit", "hasDataType"]

# Declare every term as: ex:<term> rdf:type <owl construct>.
for owl_kind, local_names in (
    (OWL.Class, classes),
    (OWL.ObjectProperty, object_properties),
    (OWL.DatatypeProperty, data_properties),
):
    for local_name in local_names:
        g.add((EX[local_name], RDF.type, owl_kind))

# Write the schema graph to disk in Turtle syntax and report the location.
schema_path = "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_ontology_schema.ttl"
g.serialize(format="turtle", destination=schema_path)
print(f" ESG ontology schema saved to: {schema_path}")

✅ ESG ontology schema saved to: /Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_ontology_schema.ttl


In [2]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, RDF, OWL, XSD, URIRef
import os
import re

# === Fresh graph for the instance data, with the prefixes used when serializing
EX = Namespace("http://example.org/esg#")
g = Graph()
for prefix, namespace in (("ex", EX), ("xsd", XSD), ("owl", OWL)):
    g.bind(prefix, namespace)

# === URI Cleaner
def safe_uri(text):
    """Build a URI in the ``EX`` namespace from arbitrary text.

    The text is stripped and lower-cased, every character that is not a word
    character or ``-`` is replaced by ``_``, runs of ``_`` are collapsed into
    one, and leading/trailing ``_`` are removed.

    Args:
        text: Any value; it is converted with ``str()`` unless it is null.

    Returns:
        URIRef: ``EX + <slug>``. Null input, and input whose slug would be
        empty (for example ``""`` or ``"???"``), map to ``EX + "undefined"``
        so the result is never the bare namespace URI.
    """
    if pd.isnull(text):
        return URIRef(EX + "undefined")
    slug = re.sub(r'[^\w\-]', '_', str(text).strip().lower())
    slug = re.sub(r'__+', '_', slug).strip('_')
    # An empty slug would yield the namespace URI itself and silently merge
    # unrelated resources into one node; reuse the null placeholder instead.
    return URIRef(EX + (slug or "undefined"))

# === Load SASB Final Files
# The data directory can be overridden with the ESG_DATA_DIR environment
# variable; when it is unset the original location is used.
base_path = os.environ.get(
    "ESG_DATA_DIR",
    "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data",
)
bio_path = os.path.join(base_path, "biopharma_sasb_final.csv")
semi_path = os.path.join(base_path, "semiconductors_sasb_final.csv")
df = pd.concat([pd.read_csv(semi_path), pd.read_csv(bio_path)], ignore_index=True)

# === Preprocess
# "pillar" and "metric_unit" are treated as optional: an absent column is
# created with the placeholder "unknown" instead of raising KeyError below.
for optional_col in ("pillar", "metric_unit"):
    if optional_col not in df.columns:
        df[optional_col] = "unknown"

# Normalise the text columns that feed URI construction. Missing values are
# filled first because astype(str) would otherwise turn NaN into the literal
# string "nan", which would then be minted as a real resource name.
for col in ["company_name", "Industry", "metric", "category", "pillar"]:
    df[col] = df[col].fillna("unknown").astype(str).str.strip().str.lower()

# Missing units inside an existing column also fall back to "unknown".
df["metric_unit"] = df["metric_unit"].fillna("unknown")
df = df.drop_duplicates()

# === Generate Triples
# Each row becomes one ESGObservation node linked to its company, industry,
# metric, category and pillar. A row that cannot be converted is reported and
# skipped so that one bad record does not abort the whole export.
for _, row in df.iterrows():
    try:
        company = row["company_name"]
        industry = row["Industry"]
        metric = row["metric"]
        category = row["category"]
        pillar = row.get("pillar", "unknown")
        model = metric + "_model"  # Dynamic model name from metric
        year = int(row["year"])

        # float(NaN) succeeds, so a missing value has to be rejected
        # explicitly; otherwise the observation is stored with a NaN literal
        # while a missing year (int(NaN) raises) causes the row to be skipped.
        value = float(row["metric_value"])
        if pd.isnull(value):
            raise ValueError("metric_value is missing")

        # A NaN unit would be emitted as a numeric NaN literal; use the same
        # placeholder as for an absent column.
        unit = row.get("metric_unit", "unknown")
        if pd.isnull(unit):
            unit = "unknown"

        # URIs
        company_uri = safe_uri(company)
        industry_uri = safe_uri(industry)
        # The metric URI is qualified by industry; the model URI is not.
        metric_uri = safe_uri(f"{metric}_{industry}")
        category_uri = safe_uri(category)
        model_uri = safe_uri(model)
        pillar_uri = safe_uri(pillar)
        obs_uri = safe_uri(f"{company}_{industry}_{metric}_{category}_{pillar}_{year}")

        # === Class Declarations
        g.add((company_uri, RDF.type, EX.Company))
        g.add((industry_uri, RDF.type, EX.Industry))
        g.add((metric_uri, RDF.type, EX.ESGMetric))
        g.add((model_uri, RDF.type, EX.CalculationModel))
        g.add((category_uri, RDF.type, EX.Category))
        g.add((pillar_uri, RDF.type, EX.Pillar))
        g.add((obs_uri, RDF.type, EX.ESGObservation))

        # === Metric Schema (hasCategory is attached to the observation below,
        # not to the metric)
        g.add((obs_uri, EX.belongsToIndustry, industry_uri))
        g.add((metric_uri, EX.hasPillar, pillar_uri))

        # === Model-Metric Link
        g.add((model_uri, EX.relatedToMetric, metric_uri))

        # === Observation Details
        g.add((obs_uri, EX.hasCompany, company_uri))
        g.add((obs_uri, EX.hasMetric, metric_uri))
        g.add((obs_uri, EX.hasCategory, category_uri))
        g.add((obs_uri, EX.hasYear, Literal(year, datatype=XSD.gYear)))
        g.add((obs_uri, EX.hasValue, Literal(value, datatype=XSD.float)))
        g.add((obs_uri, EX.hasUnit, Literal(unit)))
        g.add((obs_uri, EX.hasPillar, pillar_uri))

    except Exception as e:
        # Deliberate best-effort: report the problem and move on to the next row.
        print(f"Skipped row due to error: {e}")

# === Write the populated data graph next to the input CSVs in Turtle syntax
ttl_path = os.path.join(base_path, "esg_data_graph_clean.ttl")
g.serialize(format="turtle", destination=ttl_path)
print(f"RDF successfully saved to:\n{ttl_path}")

RDF successfully saved to:
/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_data_graph_clean.ttl
