In [1]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, RDF, RDFS, OWL, XSD, URIRef
import os
import re


In [3]:
# === RDF Setup ===
# Project namespace under which every generated resource is minted.
EX = Namespace("http://example.org/esg/")

# Single graph that collects the schema and all observation triples.
g = Graph()

# Register prefixes so the serialized output uses short names.
for prefix, namespace in (
    ("ex", EX),
    ("xsd", XSD),
    ("rdfs", RDFS),
    ("owl", OWL),
):
    g.bind(prefix, namespace)

# === URI Cleaner ===
def safe_uri(text):
    """Turn arbitrary text into a URIRef inside the EX namespace.

    The text is stripped, every character that is not a word character or
    a hyphen is replaced by an underscore, runs of underscores are
    collapsed, and leading/trailing underscores are removed.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        URIRef: ``EX + <cleaned text>``. Null input (as judged by
        ``pd.isnull``) and input that cleans down to an empty string both
        map to ``EX + "Undefined"``.
    """
    if pd.isnull(text):
        return URIRef(EX + "Undefined")
    clean = re.sub(r'[^\w\-]', '_', str(text).strip())
    clean = re.sub(r'__+', '_', clean).strip('_')
    # Input such as "" or "???" leaves nothing after cleaning; without this
    # fallback the result would be the bare namespace IRI itself.
    return URIRef(EX + (clean or "Undefined"))

# === Paths ===
# The data directory can be overridden through the ESG_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location
# stays the default.
base_path = os.environ.get(
    "ESG_DATA_DIR",
    "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data",
)
biotech_file = os.path.join(base_path, "biopharma_sasb_filtered.csv")
semicon_file = os.path.join(base_path, "semiconductors_sasb_filtered.csv")
mapping_file = os.path.join(base_path, "esg_master_mapping_pillar_updated1.csv")

# === Load Data ===
biotech_df = pd.read_csv(biotech_file)
semicon_df = pd.read_csv(semicon_file)
mapping_df = pd.read_csv(mapping_file)

# === Merge Industry Files ===
combined_df = pd.concat([biotech_df, semicon_df], ignore_index=True)

# === Merge Mapping (left join) ===
# Every industry row is kept; rows without a (metric_name, Industry) match
# in the mapping file get nulls in the mapping columns.
combined_df = combined_df.merge(mapping_df,
                                 how="left",
                                 left_on=["metric_name", "Industry"],
                                 right_on=["Metric", "Industry"])

# Surface mapping misses instead of letting them pass silently. The check is
# guarded because the right-hand key column may have been renamed by a merge
# suffix if the industry files also carry a "Metric" column.
if "Metric" in combined_df.columns:
    unmatched_rows = int(combined_df["Metric"].isna().sum())
    if unmatched_rows:
        print(f"Warning: {unmatched_rows} of {len(combined_df)} rows have no match in the mapping file")

# === Ontology Schema Triples ===
# Local names of the vocabulary terms, grouped by the OWL kind they are
# declared as. EX[name] yields the same term as EX.<name>.
class_names = ["ESGObservation", "Company", "Industry", "Metric", "Category"]
object_property_names = ["hasCompany", "hasIndustry", "hasMetric", "hasCategory"]
datatype_property_names = [
    "hasValue",
    "hasYear",
    "hasMetricName",
    "hasMetricDescription",
    "hasDataType",
    "hasSASBCode",
]

# Classes first, then object properties, then datatype properties.
ontology_schema = (
    [(EX[name], RDF.type, OWL.Class) for name in class_names]
    + [(EX[name], RDF.type, OWL.ObjectProperty) for name in object_property_names]
    + [(EX[name], RDF.type, OWL.DatatypeProperty) for name in datatype_property_names]
)

# Load the declarations into the graph.
for triple in ontology_schema:
    g.add(triple)

# === Generate Triples ===
# One ESGObservation is emitted per row of combined_df, linked to its
# company, industry, metric and category resources. Rows that cannot be
# converted are reported with their index and skipped (best effort).
def _text_or_default(cell, default="unknown"):
    """Return `cell` unless it is null (None/NaN), in which case `default`."""
    return default if pd.isnull(cell) else cell


skipped_rows = 0
for row_index, row in combined_df.iterrows():
    try:
        company = row["company_name"]
        industry = row["Industry"]
        metric_label = row["metric_name"]
        # These values are interpolated into URIs; a null would otherwise
        # be rendered as the literal text "nan".
        if pd.isnull(company) or pd.isnull(industry) or pd.isnull(metric_label):
            raise ValueError("missing company_name, Industry or metric_name")

        # When standard_metric is missing, fall back to the raw metric name
        # so distinct metrics do not all collapse onto EX:Undefined and onto
        # one shared "..._nan" observation URI.
        metric = row["standard_metric"]
        if pd.isnull(metric):
            metric = metric_label

        year = int(row["year"])  # raises for null / non-numeric years
        value = float(row["metric_value_scaled"])
        if pd.isnull(value):
            # float() accepts NaN; do not write it out as a measurement.
            raise ValueError("missing metric_value_scaled")

        category = row["Topic"] if pd.notnull(row["Topic"]) else metric
        # "unknown" covers both an absent column and a null cell.
        sasb_code = _text_or_default(row.get("SASB_Code"))
        data_type = _text_or_default(row.get("data_type"))
        desc = row["metric_description"]

        # URIs
        obs_uri = safe_uri(f"{company}_{industry}_{year}_{metric}")
        company_uri = safe_uri(f"Company_{company}")
        industry_uri = safe_uri(f"Industry_{industry}")
        metric_uri = safe_uri(metric)
        category_uri = safe_uri(f"Category_{category}")

        # Add triples
        g.add((obs_uri, RDF.type, EX.ESGObservation))
        g.add((obs_uri, EX.hasCompany, company_uri))
        g.add((obs_uri, EX.hasIndustry, industry_uri))
        g.add((obs_uri, EX.hasMetric, metric_uri))
        g.add((obs_uri, EX.hasCategory, category_uri))
        g.add((obs_uri, EX.hasValue, Literal(value, datatype=XSD.float)))
        g.add((obs_uri, EX.hasYear, Literal(year, datatype=XSD.gYear)))
        g.add((obs_uri, EX.hasDataType, Literal(data_type)))
        g.add((obs_uri, EX.hasSASBCode, Literal(sasb_code)))

        # Define classes
        g.add((company_uri, RDF.type, EX.Company))
        g.add((industry_uri, RDF.type, EX.Industry))
        g.add((metric_uri, RDF.type, EX.Metric))
        g.add((category_uri, RDF.type, EX.Category))
        g.add((metric_uri, EX.hasMetricName, Literal(metric_label)))
        # A null description would become a NaN literal; omit the triple.
        if pd.notnull(desc):
            g.add((metric_uri, EX.hasMetricDescription, Literal(desc)))
    except Exception as e:
        # Deliberate best effort: report which row failed and move on.
        skipped_rows += 1
        print(f"❌ Error with row {row_index}: {e}")

if skipped_rows:
    print(f"Skipped {skipped_rows} of {len(combined_df)} rows")

In [4]:
# === Serialize RDF ===
# Write the graph as Turtle into the same directory the inputs came from.
output_file = os.path.join(base_path, "sasb_industry_enhanced.ttl")
g.serialize(
    destination=output_file,
    format="turtle",
)
print(f"RDF exported to: {output_file}")

RDF exported to: /Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/sasb_industry_enhanced.ttl
