In [1]:
from rdflib import Graph, Namespace, RDF, OWL, XSD

# Namespaces
# All ESG terms live under one example namespace; the prefixes are bound so
# the Turtle file is written with ex:/xsd:/owl: instead of full IRIs.
EX = Namespace("http://example.org/esg#")
g = Graph()
for prefix, namespace in (("ex", EX), ("xsd", XSD), ("owl", OWL)):
    g.bind(prefix, namespace)

# === Ontology Classes ===
# "Company" is included because the instance-data cell types every company
# node as ex:Company; without it the schema would not cover that class.
classes = [
    "Company",
    "Industry",
    "ESGMetric",
    "Category",
    "Pillar",
    "CalculationModel",
    "ESGObservation"
]
for cls in classes:
    g.add((EX[cls], RDF.type, OWL.Class))

# === Object Properties ===
# hasCompany / hasMetric link an observation to its company and metric in the
# instance-data cell, so they are declared alongside the metric-level links.
object_properties = [
    "belongsToIndustry",
    "hasCategory",
    "hasPillar",
    "relatedToMetric",
    "hasCompany",
    "hasMetric"
]
for prop in object_properties:
    g.add((EX[prop], RDF.type, OWL.ObjectProperty))

# === Data Properties ===
# hasUnit carries a plain literal on observations in the instance-data cell.
data_properties = [
    "hasDataType",
    "hasYear",
    "hasValue",
    "hasUnit"
]
for prop in data_properties:
    g.add((EX[prop], RDF.type, OWL.DatatypeProperty))

# === Save schema to TTL ===
schema_path = "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_ontology_schema.ttl"
g.serialize(destination=schema_path, format="turtle")
print(f"✅ ESG ontology schema saved to: {schema_path}")

✅ ESG ontology schema saved to: /Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_ontology_schema.ttl


In [None]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, RDF, OWL, XSD, URIRef
import os
import re

# === Setup Namespaces ===
# Start a fresh graph for the instance data and register the same three
# prefixes (ex, xsd, owl) on it, in that order.
EX = Namespace("http://example.org/esg#")
g = Graph()
for prefix, namespace in (("ex", EX), ("xsd", XSD), ("owl", OWL)):
    g.bind(prefix, namespace)

# === URI Cleaner ===
def safe_uri(text):
    """Turn an arbitrary label into a URIRef inside the EX namespace.

    The label is stringified, trimmed and lower-cased; every character that is
    not a word character or a hyphen becomes "_", runs of underscores are
    collapsed to one, and leading/trailing underscores are removed.

    Args:
        text: Any value; in this notebook, a cell taken from the ESG data frame.

    Returns:
        URIRef: EX + slug. Missing values (per ``pd.isnull``) and labels whose
        slug comes out empty both map to EX + "undefined".
    """
    if pd.isnull(text):
        return URIRef(EX + "undefined")
    slug = re.sub(r'[^\w\-]', '_', str(text).strip().lower())
    slug = re.sub(r'__+', '_', slug).strip('_')
    # A label that is empty or made only of punctuation/whitespace slugs to "".
    # Returning EX + "" would hand back the bare namespace IRI and merge every
    # such label into that one node, so reuse the "undefined" placeholder.
    return URIRef(EX + (slug or "undefined"))

# === File Paths ===
base_path = "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data"
bio_file = os.path.join(base_path, "biopharma_sasb_final.csv")
semi_file = os.path.join(base_path, "semiconductors_sasb_final.csv")
# Semiconductor rows first, then biopharma, with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(semi_file), pd.read_csv(bio_file)], ignore_index=True)

# Clean columns: trim and lower-case the text columns.
# Missing cells are set to "undefined" explicitly. A bare astype(str) turns NaN
# into the text "nan", which would then be minted as a real-looking ex:nan
# entity instead of the ex:undefined placeholder safe_uri uses for missing
# values.
for col in ["company_name", "Industry", "metric_name", "model", "category", "pillar", "data_type_class"]:
    missing = df[col].isna()
    normalized = df[col].astype(str).str.strip().str.lower()
    df[col] = normalized.mask(missing, "undefined")

# Fill missing unit column if needed.
# When only one of the two CSVs has a metric_unit column, concat leaves NaN in
# the other file's rows, so individual gaps are filled as well.
if "metric_unit" not in df.columns:
    df["metric_unit"] = "unknown"
else:
    df["metric_unit"] = df["metric_unit"].fillna("unknown")

# Drop duplicates: rows identical on every field used below are emitted once.
df = df.drop_duplicates(subset=[
    "company_name", "Industry", "metric_name", "model", "category",
    "pillar", "year", "metric_value", "data_type_class", "metric_unit"
])

# === Generate RDF Triples ===
def _xsd_float(value):
    """Wrap a Python float as an xsd:float Literal with a valid lexical form.

    Python spells the special values "nan" / "inf" / "-inf", whereas xsd:float
    only accepts "NaN" / "INF" / "-INF", so those are handed to rdflib as
    explicit lexical strings. Ordinary numbers are passed through unchanged.
    """
    if value != value:  # NaN is the only float that is not equal to itself
        return Literal("NaN", datatype=XSD.float)
    if value in (float("inf"), float("-inf")):
        return Literal("INF" if value > 0 else "-INF", datatype=XSD.float)
    return Literal(value, datatype=XSD.float)


# One pass over the de-duplicated rows. Each row contributes the type
# declarations for its entities, the metric-level schema links, the
# model -> metric link, and one observation node with its data.
for _, row in df.iterrows():
    try:
        company = row["company_name"]
        industry = row["Industry"]
        metric = row["metric_name"]
        model = row["model"]
        category = row["category"]
        data_type = row["data_type_class"]
        pillar = row.get("pillar", "unknown")
        # Both conversions run before any triple is added, so a row that
        # fails here (e.g. a missing year) leaves no partial data in the graph.
        year = int(row["year"])
        value = float(row["metric_value"])
        unit = row.get("metric_unit", "unknown")

        # URIs
        company_uri = safe_uri(company)
        industry_uri = safe_uri(industry)
        metric_uri = safe_uri(metric)
        model_uri = safe_uri(model)
        category_uri = safe_uri(category)
        pillar_uri = safe_uri(pillar)
        # NOTE(review): the observation key omits model, metric_value and unit,
        # so rows that differ only in those fields share one observation node
        # (and can attach several hasValue triples to it) - confirm intended.
        obs_uri = safe_uri(f"{company}_{industry}_{metric}_{category}_{pillar}_{year}")

        # Class declarations
        g.add((company_uri, RDF.type, EX.Company))
        g.add((industry_uri, RDF.type, EX.Industry))
        g.add((metric_uri, RDF.type, EX.ESGMetric))
        g.add((model_uri, RDF.type, EX.CalculationModel))
        g.add((category_uri, RDF.type, EX.Category))
        g.add((pillar_uri, RDF.type, EX.Pillar))
        g.add((obs_uri, RDF.type, EX.ESGObservation))

        # Metric schema
        g.add((metric_uri, EX.belongsToIndustry, industry_uri))
        g.add((metric_uri, EX.hasCategory, category_uri))
        g.add((metric_uri, EX.hasPillar, pillar_uri))
        g.add((metric_uri, EX.hasDataType, Literal(data_type)))

        # Model linkage
        g.add((model_uri, EX.relatedToMetric, metric_uri))

        # Observation data
        g.add((obs_uri, EX.hasCompany, company_uri))
        g.add((obs_uri, EX.hasMetric, metric_uri))
        g.add((obs_uri, EX.hasYear, Literal(year, datatype=XSD.gYear)))
        # float() accepts NaN/inf (unlike int() above, which raises), so the
        # value goes through _xsd_float to keep the literal well-formed.
        g.add((obs_uri, EX.hasValue, _xsd_float(value)))
        g.add((obs_uri, EX.hasDataType, Literal(data_type)))
        g.add((obs_uri, EX.hasPillar, pillar_uri))
        g.add((obs_uri, EX.hasUnit, Literal(unit)))

    except Exception as e:
        # Best-effort export: report the problem and move on to the next row.
        print(f"Skipped row due to error: {e}")
        continue

# === Save RDF Graph ===
ttl_path = os.path.join(base_path, "esg_data_graph_with_company.ttl")
g.serialize(destination=ttl_path, format="turtle")
print(f"ESG RDF with company triples (with pillar and unit) saved to: {ttl_path}")


ESG RDF with company triples (with pillar and unit) saved to: /Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/esg_data_graph_with_company.ttl
