In [8]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, RDF, RDFS, OWL, XSD, URIRef
import os
import re

In [10]:
# === RDF Graph Initialization ===
# Every project-specific term is minted under the EX namespace; the graph
# registers short prefixes for EX and the standard vocabularies it uses.
EX = Namespace("http://example.org/esg/")
g = Graph()
for prefix, namespace in {"ex": EX, "xsd": XSD, "rdfs": RDFS, "owl": OWL}.items():
    g.bind(prefix, namespace)

# === Helper to Clean URIs ===
def safe_uri(text):
    """Build a URIRef in the EX namespace from arbitrary text.

    The text is stringified and stripped, every character that is neither a
    word character nor a hyphen is replaced by an underscore, runs of
    underscores are collapsed to one, and leading/trailing underscores are
    removed. The result is appended to the EX namespace.

    Args:
        text: Any scalar value; it is converted with ``str()``.

    Returns:
        URIRef: ``EX + <cleaned text>``, or ``EX + "Undefined"`` when ``text``
        is null (per ``pd.isnull``) or nothing is left after cleaning.
    """
    undefined = URIRef(EX + "Undefined")
    if pd.isnull(text):
        return undefined
    clean = re.sub(r'[^\w\-]', '_', str(text).strip())
    clean = re.sub(r'__+', '_', clean).strip('_')
    # Input made only of punctuation/whitespace cleans down to "". Returning
    # EX + "" would hand back the bare namespace URI for every such input, so
    # fall back to the same sentinel used for null values.
    if not clean:
        return undefined
    return URIRef(EX + clean)

# === Set Your Local Path Here ===
# The data directory can be overridden without editing the notebook by setting
# the ESG_DATA_DIR environment variable. When it is unset (or empty) the
# original hard-coded location is used, so existing runs are unaffected.
base_path = os.environ.get("ESG_DATA_DIR") or (
    "/Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data"
)

# === File Names ===
mapping_file = os.path.join(base_path, "esg_master_mapping_pillar_updated1.csv")
biotech_file = os.path.join(base_path, "biotechnology_and_pharmaceuticals_esg_consolidated.csv")
semicon_file = os.path.join(base_path, "semiconductors_esg_consolidated.csv")

# === Load Files ===
mapping_df = pd.read_csv(mapping_file)
biotech_df = pd.read_csv(biotech_file)
semicon_df = pd.read_csv(semicon_file)

# === Merge and Enhance Data ===
# Stack the two industry files, then attach the mapping columns by
# (metric, industry). The left join keeps every observation row; rows with no
# matching mapping entry get NaN in the mapping columns.
combined_df = pd.concat([biotech_df, semicon_df], ignore_index=True)
enhanced_df = combined_df.merge(mapping_df, how="left",
                                left_on=["metric_name", "Industry"],
                                right_on=["Metric", "Industry"])

# === Ontology Schema ===
# Vocabulary declarations: five OWL classes, four object properties and four
# datatype properties, all in the EX namespace.
_class_names = ("ESGObservation", "Company", "Industry", "Metric", "Category")
_object_property_names = ("hasCompany", "hasIndustry", "hasMetric", "hasCategory")
_datatype_property_names = ("hasValue", "hasYear", "hasMetricName", "hasMetricDescription")

# EX[name] yields the same term as attribute access (EX.name).
ontology_schema = (
    [(EX[name], RDF.type, OWL.Class) for name in _class_names]
    + [(EX[name], RDF.type, OWL.ObjectProperty) for name in _object_property_names]
    + [(EX[name], RDF.type, OWL.DatatypeProperty) for name in _datatype_property_names]
)
for triple in ontology_schema:
    g.add(triple)

# === Add RDF Triples ===
# One ex:ESGObservation is emitted per row of enhanced_df, linked to its
# company, industry, metric and category and carrying the scaled value and
# year. Rows that cannot be converted are reported and skipped.
for row_idx, row in enhanced_df.iterrows():
    try:
        # Read and validate every field before touching the graph, so a bad
        # row is skipped as a whole instead of leaving a partial observation.
        company = row["company_name"]
        industry = row["Industry"]
        metric = row["metric_name"]
        raw_year = row["year"]
        raw_value = row["metric_value_scaled"]
        topic = row["Topic"]
        description = row["metric_description"]

        # These fields are formatted into f-strings or cast to float below.
        # A NaN would bypass safe_uri's null check (it becomes the text "nan")
        # or turn into a NaN literal, so reject the row up front.
        required = {
            "company_name": company,
            "Industry": industry,
            "metric_name": metric,
            "year": raw_year,
            "metric_value_scaled": raw_value,
        }
        missing = [name for name, cell in required.items() if pd.isnull(cell)]
        if missing:
            raise ValueError(
                f"row {row_idx}: missing required field(s): {', '.join(missing)}"
            )

        year = int(raw_year)
        value = float(raw_value)

        # Category comes from the mapping's Topic; rows without a mapping
        # match fall back to the metric description, and to the "Undefined"
        # sentinel (as safe_uri does for nulls) when that is missing too.
        if pd.notnull(topic):
            category = topic
        elif pd.notnull(description):
            category = description
        else:
            category = "Undefined"

        # URIs
        # NOTE(review): metric URIs are minted without a type prefix, unlike
        # company/industry/category; left as-is to keep existing URIs stable.
        obs_uri = safe_uri(f"{company}_{industry}_{year}_{metric}")
        company_uri = safe_uri(f"Company_{company}")
        industry_uri = safe_uri(f"Industry_{industry}")
        metric_uri = safe_uri(metric)
        category_uri = safe_uri(f"Category_{category}")

        # Observation and its links/values
        g.add((obs_uri, RDF.type, EX.ESGObservation))
        g.add((obs_uri, EX.hasCompany, company_uri))
        g.add((obs_uri, EX.hasIndustry, industry_uri))
        g.add((obs_uri, EX.hasMetric, metric_uri))
        g.add((obs_uri, EX.hasCategory, category_uri))
        g.add((obs_uri, EX.hasValue, Literal(value, datatype=XSD.float)))
        g.add((obs_uri, EX.hasYear, Literal(year, datatype=XSD.gYear)))

        # Typing of the linked resources (re-adding an existing triple is a
        # no-op, so repeating this for every row is harmless).
        g.add((company_uri, RDF.type, EX.Company))
        g.add((industry_uri, RDF.type, EX.Industry))
        g.add((metric_uri, RDF.type, EX.Metric))
        g.add((category_uri, RDF.type, EX.Category))
        g.add((metric_uri, EX.hasMetricName, Literal(metric)))
        # Only describe the metric when a description exists; a NaN here would
        # otherwise be stored as a numeric NaN literal.
        if pd.notnull(description):
            g.add((metric_uri, EX.hasMetricDescription, Literal(description)))
    except Exception as e:
        # Best-effort load: report the problem and move on to the next row.
        print(f"❌ Error with row: {e}")
        continue


In [12]:
# === Serialize RDF to TTL ===
# The Turtle file is written into the same directory the input CSVs were
# read from, and the destination is echoed for confirmation.
ttl_name = "industry_enhanced_with_topic.ttl"
output_file = os.path.join(base_path, ttl_name)
g.serialize(format="turtle", destination=output_file)
print(f"✅ RDF exported to: {output_file}")

✅ RDF exported to: /Users/sujanbharadwaj/Documents/Ontology_PCA_Project/Normalized_Data/industry_enhanced_with_topic.ttl
