In [4]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# GraphDB endpoint
sparql_endpoint = "http://MacBookAir.modem:7200/repositories/ESGDataset"

def query_esg_data_by_topic(topic_keyword, industry_name, data_type=None):
    sparql = SPARQLWrapper(sparql_endpoint)

    data_type_filter = f'FILTER(LCASE(STR(?data_type)) = "{data_type.lower()}")' if data_type else ""

    query = f"""
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?pillar ?industry ?data_type
    WHERE {{
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasPillar ?pillar ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), "{topic_keyword.lower()}"))
      FILTER(CONTAINS(LCASE(STR(?industry)), "{industry_name.lower()}"))
      {data_type_filter}
    }}
    """

    print("Running SPARQL query...")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        print("Query executed successfully.")
    except Exception as e:
        print("SPARQL Query Failed:", e)
        return pd.DataFrame()

    rows = []
    for result in results["results"]["bindings"]:
        try:
            row = {
                "company": result["company"]["value"].rsplit('/', 1)[-1],
                "year": int(result["year"]["value"]),
                "metric": result["metric"]["value"].rsplit('/', 1)[-1],
                "value": float(result["value"]["value"]),
                "category": result["category"]["value"].rsplit('/', 1)[-1],
                "pillar": result["pillar"]["value"].rsplit('/', 1)[-1],
                "industry": result["industry"]["value"].rsplit('/', 1)[-1],
                "data_type": result["data_type"]["value"]
            }
            rows.append(row)
        except Exception as e:
            print("Error processing row:", e)
            print(result)

    df = pd.DataFrame(rows)
    print(f"DataFrame shape: {df.shape}")
    return df

In [5]:
df_avix = query_esg_data_by_topic(topic_keyword="", industry_name="semiconductors", data_type="Quantitative")
df_avix = df_avix[df_avix["company"].str.lower().str.contains("avix")]

# Show first few rows
print(df_avix[["company", "year", "metric", "pillar", "category", "value"]].head())

Running SPARQL query...
SPARQL Query Failed: <urlopen error [Errno 8] nodename nor servname provided, or not known>


KeyError: 'company'

In [2]:
# üî¨ Test query: get GHG emissions for semiconductors (quantitative only)
df_test = query_esg_data_by_topic(
    topic_keyword="ghg",
    industry_name="semiconductor",
    data_type="quantitative"
)

print("\nüîç Sample results:")
print(df_test.head())

# ‚ûï Optional: Pivot for PCA
if not df_test.empty:
    df_pivot = df_test.pivot_table(
        index=["company", "year"],
        columns="metric",
        values="value"
    ).reset_index()
    print("\nüìä Pivoted data for PCA:")
    print(df_pivot.head())

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), "ghg"))
      FILTER(CONTAINS(LCASE(STR(?industry)), "semiconductor"))
      FILTER(LCASE(STR(?data_type)) = "quantitative")
    }
    
‚úÖ Query executed successfully.
üì¶ DataFrame shape: (15229, 7)

üîç Sample results:
                              company  year         metric     value  \
0  Company_3Dfamily_Technology_Co_Ltd  2016  GHG_Emissions  0.005889   
1  Company_3Dfamily_Technology_Co_Ltd  2016  GHG_Emissions  0.091220   
2  Company_3Dfamily_Technol

In [6]:
query_esg_data_by_topic("ghg", "semiconductors", data_type="quantitative")

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), "ghg"))
      FILTER(CONTAINS(LCASE(STR(?industry)), "semiconductors"))
      FILTER(LCASE(STR(?data_type)) = "quantitative")
    }
    
‚úÖ Query executed successfully.
üì¶ DataFrame shape: (15229, 7)


Unnamed: 0,company,year,metric,value,category,industry,data_type
0,Company_3Dfamily_Technology_Co_Ltd,2016,GHG_Emissions,0.005889,Category_GHG_Emissions_Scope_1,Industry_Semiconductors,quantitative
1,Company_3Dfamily_Technology_Co_Ltd,2016,GHG_Emissions,0.091220,Category_GHG_Emissions_Scope_1,Industry_Semiconductors,quantitative
2,Company_3Dfamily_Technology_Co_Ltd,2016,GHG_Emissions,0.005889,Category_GHG_Emissions_Scope_2,Industry_Semiconductors,quantitative
3,Company_3Dfamily_Technology_Co_Ltd,2016,GHG_Emissions,0.091220,Category_GHG_Emissions_Scope_2,Industry_Semiconductors,quantitative
4,Company_3Dfamily_Technology_Co_Ltd,2017,GHG_Emissions,0.007325,Category_GHG_Emissions_Scope_1,Industry_Semiconductors,quantitative
...,...,...,...,...,...,...,...
15224,Company_p-ban_com_Corp,2022,GHG_Emissions,0.000080,Category_GHG_Emissions_Scope_1,Industry_Semiconductors,quantitative
15225,Company_p-ban_com_Corp,2022,GHG_Emissions,0.087721,Category_GHG_Emissions_Scope_1,Industry_Semiconductors,quantitative
15226,Company_p-ban_com_Corp,2022,GHG_Emissions,0.000080,Category_GHG_Emissions_Scope_2,Industry_Semiconductors,quantitative
15227,Company_p-ban_com_Corp,2022,GHG_Emissions,0.087721,Category_GHG_Emissions_Scope_2,Industry_Semiconductors,quantitative


In [7]:
print(df_test["metric"].unique())

['GHG_Emissions']


In [8]:
print(df_test.isna().sum())

company      0
year         0
metric       0
value        0
category     0
industry     0
data_type    0
dtype: int64


In [9]:
df_semiconductor = query_esg_data_by_topic("", "semiconductor", data_type="quantitative")
df_biopharma = query_esg_data_by_topic("", "biotechnology", data_type="quantitative")

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), ""))
      FILTER(CONTAINS(LCASE(STR(?industry)), "semiconductor"))
      FILTER(LCASE(STR(?data_type)) = "quantitative")
    }
    
‚úÖ Query executed successfully.
üì¶ DataFrame shape: (38234, 7)
üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?co

In [19]:
# üß± Pivot to wide matrix
df_pivot_pharma = df_biopharma.pivot_table(
    index=["company", "year"],  # keep year as part of index
    columns="metric",
    values="value"
).reset_index()

# üß≠ Sort by year (ascending) and company name (optional)
df_pivot_pharma = df_pivot_pharma.sort_values(by=["company", "year"]).reset_index(drop=True)

In [20]:
# üß± Pivot to wide matrix
df_pivot_semi = df_semiconductor.pivot_table(
    index=["company", "year"],  # keep year as part of index
    columns="metric",
    values="value"
).reset_index()

# üß≠ Sort by year (ascending) and company name (optional)
df_pivot_semi = df_pivot_semi.sort_values(by=["company", "year"]).reset_index(drop=True)

In [11]:
df_avix = query_esg_data_by_topic("ghg", "semiconductor", data_type="quantitative")
print(df_avix.columns)
print(df_avix.shape)
print(df_avix.head())

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?pillar ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasPillar ?pillar ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), "ghg"))
      FILTER(CONTAINS(LCASE(STR(?industry)), "semiconductor"))
      FILTER(LCASE(STR(?data_type)) = "quantitative")
    }
    
Query executed successfully.
DataFrame shape: (0, 0)
RangeIndex(start=0, stop=0, step=1)
(0, 0)
Empty DataFrame
Columns: []
Index: []


In [13]:
df_avix_all = query_esg_data_by_topic(topic_keyword="", industry_name="")

# Now filter by company only
df_avix_all = df_avix_all[df_avix_all["company"].str.lower().str.contains("avix")]

# Show what we have
print("üîç AVIX rows in dataset:", df_avix_all.shape)
print(df_avix_all[["company", "year", "metric", "pillar", "category", "value"]].head())

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?pillar ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasPillar ?pillar ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), ""))
      FILTER(CONTAINS(LCASE(STR(?industry)), ""))
      
    }
    
Query executed successfully.
DataFrame shape: (0, 0)


KeyError: 'company'

In [12]:
df_avix = query_esg_data_by_topic("", "semiconductor", data_type="quantitative")
df_avix = df_avix[df_avix["company"].str.lower().str.contains("avix")]
print(df_avix[["company", "year", "metric", "pillar", "category", "value"]].head())

üì§ Running SPARQL query:
 
    PREFIX ex: <http://example.org/esg/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?company ?year ?metric ?value ?category ?pillar ?industry ?data_type
    WHERE {
      ?obs a ex:ESGObservation ;
           ex:hasCompany ?company ;
           ex:hasYear ?year ;
           ex:hasMetric ?metric ;
           ex:hasCategory ?category ;
           ex:hasPillar ?pillar ;
           ex:hasIndustry ?industry ;
           ex:hasValue ?value ;
           ex:hasDataType ?data_type .

      FILTER(CONTAINS(LCASE(STR(?category)), ""))
      FILTER(CONTAINS(LCASE(STR(?industry)), "semiconductor"))
      FILTER(LCASE(STR(?data_type)) = "quantitative")
    }
    
Query executed successfully.
DataFrame shape: (0, 0)


KeyError: 'company'