In [26]:
# 1. Imports & configuration
import os, sys
from pathlib import Path
import pandas as pd

sys.path.append(os.path.abspath('../'))
from scripts.config import RAW_DATA_PATH, PROCESSED_DATA_PATH  # paths defined in project config

# Show up to 100 columns whenever a DataFrame is displayed
pd.set_option('display.max_columns', 100)

# Wrap the locations imported from the project config in Path objects
RAW_PATH, PROC_PATH = Path(RAW_DATA_PATH), Path(PROCESSED_DATA_PATH)

# Create the output folder (and any missing parents) up front; no-op if it already exists
PROC_PATH.parent.mkdir(parents=True, exist_ok=True)

print(f"RAW -> {RAW_PATH}\nPROCESSED -> {PROC_PATH}")

RAW -> ..\data\raw\dataset.csv
PROCESSED -> ..\data\processed\esg_data_cleaned.csv


In [27]:
# 2. Load raw data
# Read the CSV only when it is present; otherwise stop with an explicit error
# that names the path that was checked.
if RAW_PATH.exists():
    raw_df = pd.read_csv(RAW_PATH)
else:
    raise FileNotFoundError(f"Raw dataset not found at {RAW_PATH}")

print(f"Raw data shape: {raw_df.shape}")
# Preview the first rows as the cell's rich output
raw_df.head()

Raw data shape: (503, 15)


Unnamed: 0,Symbol,Name,Address,Sector,Industry,Full Time Employees,Description,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,ENPH,"Enphase Energy, Inc.","47281 Bayside Parkway\nFremont, CA 94538\nUnit...",Technology,Solar,3157,"Enphase Energy, Inc., together with its subsid...",,,,,,,,
1,EMN,Eastman Chemical Company,"200 South Wilcox Drive\nKingsport, TN 37662\nU...",Basic Materials,Specialty Chemicals,14000,Eastman Chemical Company operates as a special...,25.3,12.8,6.6,5.8,Moderate Controversy Level,2.0,50th percentile,Medium
2,DPZ,Domino's Pizza Inc.,"30 Frank Lloyd Wright Drive\nAnn Arbor, MI 481...",Consumer Cyclical,Restaurants,6500,"Domino's Pizza, Inc., through its subsidiaries...",29.2,10.6,6.3,12.2,Moderate Controversy Level,2.0,66th percentile,Medium
3,DAY,"Dayforce, Inc.","3311 East Old Shakopee Road\nMinneapolis, MN 5...",Technology,Software - Application,9084,"Dayforce Inc., together with its subsidiaries,...",,,,,,,,
4,DVA,Davita Inc.,"2000 16th Street\nDenver, CO 80202\nUnited States",Healthcare,Medical Care Facilities,70000,DaVita Inc. provides kidney dialysis services ...,22.6,0.1,8.4,14.1,Moderate Controversy Level,2.0,38th percentile,Medium


In [28]:
# 3. Basic cleaning (placeholder logic - customize as needed)
# Every step below derives a new frame from `raw_df` instead of modifying it, so
# `raw_df` keeps the data exactly as loaded: the preview shown by the load cell
# stays accurate and this cell gives the same result however often it is re-run.

# Strip surrounding whitespace from column names (non-string labels are left as-is)
clean_df = raw_df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

# Example: drop completely empty columns
# `isna().all()` yields one boolean per column; use it as a mask over the labels.
empty_cols = clean_df.columns[clean_df.isna().all()].tolist()
clean_df = clean_df.drop(columns=empty_cols)  # empty list -> nothing is dropped

# Example: fill simple numeric missing values with median
# 'number' selects every numeric dtype rather than only int64/float64.
num_cols = clean_df.select_dtypes(include='number').columns
# Passing a Series to fillna fills each column with the value stored under its
# own label; columns without an entry (the non-numeric ones) are left untouched.
clean_df = clean_df.fillna(clean_df[num_cols].median())
# NOTE(review): this also imputes scores for rows that had no numeric scores at
# all (e.g. ENPH and DAY in the preview), while their text columns such as
# 'Controversy Level' stay missing. Consider flagging or dropping such rows
# before downstream analysis - confirm the intended treatment.

print(f"Columns removed: {empty_cols}")
print(f"Cleaned shape: {clean_df.shape}")
clean_df.head()

Columns removed: []
Cleaned shape: (503, 15)


Unnamed: 0,Symbol,Name,Address,Sector,Industry,Full Time Employees,Description,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,ENPH,"Enphase Energy, Inc.","47281 Bayside Parkway\nFremont, CA 94538\nUnit...",Technology,Solar,3157,"Enphase Energy, Inc., together with its subsid...",21.05,4.05,6.1,8.9,,2.0,,
1,EMN,Eastman Chemical Company,"200 South Wilcox Drive\nKingsport, TN 37662\nU...",Basic Materials,Specialty Chemicals,14000,Eastman Chemical Company operates as a special...,25.3,12.8,6.6,5.8,Moderate Controversy Level,2.0,50th percentile,Medium
2,DPZ,Domino's Pizza Inc.,"30 Frank Lloyd Wright Drive\nAnn Arbor, MI 481...",Consumer Cyclical,Restaurants,6500,"Domino's Pizza, Inc., through its subsidiaries...",29.2,10.6,6.3,12.2,Moderate Controversy Level,2.0,66th percentile,Medium
3,DAY,"Dayforce, Inc.","3311 East Old Shakopee Road\nMinneapolis, MN 5...",Technology,Software - Application,9084,"Dayforce Inc., together with its subsidiaries,...",21.05,4.05,6.1,8.9,,2.0,,
4,DVA,Davita Inc.,"2000 16th Street\nDenver, CO 80202\nUnited States",Healthcare,Medical Care Facilities,70000,DaVita Inc. provides kidney dialysis services ...,22.6,0.1,8.4,14.1,Moderate Controversy Level,2.0,38th percentile,Medium


In [29]:
# 4. Persist processed data
# index=False keeps the row index out of the file, so only the data columns are written.
clean_df.to_csv(path_or_buf=PROC_PATH, index=False)
print(f"✅ Saved processed dataset to {PROC_PATH}")

✅ Saved processed dataset to ..\data\processed\esg_data_cleaned.csv


In [30]:
# 5. Quick quality report
# Count nulls per column and order the columns from most to fewest missing values
null_counts = clean_df.isna().sum()
missing_summary = null_counts.sort_values(ascending=False)

print("Missing values (top 20):")
print(missing_summary.iloc[:20])

print("\nDescriptive stats:")
# include='all' summarises non-numeric columns too; transposing gives one row per
# column, and head(25) caps the display at the first 25 of them.
clean_df.describe(include='all').T.head(25)

Missing values (top 20):
Controversy Level         73
ESG Risk Percentile       73
ESG Risk Level            73
Full Time Employees        5
Address                    1
Sector                     1
Industry                   1
Description                1
Symbol                     0
Name                       0
Total ESG Risk score       0
Environment Risk Score     0
Governance Risk Score      0
Social Risk Score          0
Controversy Score          0
dtype: int64

Descriptive stats:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Symbol,503.0,503.0,ENPH,1.0,,,,,,,
Name,503.0,503.0,"Enphase Energy, Inc.",1.0,,,,,,,
Address,502.0,498.0,"1211 Avenue of the Americas\nNew York, NY 1003...",4.0,,,,,,,
Sector,502.0,11.0,Technology,76.0,,,,,,,
Industry,502.0,116.0,Utilities - Regulated Electric,23.0,,,,,,,
Full Time Employees,498.0,394.0,18000,10.0,,,,,,,
Description,502.0,499.0,Alphabet Inc. offers various products and plat...,2.0,,,,,,,
Total ESG Risk score,503.0,,,,21.463519,6.370885,7.1,16.9,21.05,25.15,41.7
Environment Risk Score,503.0,,,,5.494533,4.745167,0.0,2.0,4.05,8.0,25.0
Governance Risk Score,503.0,,,,6.634394,2.053097,3.0,5.4,6.1,7.4,19.4
