In [2]:
!pip install ydata-profiling



In [3]:
from ydata_profiling import ProfileReport



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os, re, warnings, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)
sns.set_theme(context="notebook", style="whitegrid")

In [5]:
REPORT_DIR = "eda_report"
os.makedirs(REPORT_DIR, exist_ok=True)

def savefig(name):
    path = os.path.join(REPORT_DIR, f"{name}.png")
    plt.tight_layout()
    plt.savefig(path, dpi=140, bbox_inches="tight")
    print(f" Saved: {path}")

In [6]:
df = pd.read_csv("CO2 Emissions_Canada.csv")

In [7]:
def to_snake(name: str) -> str:
    name = name.strip()
    name = re.sub(r"[^\w\s]", " ", name)      # replace punctuation with space
    name = re.sub(r"\s+", "_", name)          # collapse spaces to underscores
    return name.lower()

df.columns = [to_snake(c) for c in df.columns]
print(" Cleaned columns:", list(df.columns))

 Cleaned columns: ['make', 'model', 'vehicle_class', 'engine_size_l_', 'cylinders', 'transmission', 'fuel_type', 'fuel_consumption_city_l_100_km_', 'fuel_consumption_hwy_l_100_km_', 'fuel_consumption_comb_l_100_km_', 'fuel_consumption_comb_mpg_', 'co2_emissions_g_km_']


In [8]:
print("\nShape:", df.shape)
print("\nDtypes:\n", df.dtypes)
print("\nMemory usage (MB):", round(df.memory_usage(deep=True).sum() / 1e6, 2))

# Save head/tail/info
df.head(10).to_csv(os.path.join(REPORT_DIR, "preview_head.csv"), index=False)
df.tail(10).to_csv(os.path.join(REPORT_DIR, "preview_tail.csv"), index=False)
print("Saved preview_head.csv and preview_tail.csv")


Shape: (7385, 12)

Dtypes:
 make                                object
model                               object
vehicle_class                       object
engine_size_l_                     float64
cylinders                            int64
transmission                        object
fuel_type                           object
fuel_consumption_city_l_100_km_    float64
fuel_consumption_hwy_l_100_km_     float64
fuel_consumption_comb_l_100_km_    float64
fuel_consumption_comb_mpg_           int64
co2_emissions_g_km_                  int64
dtype: object

Memory usage (MB): 2.47
Saved preview_head.csv and preview_tail.csv


In [9]:
for col in df.columns:
    if df[col].dtype == "object":
        # strip units and commas if they exist
        cleaned = (df[col]
                   .astype(str)
                   .str.replace(",", "", regex=False)
                   .str.replace(r"[^\d\.\-eE+]", "", regex=True))
        # convert where reasonable
        maybe_num = pd.to_numeric(cleaned, errors="coerce")
        # only adopt if we meaningfully converted more than we lost
        if maybe_num.notna().sum() >= max(10, int(0.3 * len(df))):
            df[col] = maybe_num

In [10]:
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
#cat_cols = [c for c in df.columns if c not in num_cols and df[c].dtype.name != "datetime64[ns]"]




In [11]:
max_hists = min(12, len(num_cols))
for col in num_cols[:max_hists]:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col].dropna(), bins=30, kde=True)
    plt.title(f"Distribution: {col}")
    savefig(f"dist_{col}")
    plt.close()

 Saved: eda_report\dist_engine_size_l_.png
 Saved: eda_report\dist_cylinders.png
 Saved: eda_report\dist_transmission.png
 Saved: eda_report\dist_fuel_consumption_city_l_100_km_.png
 Saved: eda_report\dist_fuel_consumption_hwy_l_100_km_.png
 Saved: eda_report\dist_fuel_consumption_comb_l_100_km_.png
 Saved: eda_report\dist_fuel_consumption_comb_mpg_.png
 Saved: eda_report\dist_co2_emissions_g_km_.png


In [12]:
if len(num_cols) >= 2:
    corr = df[num_cols].corr(numeric_only=True, method="pearson")
    corr.to_csv(os.path.join(REPORT_DIR, "correlation_pearson.csv"))
    plt.figure(figsize=(min(12, 0.6*len(num_cols)+4), min(10, 0.6*len(num_cols)+4)))
    sns.heatmap(corr, annot=False, cmap="viridis", center=0)
    plt.title("Correlation Heatmap (Pearson)")
    savefig("corr_heatmap")
    plt.close()
    print("Saved: correlation_pearson.csv & corr_heatmap.png")


 Saved: eda_report\corr_heatmap.png
Saved: correlation_pearson.csv & corr_heatmap.png


In [13]:
possible_targets = [
    "co2_emissions_g_km", "co2_emissions_g_per_km", "co2_emissions_gkm",
    "co2_emissions", "co2_emissions_g/_km", "co2_emissions(g/km)".replace("(", "_").replace(")", "").replace("/", "_").lower()
]
target_col = None
for cand in possible_targets:
    if cand in df.columns:
        target_col = cand
        break

if target_col:
    print(f"\nDetected target column: {target_col}")
    # scatter vs top correlated numeric features
    if len(num_cols) > 1:
        corrs = df[num_cols].corr(numeric_only=True)[target_col].dropna().abs().sort_values(ascending=False)
        for feat in [c for c in corrs.index if c != target_col][:6]:
            plt.figure(figsize=(6,4))
            sns.scatterplot(x=df[feat], y=df[target_col], alpha=0.5)
            plt.title(f"{target_col} vs {feat}")
            savefig(f"target_scatter_{target_col}_vs_{feat}")
            plt.close()

else:
    print("\nℹ No obvious CO2 target column found. Skipping target-aware plots.")




ℹ No obvious CO2 target column found. Skipping target-aware plots.


In [14]:
def data_dictionary(data: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for c in data.columns:
        dtype = str(data[c].dtype)
        nunique = data[c].nunique(dropna=True)
        missing = data[c].isna().sum()
        example = data[c].dropna().iloc[0] if data[c].notna().any() else None
        rows.append({"column": c, "dtype": dtype, "nunique": nunique,
                     "missing": missing, "example_value": example})
    return pd.DataFrame(rows).sort_values("column")

dd = data_dictionary(df)
dd.to_csv(os.path.join(REPORT_DIR, "data_dictionary.csv"), index=False)
print("Saved: data_dictionary.csv")

Saved: data_dictionary.csv


In [15]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, explorative=True, title="CO2 Emissions — Automated EDA")
html_path = os.path.join(REPORT_DIR, "profile_report.html")
profile.to_file(html_path)
print(f"Profiling report saved: {html_path}")


print("\nDone! Check the 'eda_report' folder for figures & summaries.")

100%|██████████| 12/12 [00:00<00:00, 347.17it/s]<00:00, 13.64it/s, Describe variable: co2_emissions_g_km_]   
Summarize dataset: 100%|██████████| 85/85 [00:07<00:00, 11.32it/s, Completed]                                                               
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.92s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 83.18it/s]

Profiling report saved: eda_report\profile_report.html

Done! Check the 'eda_report' folder for figures & summaries.



