In [17]:
from pandas_profiling import ProfileReport

In [1]:
import os, re, warnings, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)
sns.set_theme(context="notebook", style="whitegrid")

In [None]:
REPORT_DIR = "eda_report"
os.makedirs(REPORT_DIR, exist_ok=True)

def savefig(name):
    path = os.path.join(REPORT_DIR, f"{name}.png")
    plt.tight_layout()
    plt.savefig(path, dpi=140, bbox_inches="tight")
    print(f" Saved: {path}")

In [20]:
df = pd.read_csv("CO2 Emissions_Canada.csv")

In [2]:
def to_snake(name: str) -> str:
    name = name.strip()
    name = re.sub(r"[^\w\s]", " ", name)      # replace punctuation with space
    name = re.sub(r"\s+", "_", name)          # collapse spaces to underscores
    return name.lower()

df.columns = [to_snake(c) for c in df.columns]
print(" Cleaned columns:", list(df.columns))

NameError: name 'df' is not defined

In [4]:
print("\nShape:", df.shape)
print("\nDtypes:\n", df.dtypes)
print("\nMemory usage (MB):", round(df.memory_usage(deep=True).sum() / 1e6, 2))

# Save head/tail/info
df.head(10).to_csv(os.path.join(REPORT_DIR, "preview_head.csv"), index=False)
df.tail(10).to_csv(os.path.join(REPORT_DIR, "preview_tail.csv"), index=False)
print("Saved preview_head.csv and preview_tail.csv")






















NameError: name 'df' is not defined

In [24]:
for col in df.columns:
    if df[col].dtype == "object":
        # strip units and commas if they exist
        cleaned = (df[col]
                   .astype(str)
                   .str.replace(",", "", regex=False)
                   .str.replace(r"[^\d\.\-eE+]", "", regex=True))
        # convert where reasonable
        maybe_num = pd.to_numeric(cleaned, errors="coerce")
        # only adopt if we meaningfully converted more than we lost
        if maybe_num.notna().sum() >= max(10, int(0.3 * len(df))):
            df[col] = maybe_num

In [25]:
for col in df.columns:
    if df[col].dtype == "object":
        try:
            parsed = pd.to_datetime(df[col], errors="raise", infer_datetime_format=True)
            # adopt if sufficient success
            if parsed.notna().mean() > 0.8:
                df[col] = pd.to_datetime(df[col], errors="coerce")
        except Exception:
            pass

print("\n Dtypes after coercion:\n", df.dtypes)


 Dtypes after coercion:
 make                                object
model                               object
vehicle_class                       object
engine_size_l_                     float64
cylinders                            int64
transmission                       float64
fuel_type                           object
fuel_consumption_city_l_100_km_    float64
fuel_consumption_hwy_l_100_km_     float64
fuel_consumption_comb_l_100_km_    float64
fuel_consumption_comb_mpg_           int64
co2_emissions_g_km_                  int64
dtype: object


In [26]:
def missing_table(data: pd.DataFrame) -> pd.DataFrame:
    mis = data.isna().sum()
    pct = 100 * mis / len(data)
    out = (pd.DataFrame({"missing": mis, "missing_%": pct})
           .loc[mis.sort_values(ascending=False).index])
    return out[out["missing"] > 0]

missing_summary = missing_table(df)
missing_summary.to_csv(os.path.join(REPORT_DIR, "missing_summary.csv"))
print("\n Missing values (top):\n", missing_summary.head(20))
print("Saved: missing_summary.csv")

dup_count = df.duplicated().sum()
print(f"\n Duplicate rows: {dup_count}")
if dup_count:
    df_nodup = df.drop_duplicates()
else:
    df_nodup = df



 Missing values (top):
               missing  missing_%
transmission      295   3.994584
Saved: missing_summary.csv

 Duplicate rows: 1140


In [27]:
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in df.columns if c not in num_cols and df[c].dtype.name != "datetime64[ns]"]

desc_num = df[num_cols].describe().T
desc_cat = df[cat_cols].describe().T if cat_cols else pd.DataFrame()

desc_num.to_csv(os.path.join(REPORT_DIR, "describe_numeric.csv"))
if not desc_cat.empty:
    desc_cat.to_csv(os.path.join(REPORT_DIR, "describe_categorical.csv"))
print("Saved: describe_numeric.csv", "(and describe_categorical.csv)" if not desc_cat.empty else "")



Saved: describe_numeric.csv (and describe_categorical.csv)


In [29]:
max_hists = min(12, len(num_cols))
for col in num_cols[:max_hists]:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col].dropna(), bins=30, kde=True)
    plt.title(f"Distribution: {col}")
    savefig(f"dist_{col}")
    plt.close()

📁 Saved: eda_report\dist_engine_size_l_.png
📁 Saved: eda_report\dist_cylinders.png
📁 Saved: eda_report\dist_transmission.png
📁 Saved: eda_report\dist_fuel_consumption_city_l_100_km_.png
📁 Saved: eda_report\dist_fuel_consumption_hwy_l_100_km_.png
📁 Saved: eda_report\dist_fuel_consumption_comb_l_100_km_.png
📁 Saved: eda_report\dist_fuel_consumption_comb_mpg_.png
📁 Saved: eda_report\dist_co2_emissions_g_km_.png


In [30]:
for col in cat_cols[:8]:
    plt.figure(figsize=(7,4))
    vc = df[col].astype(str).value_counts().head(15)
    sns.barplot(x=vc.values, y=vc.index)
    plt.title(f"Top categories: {col}")
    plt.xlabel("count"); plt.ylabel(col)
    savefig(f"bar_{col}")
    plt.close()


📁 Saved: eda_report\bar_make.png
📁 Saved: eda_report\bar_model.png
📁 Saved: eda_report\bar_vehicle_class.png
📁 Saved: eda_report\bar_fuel_type.png


In [31]:
if len(num_cols) >= 2:
    corr = df[num_cols].corr(numeric_only=True, method="pearson")
    corr.to_csv(os.path.join(REPORT_DIR, "correlation_pearson.csv"))
    plt.figure(figsize=(min(12, 0.6*len(num_cols)+4), min(10, 0.6*len(num_cols)+4)))
    sns.heatmap(corr, annot=False, cmap="viridis", center=0)
    plt.title("Correlation Heatmap (Pearson)")
    savefig("corr_heatmap")
    plt.close()
    print("Saved: correlation_pearson.csv & corr_heatmap.png")


📁 Saved: eda_report\corr_heatmap.png
Saved: correlation_pearson.csv & corr_heatmap.png


In [33]:
def iqr_outlier_counts(s: pd.Series):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    if iqr == 0 or pd.isna(iqr):
        return 0
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    return int(((s < lo) | (s > hi)).sum())

outlier_summary = pd.DataFrame({
    "outliers_iqr": {c: iqr_outlier_counts(df[c].dropna()) for c in num_cols},
    "non_null": {c: int(df[c].notna().sum()) for c in num_cols},
})
outlier_summary["outlier_%"] = (100 * outlier_summary["outliers_iqr"] / outlier_summary["non_null"].clip(lower=1))
outlier_summary = outlier_summary.sort_values("outlier_%", ascending=False)
outlier_summary.to_csv(os.path.join(REPORT_DIR, "outliers_iqr_summary.csv"))
print("\nOutlier columns (top 10):\n", outlier_summary.head(10))
print(" Saved: outliers_iqr_summary.csv")


Outlier columns (top 10):
                                  outliers_iqr  non_null  outlier_%
fuel_consumption_hwy_l_100_km_            208      7385   2.816520
cylinders                                 196      7385   2.654028
fuel_consumption_comb_l_100_km_           142      7385   1.922817
engine_size_l_                            137      7385   1.855112
fuel_consumption_city_l_100_km_           132      7385   1.787407
fuel_consumption_comb_mpg_                114      7385   1.543670
co2_emissions_g_km_                        80      7385   1.083277
transmission                                0      7090   0.000000
 Saved: outliers_iqr_summary.csv


In [34]:
possible_targets = [
    "co2_emissions_g_km", "co2_emissions_g_per_km", "co2_emissions_gkm",
    "co2_emissions", "co2_emissions_g/_km", "co2_emissions(g/km)".replace("(", "_").replace(")", "").replace("/", "_").lower()
]
target_col = None
for cand in possible_targets:
    if cand in df.columns:
        target_col = cand
        break

if target_col:
    print(f"\nDetected target column: {target_col}")
    # scatter vs top correlated numeric features
    if len(num_cols) > 1:
        corrs = df[num_cols].corr(numeric_only=True)[target_col].dropna().abs().sort_values(ascending=False)
        for feat in [c for c in corrs.index if c != target_col][:6]:
            plt.figure(figsize=(6,4))
            sns.scatterplot(x=df[feat], y=df[target_col], alpha=0.5)
            plt.title(f"{target_col} vs {feat}")
            savefig(f"target_scatter_{target_col}_vs_{feat}")
            plt.close()

    # boxplots by top categoricals
    for col in cat_cols[:5]:
        top = df[col].value_counts().index[:8]
        plt.figure(figsize=(8,4))
        sns.boxplot(x=df[col].where(df[col].isin(top)), y=df[target_col])
        plt.title(f"{target_col} by {col} (top cats)")
        plt.xticks(rotation=30, ha="right")
        savefig(f"target_box_{target_col}_by_{col}")
        plt.close()
else:
    print("\nℹ No obvious CO2 target column found. Skipping target-aware plots.")




ℹ No obvious CO2 target column found. Skipping target-aware plots.


In [None]:
def data_dictionary(data: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for c in data.columns:
        dtype = str(data[c].dtype)
        nunique = data[c].nunique(dropna=True)
        missing = data[c].isna().sum()
        example = data[c].dropna().iloc[0] if data[c].notna().any() else None
        rows.append({"column": c, "dtype": dtype, "nunique": nunique,
                     "missing": missing, "example_value": example})
    return pd.DataFrame(rows).sort_values("column")

dd = data_dictionary(df)
dd.to_csv(os.path.join(REPORT_DIR, "data_dictionary.csv"), index=False)
print("Saved: data_dictionary.csv")

📁 Saved: data_dictionary.csv


In [37]:
try:
    from ydata_profiling import ProfileReport
    profile = ProfileReport(df, explorative=True, title="CO2 Emissions — Automated EDA")
    html_path = os.path.join(REPORT_DIR, "profile_report.html")
    profile.to_file(html_path)
    print(f"Profiling report saved: {html_path}")
except Exception as e:
    print("ydata_profiling not available or failed. Skipping HTML profile.")
    print("   Reason:", e)

print("\nDone! Check the 'eda_report' folder for figures & summaries.")

100%|██████████| 12/12 [00:00<00:00, 427.35it/s]00:01, 13.17it/s, Describe variable: co2_emissions_g_km_]    
Summarize dataset: 100%|██████████| 85/85 [00:05<00:00, 16.06it/s, Completed]                                                               
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 57.86it/s]

Profiling report saved: eda_report\profile_report.html

Done! Check the 'eda_report' folder for figures & summaries.



