# Covertype v3 Exploration

Interactive exploration notebook for `ML-Project-2`. This notebook targets the OpenML `covertype` dataset, version `3`, and is configured for the root `.venv311` environment.

In [1]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif

# Locate the project root: when the kernel's working directory is the
# ``notebooks`` folder, the root is its parent; otherwise the working
# directory itself is used as the root.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

# Register the root on the import path exactly once so that re-running this
# cell does not pile up duplicate entries.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

from src.explore import (
    TARGET,
    build_feature_matrix,
    build_indicator_labels,
    get_feature_groups,
    load_covertype_v3,
)

# Notebook-wide display settings (the two calls are independent of each other).
# Show up to 60 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 60)
# Use seaborn's "whitegrid" theme for every figure created afterwards.
sns.set_theme(style="whitegrid")


In [2]:
# Load the dataset and split its columns into the three feature groups
# returned by the project helper.
df = load_covertype_v3()
continuous_cols, wilderness_cols, soil_cols = get_feature_groups(df)

# Derived label frame; the terrain cell further down reads its
# "Wilderness_Area" and "Soil_Type" columns.
# NOTE(review): presumably collapses the one-hot indicators into single
# categorical labels — confirm in src.explore.
derived = build_indicator_labels(df)

# Quick size summary: frame shape followed by the width of each feature group.
print(f"Shape: {df.shape}")
for group_label, group_columns in (
    ("Continuous features", continuous_cols),
    ("Wilderness indicators", wilderness_cols),
    ("Soil indicators", soil_cols),
):
    print(f"{group_label}: {len(group_columns)}")

df.head()

Shape: (581012, 55)
Continuous features: 10
Wilderness indicators: 4
Soil indicators: 40


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,class
0,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,0.472736,0.386111,0.136364,0.19184,0.307494,0.446817,0.92126,0.937008,0.531496,0.853339,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,0.368184,0.125,0.030303,0.10952,0.222222,0.054939,0.866142,0.92126,0.590551,0.860449,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [3]:
# Headline facts about the dataset, kept as one metric -> value mapping so
# that each label sits next to the expression that computes it.
overview_metrics = {
    "Rows": len(df),
    "Columns": len(df.columns),
    "Target column": TARGET,
    # Count distinct labels after casting to str.
    "Target classes": df[TARGET].astype(str).nunique(),
    "Duplicate rows": int(df.duplicated().sum()),
    "Missing cells": int(df.isna().sum().sum()),
    # deep=True includes the memory held by object-dtype values.
    "Memory usage (MB)": round(df.memory_usage(deep=True).sum() / 1024**2, 2),
}
overview = pd.DataFrame(
    {
        "Metric": list(overview_metrics.keys()),
        "Value": list(overview_metrics.values()),
    }
)

display(overview)

# Summary statistics for the continuous features, one feature per row.
continuous_summary = df[continuous_cols].describe().T
display(continuous_summary)


Unnamed: 0,Metric,Value
0,Rows,581012
1,Columns,55
2,Target column,class
3,Target classes,7
4,Duplicate rows,0
5,Missing cells,0
6,Memory usage (MB),69.26


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Elevation,581012.0,0.550458,0.140062,0.0,0.475238,0.568784,0.652326,1.0
Aspect,581012.0,0.43238,0.310871,0.0,0.161111,0.352778,0.722222,1.0
Slope,581012.0,0.213693,0.113458,0.0,0.136364,0.19697,0.272727,1.0
Horizontal_Distance_To_Hydrology,581012.0,0.192862,0.152147,0.0,0.077309,0.156049,0.274875,1.0
Vertical_Distance_To_Hydrology,581012.0,0.283487,0.075317,0.0,0.232558,0.262274,0.312661,1.0
Horizontal_Distance_To_Roadways,581012.0,0.330216,0.219089,0.0,0.155403,0.280596,0.467613,1.0
Hillshade_9am,581012.0,0.835221,0.105393,0.0,0.779528,0.858268,0.909449,1.0
Hillshade_Noon,581012.0,0.879208,0.07783,0.0,0.838583,0.889764,0.933071,1.0
Hillshade_3pm,581012.0,0.561135,0.150687,0.0,0.468504,0.562992,0.661417,1.0
Horizontal_Distance_To_Fire_Points,581012.0,0.276076,0.184608,0.0,0.142758,0.238394,0.3555,1.0


In [None]:
# Rows per target class. Labels are cast to str for display, so the sort key
# converts them back to int to get numeric (not lexicographic) ordering.
class_counts = (
    df[TARGET]
    .astype(str)
    .value_counts()
    .sort_index(key=lambda idx: idx.astype(int))
)
# Share of the full dataset held by each class, in percent (2 decimals).
class_share = class_counts.div(len(df)).mul(100).round(2)
display(
    pd.DataFrame(
        {
            "count": class_counts,
            "share_percent": class_share,
        }
    )
)

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn >= 0.13 deprecates `palette` without `hue`; map the class labels to
# `hue` as well and suppress the redundant legend to keep the same colouring
# (this matches the `hue=..., legend=False` form used by the boxplots).
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    hue=class_counts.index,
    palette="crest",
    legend=False,
    ax=ax,
)
ax.set_title("Covertype v3 class balance")
ax.set_xlabel(TARGET)
ax.set_ylabel("count")
plt.show()

In [None]:
# Feature matrix plus the mask that is forwarded to scikit-learn as
# `discrete_features`.
X, discrete_mask = build_feature_matrix(df)

# Score mutual information on a reproducible sub-sample of at most 50,000 rows
# (presumably to bound the runtime of the estimator); the target is aligned
# through the sampled index.
sample_size = min(len(X), 50000)
sample_idx = X.sample(sample_size, random_state=42).index
X_sample = X.loc[sample_idx]
y_sample = df.loc[sample_idx, TARGET].astype(str)

mi_values = mutual_info_classif(
    X_sample,
    y_sample,
    discrete_features=discrete_mask,
    random_state=42,
)
# Highest-scoring features first.
mi_scores = pd.Series(mi_values, index=X.columns).sort_values(ascending=False)

top_mi = mi_scores.head(15)
display(top_mi.rename("mutual_information").to_frame())

fig, ax = plt.subplots(figsize=(10, 5))
# Re-sort ascending so the strongest feature ends up at the top of the
# horizontal bar chart.
top_mi.sort_values().plot.barh(ax=ax, color="#2a9d8f")
ax.set(title="Top 15 features by mutual information", xlabel="mutual information")
plt.show()

In [None]:
# The four continuous features with the highest mutual information; the order
# of `mi_scores` (descending) is preserved.
top_continuous = list(
    filter(lambda name: name in continuous_cols, mi_scores.index)
)[:4]

# Reproducible sample of at most 20,000 rows, restricted to the plotted
# features and the target.
plot_columns = [*top_continuous, TARGET]
plot_df = df.sample(min(len(df), 20000), random_state=42)[plot_columns].copy()

# One boxplot panel per feature on a 2x2 grid, split by target class.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for panel, feature in zip(axes, top_continuous):
    sns.boxplot(
        data=plot_df,
        x=TARGET,
        y=feature,
        hue=TARGET,
        palette="Set2",
        legend=False,
        ax=panel,
    )
    panel.set_title(f"{feature} by class")
    panel.tick_params(axis="x", labelsize=8)
plt.tight_layout()
plt.show()

# Mean of every continuous feature per target class, with classes ordered
# numerically (labels are strings, so the sort key casts them back to int).
continuous_features = df[continuous_cols]
class_profiles = (
    continuous_features
    .join(df[TARGET].astype(str))
    .groupby(TARGET)
    .mean()
    .sort_index(key=lambda idx: idx.astype(int))
)

# Express each class mean as a z-score against the overall distribution:
# subtract the global mean and divide by the global population std (ddof=0).
# A zero std is replaced by 1 so constant columns do not divide by zero.
overall_mean = continuous_features.mean()
overall_std = continuous_features.std(ddof=0).replace(0, 1)
class_profiles = (class_profiles - overall_mean).div(overall_std)

# Explicit Figure/Axes, consistent with the other plots in this notebook,
# instead of relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(class_profiles, cmap="vlag", center=0, annot=True, fmt=".2f", ax=ax)
ax.set_title("Standardized class-wise mean profile")
plt.show()

In [None]:
# Derived terrain labels joined with the (string-cast) target.
terrain = derived.join(df[TARGET].astype(str))
target_labels = terrain[TARGET]

# Row-normalised crosstabs: each row shows the class mix (in %) within one
# wilderness area / soil type.
wilderness_share = pd.crosstab(
    terrain["Wilderness_Area"], target_labels, normalize="index"
).mul(100)

# Restrict the soil view to the 15 most frequent soil types, keeping the
# frequency order of `soil_counts`.
soil_counts = terrain["Soil_Type"].value_counts().head(15)
soil_share = (
    pd.crosstab(terrain["Soil_Type"], target_labels, normalize="index")
    .mul(100)
    .loc[soil_counts.index]
)

display(terrain["Wilderness_Area"].value_counts().rename("count").to_frame())
display(soil_counts.rename("count").to_frame())

# Two side-by-side heatmaps that share the same styling.
fig, axes = plt.subplots(1, 2, figsize=(16, 9))
heatmap_panels = (
    (wilderness_share, "Class mix within wilderness areas (%)", "Wilderness_Area"),
    (soil_share, "Class mix within top 15 soil types (%)", "Soil_Type"),
)
for panel, (share, panel_title, row_label) in zip(axes, heatmap_panels):
    sns.heatmap(share, annot=True, fmt=".1f", cmap="YlGnBu", ax=panel, cbar=False)
    panel.set_title(panel_title)
    panel.set_xlabel(TARGET)
    panel.set_ylabel(row_label)
plt.tight_layout()
plt.show()

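To also generate the static markdown report and saved figures, run the following from the directory that contains the `src` package (the interpreter path is relative to that directory):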

```powershell
..\.venv311\Scripts\python.exe -m src.explore
```