# Data Exploration Demo Notebook
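This demo notebook explores the Covertype dataset, loaded from OpenML via scikit-learn: it inspects the dataset's shape and columns and builds a summary overview table.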

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

In [8]:
# Load the Covertype dataset (OpenML, version 3) as a single pandas DataFrame.
# as_frame=True is requested explicitly because the `.frame` attribute is only
# guaranteed to be populated in that mode; with as_frame="auto" the conversion
# is skipped for datasets stored in a sparse format, and the `.shape` access
# below would then fail with an opaque AttributeError on None.
dataset = fetch_openml('covertype', version=3, as_frame=True).frame
print("Dataset shape:", dataset.shape)
print("Dataset columns:", dataset.columns)
# Use rich display for the preview so the rows render as a table instead of
# line-wrapped plain text.
print("First 5 rows of the dataset:")
display(dataset.head())

Dataset shape: (581012, 55)
Dataset columns: Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soi

In [16]:
# Build a two-column (Metric / Value) summary of the loaded dataset:
# size, dtype split, target column, memory footprint and basic data quality.
n_rows, n_cols = dataset.shape

# Split the columns by dtype; everything that is not numeric is counted as
# categorical in the summary below.
numeric = dataset.select_dtypes(include="number")
categorical = dataset.select_dtypes(exclude="number")

# Total memory footprint (deep=True), converted from bytes to MB.
mem_mb = dataset.memory_usage(deep=True).sum() / 1024**2

# The last column of the frame is treated as the target.
target_column = dataset.columns[-1]

# One (label, value) pair per summary line, in display order.
overview_rows = [
    ("Rows", n_rows),
    ("Columns", n_cols),
    ("Numeric columns", numeric.shape[1]),
    ("Categorical columns", categorical.shape[1]),
    ("Target column", target_column),
    ("Target values", dataset[target_column].nunique()),
    ("Memory usage (MB)", round(mem_mb, 2)),
    ("Duplicate rows", int(dataset.duplicated().sum())),
    ("Missing values", int(dataset.isnull().sum().sum())),
]
metric_labels, metric_values = zip(*overview_rows)

overview = pd.DataFrame(
    {"Metric": list(metric_labels), "Value": list(metric_values)}
)

display(overview)

Unnamed: 0,Metric,Value
0,Rows,581012
1,Columns,55
2,Numeric columns,10
3,Categorical columns,45
4,Target column,class
5,Target values,7
6,Memory usage (MB),69.26
7,Duplicate rows,0
8,Missing values,0
