In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from ydata_profiling import ProfileReport
import plotly.express as px

sys.path.append('../../src')
from dataloader import *
from prep_utils import *

%reload_ext autoreload
%autoreload 2

# Load the training frame through the project loader (star-imported above).
# NOTE(review): judging by the displayed columns, `decode_dummies` presumably
# collapses the one-hot Soil_Type / Wilderness_Area columns into single
# categorical columns and `add_geo_features` adds Climatic_Zone / Geologic_Zone
# -- confirm against `dataloader`.
df = load_train_df(PATH='../../data', decode_dummies=True, add_geo_features=True)

In [2]:
### THIS REMOVES DEPENDENCY ON INDEX ###
# (not using the index as a feature is sufficient, this fix is for displaying properly)
#
# Shuffle the rows and renumber the index 0..n-1 so that row position carries
# no information. The shuffle is seeded so that "Restart Kernel -> Run All"
# reproduces the same row order (the previous unseeded permutation gave a
# different order on every run), and it rebinds `df` instead of mutating the
# loaded frame in place.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# Separate the target column from the feature matrix.
X = df.drop(columns='Cover_Type')
y = df['Cover_Type']

In [4]:
# Show five randomly chosen rows of the raw feature matrix.
X.sample(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness_Area,Climatic_Zone,Geologic_Zone
13205,2478.0,248.0,3.0,511.0,122.0,864.0,214.0,241.0,165.0,1355.0,6,4,2,7
8688,2752.0,175.0,13.0,60.0,16.0,942.0,227.0,247.0,149.0,511.0,3,3,2,7
11949,2140.0,22.0,11.0,42.0,0.0,1315.0,213.0,217.0,139.0,1299.0,17,4,6,1
3751,2736.0,145.0,12.0,67.0,6.0,2467.0,237.0,238.0,130.0,2215.0,11,3,4,7
13538,2294.0,21.0,13.0,162.0,46.0,793.0,211.0,212.0,135.0,1048.0,11,4,4,7


This is where the dependency on the index is best displayed.

In [5]:
# Aspect is an angle, so it is encoded as a sine/cosine pair via the project
# helper. The resulting frame exposes `Aspect_sin` and `Aspect_cos`.
cols_cyclic = ['Aspect']
X = encode_sin_cos(X, cols=cols_cyclic)
X[['Aspect_sin', 'Aspect_cos']].head(n=3)

Unnamed: 0,Aspect_sin,Aspect_cos
0,-0.897928,0.440143
1,-0.587795,-0.80901
2,0.363199,0.931711


In [6]:
# Combine the horizontal and vertical offsets into one Euclidean distance.
dist_horizontal = X['Horizontal_Distance_To_Hydrology']
dist_vertical = X['Vertical_Distance_To_Hydrology']
X['Distance_To_Hydrology'] = np.sqrt(dist_horizontal**2 + dist_vertical**2)

# Spot-check three random rows, including the new column.
X.sample(n=3)

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness_Area,Climatic_Zone,Geologic_Zone,Aspect_sin,Aspect_cos,Distance_To_Hydrology
7333,2726.0,25.0,67.0,-15.0,150.0,247.0,189.0,57.0,2236.0,30,1,7,7,-0.923458,-0.383698,68.658576
855,2838.0,16.0,182.0,82.0,1473.0,234.0,206.0,99.0,1392.0,33,3,7,7,0.826829,-0.562454,199.619638
14006,3394.0,14.0,457.0,56.0,1622.0,183.0,227.0,184.0,698.0,39,3,8,7,-0.643561,-0.765395,460.418288


In [7]:
from sklearn.preprocessing import MinMaxScaler

# The three hillshade columns are min-max scaled.
cols_minmax = [f'Hillshade_{time_of_day}' for time_of_day in ('9am', 'Noon', '3pm')]

# Keep the fitted scaler around under `mm_scaler`; it is configured to return
# pandas objects from `transform`.
mm_scaler = MinMaxScaler()
mm_scaler.set_output(transform='pandas')

# Fit on the selected columns, then write the scaled values back into X.
mm_scaler.fit(X[cols_minmax])
X[cols_minmax] = mm_scaler.transform(X[cols_minmax])

In [8]:
# Distance features treated as skewed; they are passed to the project's
# `encode_log` helper.
# NOTE(review): the later preview shows these columns replaced by `<name>_log`
# versions -- confirm the exact transform in `prep_utils`.
cols_skewed = [
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Distance_To_Hydrology',
]
X = encode_log(X, cols_skewed)

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise elevation, slope and the log-encoded distance columns.
cols_scale = ['Elevation', 'Slope', *(f"{name}_log" for name in cols_skewed)]

# Keep the fitted scaler under `scaler`: fit first, then overwrite the columns
# with their standardised values.
scaler = StandardScaler()
scaler.fit(X[cols_scale])
X[cols_scale] = scaler.transform(X[cols_scale])

In [10]:
# The two component distances are dropped now that the combined
# Distance_To_Hydrology feature has been derived from them.
X = X.drop(
    columns=[
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Hydrology',
    ]
)

In [11]:
# Show three randomly chosen rows after scaling and column pruning.
X.sample(n=3)

Unnamed: 0,Elevation,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Soil_Type,Wilderness_Area,Climatic_Zone,Geologic_Zone,Aspect_sin,Aspect_cos,Horizontal_Distance_To_Roadways_log,Horizontal_Distance_To_Fire_Points_log,Distance_To_Hydrology_log
6453,-0.770055,0.40346,0.544554,0.787097,0.784861,10,4,4,7,-0.831791,-0.555088,-0.234184,-0.366044,0.325791
2318,-1.156694,1.458025,0.826733,0.432258,0.250996,10,4,4,7,-0.262375,0.964966,-1.133688,0.419874,0.652414
7558,-0.992015,1.106503,1.0,0.677419,0.239044,3,4,2,7,0.945435,-0.32581,-0.344633,0.248501,0.608716


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into indicator columns.
cols_ohe = ['Wilderness_Area', 'Climatic_Zone', 'Geologic_Zone', 'Soil_Type']

# Dense output returned as a DataFrame; the first level of each category is
# dropped.
oh_enc = OneHotEncoder(drop='first', sparse_output=False)
oh_enc.set_output(transform='pandas')

# Swap the categorical columns for their encoded counterparts, which are
# appended after the remaining features.
X_ohe_encoded = oh_enc.fit_transform(X[cols_ohe])
X = pd.concat([X.drop(columns=cols_ohe), X_ohe_encoded], axis=1)

In [15]:
import warnings

# Silence FutureWarning for the rest of the session so that report generation
# does not flood the cell output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Make sure the target directory exists before exporting: on a fresh checkout
# without `assets/` the export would otherwise fail after the whole (slow)
# profiling run has already completed.
report_path = '../../assets/report_prep.html'
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Profile the fully preprocessed feature matrix and export it as HTML.
report = ProfileReport(X, title='Pandas Profiling Report', explorative=True)
report.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]