## Kaggle ML2
## Matteo A. D'Alessandro, Carlo A. Patti

For basic statistics and visualizations, see the `profile_report.html` file in `../assets`.

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from ydata_profiling import ProfileReport
import plotly.express as px

# Add the project's ../src directory to the import path so `dataloader` can be imported.
sys.path.append('../src')
# NOTE(review): star import — `load_train_df` used below presumably comes from here;
# confirm, and consider importing the needed names explicitly.
from dataloader import *

# (Re)load the autoreload extension in mode 2, which reloads modules before each
# cell executes, so edits to the project's modules are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

# Plotly template name passed as `template=` to the figure layouts below.
plots_theme = "plotly_dark"

In [30]:
# First load of the training data, with decode_dummies switched off.
# NOTE(review): presumably this leaves the one-hot (dummy) columns as they are — confirm in src/dataloader.
df = load_train_df(PATH='../data', decode_dummies=False)

Combine the horizontal and vertical distances to hydrology into a single Euclidean distance, $\sqrt{h^2 + v^2}$, and drop the two original columns.

In [31]:
# The two component columns that are merged into a single distance feature.
hydro_components = ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology']

# Euclidean norm of the horizontal and vertical components.
squared_sum = df[hydro_components[0]]**2 + df[hydro_components[1]]**2
df['Distance_To_Hydrology'] = np.sqrt(squared_sum)

# Keep a copy of all three columns for plotting before the components are removed from df.
hydrology = df[hydro_components + ['Distance_To_Hydrology']]
df.drop(columns=hydro_components, inplace=True)

In [32]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One histogram panel per hydrology-distance column, laid out left to right.
# Each entry pairs a column of `hydrology` with the label used both as the
# trace name and as the subplot title.
hydrology_panels = [
    ('Horizontal_Distance_To_Hydrology', 'Horizontal Distance'),
    ('Vertical_Distance_To_Hydrology', 'Vertical Distance'),
    ('Distance_To_Hydrology', 'Distance to Hydrology'),
]

fig = make_subplots(
    rows=1,
    cols=len(hydrology_panels),
    subplot_titles=tuple(label for _, label in hydrology_panels),
)

for position, (column, label) in enumerate(hydrology_panels, start=1):
    fig.add_trace(go.Histogram(x=hydrology[column], name=label), row=1, col=position)

# Kept as the cell's last expression: update_layout returns the figure,
# which the notebook then renders.
fig.update_layout(
    title_text="Distance to Hydrology",
    template=plots_theme,
    width=1000,
    height=400,
)

In [33]:
# Reload the training data from disk, replacing the frame that the hydrology cell modified.
# NOTE(review): decode_dummies / add_geo_features presumably yield the single Soil_Type,
# Wilderness_Area, Climatic_Zone and Geologic_Zone columns used below — confirm in src/dataloader.
df = load_train_df(PATH='../data', decode_dummies=True, add_geo_features=True)

In [47]:
# plot cover type distribution by wilderness area
import plotly.express as px

# Heatmap of row counts for every (Cover_Type, Wilderness_Area) combination.
heatmap_options = dict(
    x='Cover_Type',
    y='Wilderness_Area',
    histfunc='count',
    nbinsx=7,
    nbinsy=4,
    color_continuous_scale='Blues',
    labels={'Wilderness_Area': 'Wilderness Area', 'Cover_Type': 'Cover Type'},
    title='Cover Type Distribution by Wilderness Area',
)
fig = px.density_heatmap(df, **heatmap_options)

# Put a tick on each of the values 1-7 (x axis) and 1-4 (y axis), then size and theme the figure.
cover_type_ticks = list(range(1, 8))
wilderness_ticks = list(range(1, 5))
(
    fig.update_xaxes(tickvals=cover_type_ticks)
    .update_yaxes(tickvals=wilderness_ticks)
    .update_layout(height=400, width=1000, template=plots_theme)
)

fig.show()

In [44]:
# Stacked bar chart: number of rows per cover type, split by wilderness area.

# Count rows for every (Cover_Type, Wilderness_Area) pair, then spread the areas
# into columns so that each area becomes its own bar series. Pairs that never
# occur come out of the pivot as NaN.
df_viz = df.groupby(['Cover_Type', 'Wilderness_Area']).size().reset_index(name='count')
df_viz = df_viz.pivot(index='Cover_Type', columns='Wilderness_Area', values='count').reset_index()

# Build one trace per area column actually present in the pivot instead of
# hard-coding columns 1-4: an absent area no longer raises KeyError, and any
# additional area is picked up automatically. pivot() returns the columns
# sorted, so the trace order is unchanged for areas 1-4.
area_columns = [col for col in df_viz.columns if col != 'Cover_Type']

fig = go.Figure(data=[
    go.Bar(name=f'Wilderness Area {area}', x=df_viz['Cover_Type'], y=df_viz[area])
    for area in area_columns
])

fig.update_layout(
    barmode='stack',
    title='Cover Type Distribution by Wilderness Area',
    xaxis_title='Cover Type',
    yaxis_title='Count',
    template=plots_theme
)

fig.show()






In [45]:
# Preview the first five rows of the reloaded frame.
df.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type,Wilderness_Area,Climatic_Zone,Geologic_Zone
0,2881.0,130.0,22.0,210.0,54.0,1020.0,250.0,221.0,88.0,342.0,1,30,1,7,7
1,3005.0,351.0,14.0,242.0,-16.0,1371.0,194.0,215.0,159.0,842.0,1,24,3,7,7
2,3226.0,63.0,14.0,618.0,2.0,1092.0,232.0,210.0,107.0,2018.0,1,29,1,7,7
3,3298.0,317.0,8.0,661.0,60.0,752.0,198.0,233.0,174.0,1248.0,1,23,2,7,2
4,3080.0,35.0,6.0,175.0,26.0,3705.0,219.0,227.0,144.0,2673.0,1,24,1,7,7
