## Kaggle ML2
## Matteo A. D'Alessandro, Carlo A. Patti

In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from ydata_profiling import ProfileReport

%reload_ext autoreload
%autoreload 2

In [151]:
df = pd.read_csv('../data/train.csv').drop('Id', axis=1)

In [152]:
# import warnings
# warnings.filterwarnings('ignore', category=FutureWarning)

# profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# profile.to_file("../assets/profile_report.html")

Converting horizontal and vertical distances to hydrology to the Euclidean one

In [153]:
df['Distance_To_Hydrology'] = np.sqrt(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)
df.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'], axis=1, inplace=True)

In [154]:
# reverse one-hot encoding for soil type, in order to extract climatic and geologic zone information
soil_cols = [col for col in df.columns if col.startswith('Soil')]
df['soil_type'] = df[soil_cols].idxmax(axis=1).str.replace('Soil_Type','').astype(int)

In [155]:
# load soil number to elu code mapping (from kaggle data description file, formatted via ChatGPT)
elu_data_raw = pd.read_csv('..//data//num_to_elu.txt', sep=":", header=None)

In [156]:
elu_data_raw.head()

Unnamed: 0,0,1
0,1,"ELU 2702, Cathedral family - Rock outcrop com..."
1,2,"ELU 2703, Vanet - Ratake families complex, ve..."
2,3,"ELU 2704, Haploborolis - Rock outcrop complex..."
3,4,"ELU 2705, Ratake family - Rock outcrop comple..."
4,5,"ELU 2706, Vanet family - Rock outcrop complex..."



First Digit: Climatic Zone

- **1**: Lower Montane Dry
- **2**: Lower Montane
- **3**: Montane Dry
- **4**: Montane
- **5**: Montane Dry and Montane
- **6**: Montane and Subalpine
- **7**: Subalpine
- **8**: Alpine

Second Digit: Geologic Zones
- **1**: Alluvium
- **2**: Glacial
- **3**: Shale
- **4**: Sandstone
- **5**: Mixed Sedimentary
- **6**: Unspecified in the USFS ELU Survey
- **7**: Igneous and Metamorphic
- **8**: Volcanic

In [157]:
# extract ELU code only
elu_data_raw['ELU'] = elu_data_raw[1].str[5:9]

# compute climatic and geologic zone from ELU code
elu_data_raw['climatic_zone'] = elu_data_raw['ELU'].str[0]
elu_data_raw['geologic_zone'] = elu_data_raw['ELU'].str[1]

elu_data_raw.drop([1, 'ELU'], axis=1, inplace=True)
elu_data_raw.columns = ['soil_type', 'climatic_zone', 'geologic_zone']

In [158]:
elu_data_raw.head()

Unnamed: 0,soil_type,climatic_zone,geologic_zone
0,1,2,7
1,2,2,7
2,3,2,7
3,4,2,7
4,5,2,7


In [159]:
# perform left join to merge climatic and geologic zone to main dataframe
df = df.merge(elu_data_raw, on='soil_type', how='left')
df.drop(['soil_type'], axis=1, inplace=True)

In [160]:
df_soil_cols = df[soil_cols].copy()
df.drop(soil_cols, axis=1, inplace=True)

In [161]:
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Cover_Type,Distance_To_Hydrology,climatic_zone,geologic_zone
0,2881,130,22,1020,250,221,88,342,1,0,0,0,1,216.831732,7,7
1,3005,351,14,1371,194,215,159,842,0,0,1,0,1,242.528349,7,7
2,3226,63,14,1092,232,210,107,2018,1,0,0,0,1,618.003236,7,7
3,3298,317,8,752,198,233,174,1248,0,1,0,0,1,663.71756,7,2
4,3080,35,6,3705,219,227,144,2673,1,0,0,0,1,176.920886,7,7


In [162]:
df_soil_cols.drop('Soil_Type1', axis=1, inplace=True)
soil_cols.remove('Soil_Type1')
df_soil_cols.sample(5)

Unnamed: 0,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
8145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1776,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Wilderness Areas, dominant vegetation type (expected):

- **1:** Rawah Wilderness Area -- **(2, 5)**
- **2:** Neota Wilderness Area -- **(1)**
- **3:** Comanche Peak Wilderness Area -- **(2, 5)**
- **4:** Cache la Poudre Wilderness Area -- **(3 ,4, 6)**

In [163]:
# reverse one-hot encoding for wilderness area
wilderness_cols = [col for col in df.columns if col.startswith('Wilderness_Area')]
df['wilderness_area'] = df[wilderness_cols].idxmax(axis=1).str.replace('Wilderness_Area','').astype(int)

Demonstrate succesful decoding

In [164]:
df[
    wilderness_cols + ['wilderness_area']
    ].sample(3)

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,wilderness_area
1905,0,0,1,0,3
7206,0,0,0,1,4
6241,0,0,1,0,3


In [165]:
# plot cover type distribution by wilderness area
import plotly.express as px

fig = px.density_heatmap(
    df,
    x='Cover_Type',
    y='wilderness_area',
    title='Cover Type Distribution by Wilderness Area',
    labels={'wilderness_area': 'Wilderness Area', 'Cover_Type': 'Cover Type'},
    color_continuous_scale='Reds',
    histfunc='count',
    nbinsx=7,
    nbinsy=4,
    # aspect='auto'
)

fig.update_xaxes(tickvals=list(range(1, 8)))
fig.update_yaxes(tickvals=list(range(1, 5)))

fig.show()

In [166]:
# plot cover type distribution by wilderness area 

df_viz = df.groupby(['Cover_Type', 'wilderness_area']).size().reset_index(name='count')
fig = px.bar(df_viz, x="Cover_Type", y='wilderness_area', color="wilderness_area", barmode="group")
fig.update_layout(title='Cover Type Distribution by Wilderness Area', xaxis_title='Cover Type', yaxis_title='Count')

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_coloraxes(
    colorbar_dtick=1.0,
    colorbar_tickmode='array',
    colorbar_tickvals=[1, 2, 3, 4],
    )

fig.show()

In [167]:
df_wilderness_cols = df[wilderness_cols].copy().drop('Wilderness_Area1', axis=1)
df.drop(wilderness_cols + ["wilderness_area"], axis=1, inplace=True)
wilderness_cols.remove('Wilderness_Area1')

In [168]:
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Distance_To_Hydrology,climatic_zone,geologic_zone
0,2881,130,22,1020,250,221,88,342,1,216.831732,7,7
1,3005,351,14,1371,194,215,159,842,1,242.528349,7,7
2,3226,63,14,1092,232,210,107,2018,1,618.003236,7,7
3,3298,317,8,752,198,233,174,1248,1,663.71756,7,2
4,3080,35,6,3705,219,227,144,2673,1,176.920886,7,7


In [169]:
y = df['Cover_Type']
X = df.drop(['Cover_Type'], axis=1)
X = pd.concat([X, df_soil_cols, df_wilderness_cols], axis=1)

In [178]:
X = X.astype(float)
X[soil_cols] = X[soil_cols].astype('category')
X[wilderness_cols] = X[wilderness_cols].astype('category')
X['climatic_zone'] = X['climatic_zone'].astype('category')
X['geologic_zone'] = X['geologic_zone'].astype('category')

y = y.astype('category')

In [182]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [194]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cols_numeric = X.select_dtypes(include=['float']).columns
cols_to_ohe = ['climatic_zone', 'geologic_zone']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_numeric),
        ('cat', OneHotEncoder(), cols_to_ohe)
    ])

In [202]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=300, random_state=42)

pipe_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])

pipe_rf.fit(X_train, y_train)

y_pred = pipe_rf.predict(X_val)

In [203]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8373015873015873
              precision    recall  f1-score   support

           1       0.76      0.74      0.75       427
           2       0.77      0.66      0.71       464
           3       0.81      0.78      0.80       400
           4       0.91      0.98      0.94       436
           5       0.87      0.93      0.90       452
           6       0.80      0.81      0.81       415
           7       0.92      0.96      0.94       430

    accuracy                           0.84      3024
   macro avg       0.83      0.84      0.83      3024
weighted avg       0.83      0.84      0.83      3024



In [204]:
df_preds = pd.DataFrame({'y_true': y_val, 'y_pred': y_pred})
df_preds['correct_pred'] = df_preds['y_true'] == df_preds['y_pred']

In [205]:
df_preds[df_preds['y_true'] == 4].y_pred.value_counts()

y_pred
4    426
6      8
3      2
Name: count, dtype: int64