In [1]:
import gradio as gr

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_validate

In [2]:
# Load the training data; later cells read its Id, Cover_Type,
# Soil_Type* and Wilderness_Area* columns.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Check for duplicate entries among the feature columns only
# (Id and Cover_Type are excluded from the comparison).
training_features = df.drop(columns=['Cover_Type', 'Id'])
has_duplicates = training_features.duplicated().any()
print('Has Duplicates' if has_duplicates else 'No Duplicates')

No Duplicates


In [4]:
# Check class distribution: count the rows carrying each Cover_Type label 1..7.
targets = df['Cover_Type']
for cover_type in range(1, 8):
    n_examples = (targets == cover_type).sum()
    print("Number of Cover Type ", cover_type, ": ", sep="")
    print(n_examples)

Number of Cover Type 1: 
2160
Number of Cover Type 2: 
2160
Number of Cover Type 3: 
2160
Number of Cover Type 4: 
2160
Number of Cover Type 5: 
2160
Number of Cover Type 6: 
2160
Number of Cover Type 7: 
2160


In [5]:
# Total number of soil-type flags set across the 40 Soil_Type columns.
# NOTE(review): a total equal to the row count is consistent with one soil
# type per example, but a grand total alone cannot rule out rows with zero or
# several flags set - confirm with a per-row sum if that matters.
soil_columns = ['Soil_Type' + str(n) for n in range(1, 41)]
my_count = sum((df[column] == 1).sum() for column in soil_columns)

print("Number of examples with a soil type:")
print(my_count)

# Same tally for the 4 Wilderness_Area columns (same caveat applies).
wilderness_columns = ['Wilderness_Area' + str(n) for n in range(1, 5)]
my_count = sum((df[column] == 1).sum() for column in wilderness_columns)

print("Number of examples with a wilderness type:")
print(my_count)

Number of examples with a soil type:
15120
Number of examples with a wilderness type:
15120


In [6]:
# Outliers before preprocessing: fit an IsolationForest on the raw feature
# columns and print the Id of every row it labels -1.
training_features = df.drop(columns=['Cover_Type', 'Id'])
isolation_forest = IsolationForest(random_state=0)
outlier_detector = pd.DataFrame(isolation_forest.fit_predict(training_features))
is_outlier = outlier_detector[0] == -1
print(df.loc[is_outlier, 'Id'])

2696    2697
Name: Id, dtype: int64


In [7]:
def forest_data_preprocessor(forest_data):
    """Split a forest-cover frame into engineered features and targets.

    The input frame is not modified. Three derived columns are appended and
    the columns they are built from are removed:

    * ``Distance_To_Hydrology`` - Euclidean norm of
      ``Horizontal_Distance_To_Hydrology`` and
      ``Vertical_Distance_To_Hydrology`` (both source columns are dropped).
    * ``Sine_Of_Aspect`` - sine of ``Aspect`` after a degrees-to-radians
      conversion (``Aspect`` is dropped).
    * ``Average_Hillshade`` - arithmetic mean of ``Hillshade_9am``,
      ``Hillshade_Noon`` and ``Hillshade_3pm`` (all three are dropped).

    Every other column except ``Id`` and ``Cover_Type`` is kept unchanged and
    in its original order, followed by the three derived columns.

    Parameters
    ----------
    forest_data : pandas.DataFrame
        Frame containing ``Id``, ``Cover_Type`` and the source columns
        listed above.

    Returns
    -------
    tuple
        ``(training_features, targets)`` where ``training_features`` is the
        engineered feature frame and ``targets`` is the ``Cover_Type``
        column. Both keep the index of ``forest_data``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    targets = forest_data['Cover_Type']

    # Drop ID and target columns
    training_features = forest_data.drop(['Cover_Type', 'Id'], axis=1)

    # Work on plain arrays: the derived values are assigned positionally
    # below, so they stay attached to the right rows whatever index the
    # input frame carries. Wrapping them in a fresh pd.Series would give them
    # a 0..n-1 index that pandas aligns against the frame's own index,
    # producing NaN or misplaced values for non-default indexes.
    horizontal = training_features['Horizontal_Distance_To_Hydrology'].to_numpy()
    vertical = training_features['Vertical_Distance_To_Hydrology'].to_numpy()
    distance_to_hydrology = np.sqrt(np.square(horizontal) + np.square(vertical))

    # Aspect is converted from degrees to radians before taking the sine.
    aspect_sine = np.sin(training_features['Aspect'].to_numpy() * np.pi / 180)

    average_hillshade = (
        training_features['Hillshade_9am'].to_numpy()
        + training_features['Hillshade_Noon'].to_numpy()
        + training_features['Hillshade_3pm'].to_numpy()
    ) / 3

    training_features = training_features.assign(
        Distance_To_Hydrology=distance_to_hydrology,
        Sine_Of_Aspect=aspect_sine,
        Average_Hillshade=average_hillshade,
    )

    # Remove the source columns now represented by the derived features.
    # Horizontal_Distance_To_Roadways and Horizontal_Distance_To_Fire_Points
    # are kept.
    training_features = training_features.drop(
        [
            'Horizontal_Distance_To_Hydrology',
            'Vertical_Distance_To_Hydrology',
            'Aspect',
            'Hillshade_9am',
            'Hillshade_Noon',
            'Hillshade_3pm',
        ],
        axis=1,
    )

    return training_features, targets

In [8]:
# Outliers after numerical feature preprocessing: refit the IsolationForest on
# the engineered features and print how many rows it labels -1
# (the recorded run reports none).
training_features, targets = forest_data_preprocessor(df)
isolation_forest = IsolationForest(random_state=0)
outlier_detector = pd.DataFrame(isolation_forest.fit_predict(training_features))
flagged_ids = df.loc[outlier_detector[0] == -1, 'Id']
print(flagged_ids.count())

0
