In [1]:
import gradio as gr

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_validate

In [2]:
def forest_data_preprocessor(forest_data):
    """Build the engineered feature frame and the target column from raw training data.

    Parameters
    ----------
    forest_data : pandas.DataFrame
        Raw training frame. Must contain 'Id', 'Cover_Type', 'Aspect',
        'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
        'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm' and
        'Soil_Type1' .. 'Soil_Type40'. Any other column is passed through
        unchanged. The input frame itself is not modified.

    Returns
    -------
    (training_features, targets)
        training_features : DataFrame with 'Id' and 'Cover_Type' removed, the
        hydrology distances replaced by 'Distance_To_Hydrology', 'Aspect'
        replaced by 'Sine_Of_Aspect', the three hillshade columns replaced by
        'Average_Hillshade', and the 40 soil indicators replaced by
        'Soil_Group1' .. 'Soil_Group4'.
        targets : the 'Cover_Type' column of forest_data.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    targets = forest_data['Cover_Type']

    # drop() returns a new frame, so the caller's DataFrame is never mutated.
    training_features = forest_data.drop(['Cover_Type', 'Id'], axis=1)

    # Every derived column below is assigned as a plain NumPy array, i.e. by
    # position. Wrapping the values in a fresh pd.Series would give them a
    # 0..n-1 index and pandas would align on labels, yielding NaN / shifted
    # values whenever the input frame does not have the default RangeIndex.

    # Combine horizontal and vertical hydrology distances into one Euclidean distance.
    horizontal = np.asarray(training_features['Horizontal_Distance_To_Hydrology'])
    vertical = np.asarray(training_features['Vertical_Distance_To_Hydrology'])
    training_features['Distance_To_Hydrology'] = np.sqrt(np.square(horizontal) + np.square(vertical))
    training_features = training_features.drop(
        ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'], axis=1)

    # Sine of Aspect (Aspect is converted from degrees to radians first).
    aspect = np.asarray(training_features['Aspect'])
    training_features['Sine_Of_Aspect'] = np.sin(aspect * np.pi / 180)
    training_features = training_features.drop(['Aspect'], axis=1)

    # Mean of the three hillshade readings.
    hillshade_columns = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    shade_9am, shade_noon, shade_3pm = (np.asarray(training_features[name]) for name in hillshade_columns)
    training_features['Average_Hillshade'] = (shade_9am + shade_noon + shade_3pm) / 3
    training_features = training_features.drop(hillshade_columns, axis=1)

    # Collapse the 40 one-hot soil indicators into four group indicators.
    # Soil_Type15 was previously missing from every group, so rows of that soil
    # type ended up with all four Soil_Group columns equal to 0.
    soil_groups = [
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
        [10, 11, 12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
        [34, 35, 36, 37, 38, 39, 40],
    ]
    for group_number, members in enumerate(soil_groups, start=1):
        member_columns = ['Soil_Type' + str(j) for j in members]
        training_features['Soil_Group' + str(group_number)] = (
            np.asarray(training_features[member_columns]).sum(axis=1))

    soil_types = ['Soil_Type' + str(i) for i in range(1, 41)]
    training_features = training_features.drop(soil_types, axis=1)

    return training_features, targets


In [3]:
def test_data_preprocessor(forest_data):
    #Drop ID column
    features = forest_data.drop(['Id'], axis=1)
    
    #Hydrology distance euclidean
    water_dist = np.asarray([features['Horizontal_Distance_To_Hydrology'],features['Vertical_Distance_To_Hydrology']])
    water_euclidean_dist = np.sqrt(np.square(water_dist[0]) + np.square(water_dist[1]))

    features['Distance_To_Hydrology'] = pd.Series(water_euclidean_dist)
    features = features.drop(['Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology'], axis=1)

    #Sine of Aspect
    aspect = np.asarray(features['Aspect'])
    aspect_sine = np.sin(aspect * np.pi / 180)

    features['Sine_Of_Aspect'] = pd.Series(aspect_sine)
    
    features = features.drop(['Aspect'], axis=1)
    
    #Average Hillshade
    avg_hillshade = np.asarray([features['Hillshade_9am'],features['Hillshade_Noon'],features['Hillshade_3pm']])
    avg_hillshade = (avg_hillshade[0] + avg_hillshade[1] + avg_hillshade[2]) / 3
    
    features['Average_Hillshade'] = pd.Series(avg_hillshade)
    
    #Drop remaining unwanted features
    #features = features.drop(['Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon','Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points'], axis=1)
    features = features.drop(['Hillshade_9am', 'Hillshade_Noon','Hillshade_3pm'], axis=1)


    soil_groups = [ 
                    [1,2,3,4,5,6,7,8,9],
                    [10,11,12,13,14,16,17],
                    [18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33],
                    [34,35,36,37,38,39,40]
                ]

    for i in range(len(soil_groups)):
        soil_group = 'Soil_Group' + str(i+1)
        features[soil_group] = pd.Series(np.zeros_like(np.asarray(features['Soil_Type1'])))
        for j in soil_groups[i]:
            soil_type = 'Soil_Type' + str(j)
            features[soil_group] += features[soil_type]

    soil_types = []
    for i in range(1,41):
        soil_type = 'Soil_Type' + str(i)
        soil_types.append(soil_type)

    features = features.drop(soil_types, axis=1)

    
    return features


In [4]:
def log_reg_test(c_val, max_it, random_seed):
    """Fit a logistic regression on train.csv and write predictions for test.csv.

    Parameters
    ----------
    c_val : passed to LogisticRegression as C.
    max_it : passed to LogisticRegression as max_iter.
    random_seed : passed to LogisticRegression as random_state.

    Reads "train.csv" and "test.csv" from the working directory and writes the
    predicted 'Cover_Type' per test 'Id' to 'logistic_regression.csv'.
    Returns None.
    """
    # One indicator from each one-hot family is left out so the remaining
    # indicators are not linearly dependent (dummy variable trap).
    redundant_dummies = ['Wilderness_Area4', 'Soil_Group4']

    raw_train = pd.read_csv("train.csv")
    train_frame, train_labels = forest_data_preprocessor(raw_train)

    raw_test = pd.read_csv("test.csv")
    test_frame = test_data_preprocessor(raw_test)

    # Training side: scale every feature into the range seen in training.
    train_matrix = np.asarray(train_frame.drop(redundant_dummies, axis=1))
    label_vector = np.asarray(train_labels)

    scaler = preprocessing.MinMaxScaler()
    train_matrix_scaled = scaler.fit_transform(train_matrix)

    classifier = LogisticRegression(
        C=c_val,
        solver='sag',
        max_iter=max_it,
        random_state=random_seed,
    )
    classifier.fit(train_matrix_scaled, label_vector)

    # Test side: same column removal, then the scaler fitted on training data.
    test_matrix = np.asarray(test_frame.drop(redundant_dummies, axis=1))
    predictions = classifier.predict(scaler.transform(test_matrix))

    submission = pd.DataFrame({'Id': raw_test['Id'], 'Cover_Type': predictions})
    submission.to_csv(path_or_buf='logistic_regression.csv', index=False)

In [5]:
def knn_test(num_neigh, categorical_scale_factor):
    """Fit a k-nearest-neighbours model on train.csv and predict test.csv.

    Parameters
    ----------
    num_neigh : passed to KNeighborsClassifier as n_neighbors.
    categorical_scale_factor : multiplier applied to the eight indicator
        columns (Soil_Group1..4, Wilderness_Area1..4) before they are appended
        to the standardized numeric columns.

    Reads "train.csv" and "test.csv" from the working directory and writes the
    predicted 'Cover_Type' per test 'Id' to 'kNN.csv'. Returns None.
    """
    numeric_columns = ['Elevation', 'Average_Hillshade', 'Sine_Of_Aspect', 'Distance_To_Hydrology',
                       'Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Roadways', 'Slope']
    indicator_columns = ['Soil_Group1', 'Soil_Group2', 'Soil_Group3', 'Soil_Group4',
                         'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']

    scaler = preprocessing.StandardScaler()

    def build_matrix(frame, fit_scaler):
        # Split by dropping (not selecting) so each half keeps the frame's own column order.
        indicator_part = frame.drop(numeric_columns, axis=1)
        numeric_part = np.asarray(frame.drop(indicator_columns, axis=1))

        if fit_scaler:
            numeric_scaled = scaler.fit_transform(numeric_part)
        else:
            numeric_scaled = scaler.transform(numeric_part)

        # Indicators are not standardized; they are only re-weighted.
        for column in indicator_columns:
            indicator_part[column] = indicator_part[column].multiply(categorical_scale_factor)

        return np.concatenate((numeric_scaled, np.asarray(indicator_part)), axis=1)

    raw_train = pd.read_csv("train.csv")
    train_frame, train_labels = forest_data_preprocessor(raw_train)

    raw_test = pd.read_csv("test.csv")
    test_frame = test_data_preprocessor(raw_test)

    label_vector = np.asarray(train_labels)
    train_matrix = build_matrix(train_frame, fit_scaler=True)

    classifier = KNeighborsClassifier(n_neighbors=num_neigh, weights='distance', algorithm='brute')
    classifier.fit(train_matrix, label_vector)

    # The test matrix reuses the scaler statistics learned from the training data.
    test_matrix = build_matrix(test_frame, fit_scaler=False)
    predictions = classifier.predict(test_matrix)

    submission = pd.DataFrame({'Id': raw_test['Id'], 'Cover_Type': predictions})
    submission.to_csv(path_or_buf='kNN.csv', index=False)

In [6]:
def d_tree_test(criterion, max_depth, random_seed):
    """Fit a decision tree on train.csv and write predictions for test.csv.

    Parameters
    ----------
    criterion : 1 selects the 'entropy' split criterion; any other value
        selects 'gini'.
    max_depth : passed to DecisionTreeClassifier as max_depth.
    random_seed : passed to DecisionTreeClassifier as random_state.

    Reads "train.csv" and "test.csv" from the working directory and writes the
    predicted 'Cover_Type' per test 'Id' to 'decision_tree.csv'. Returns None.
    """
    split_criterion = 'entropy' if criterion == 1 else 'gini'

    families = ('Wilderness_Area', 'Soil_Group')
    one_hot_columns = [family + str(k) for family in families for k in range(1, 5)]

    def to_integer_labels(frame):
        # Indicator k of a family contributes k, so each row gets one integer
        # code per family; the indicators themselves are then removed.
        for family in families:
            frame[family] = sum(k * np.asarray(frame[family + str(k)]) for k in range(1, 5))
        return frame.drop(one_hot_columns, axis=1)

    raw_train = pd.read_csv("train.csv")
    train_frame, train_labels = forest_data_preprocessor(raw_train)

    raw_test = pd.read_csv("test.csv")
    test_frame = test_data_preprocessor(raw_test)

    train_matrix = np.asarray(to_integer_labels(train_frame))
    label_vector = np.asarray(train_labels)

    scaler = preprocessing.StandardScaler()
    train_matrix_scaled = scaler.fit_transform(train_matrix)

    tree = DecisionTreeClassifier(
        criterion=split_criterion,
        max_depth=max_depth,
        random_state=random_seed,
    )
    tree.fit(train_matrix_scaled, label_vector)

    # Test rows go through the same encoding and the scaler fitted on training data.
    test_matrix = np.asarray(to_integer_labels(test_frame))
    predictions = tree.predict(scaler.transform(test_matrix))

    submission = pd.DataFrame({'Id': raw_test['Id'], 'Cover_Type': predictions})
    submission.to_csv(path_or_buf='decision_tree.csv', index=False)

In [7]:
def rf_test(num_estimators, criterion, max_depth, random_seed):
    """Fit a random forest on train.csv and write predictions for test.csv.

    Parameters
    ----------
    num_estimators : passed to RandomForestClassifier as n_estimators.
    criterion : 1 selects the 'entropy' split criterion; any other value
        selects 'gini'.
    max_depth : passed to RandomForestClassifier as max_depth.
    random_seed : passed to RandomForestClassifier as random_state.

    Reads "train.csv" and "test.csv" from the working directory and writes the
    predicted 'Cover_Type' per test 'Id' to 'random_forest.csv'. Returns None.
    """
    if criterion == 1:
        split_criterion = 'entropy'
    else:
        split_criterion = 'gini'

    wilderness_columns = ['Wilderness_Area' + str(k) for k in range(1, 5)]
    soil_group_columns = ['Soil_Group' + str(k) for k in range(1, 5)]

    def weighted_code(frame, columns):
        # The k-th indicator (1-based) contributes k to the integer code.
        return sum(weight * np.asarray(frame[name]) for weight, name in enumerate(columns, start=1))

    def encode(frame):
        # Replace both one-hot families by a single integer column each.
        frame['Wilderness_Area'] = weighted_code(frame, wilderness_columns)
        frame['Soil_Group'] = weighted_code(frame, soil_group_columns)
        return frame.drop(wilderness_columns + soil_group_columns, axis=1)

    raw_train = pd.read_csv("train.csv")
    train_frame, train_labels = forest_data_preprocessor(raw_train)

    raw_test = pd.read_csv("test.csv")
    test_frame = test_data_preprocessor(raw_test)

    train_matrix = np.asarray(encode(train_frame))
    label_vector = np.asarray(train_labels)

    scaler = preprocessing.StandardScaler()
    train_matrix_scaled = scaler.fit_transform(train_matrix)

    forest = RandomForestClassifier(
        n_estimators=num_estimators,
        criterion=split_criterion,
        max_depth=max_depth,
        random_state=random_seed,
    )
    forest.fit(train_matrix_scaled, label_vector)

    # Test rows go through the same encoding and the scaler fitted on training data.
    test_matrix_scaled = scaler.transform(np.asarray(encode(test_frame)))
    predictions = forest.predict(test_matrix_scaled)

    submission = pd.DataFrame({'Id': raw_test['Id'], 'Cover_Type': predictions})
    submission.to_csv(path_or_buf='random_forest.csv', index=False)

In [8]:
# Train each model and write its submission CSV to the working directory.
log_reg_test(c_val=64, max_it=600, random_seed=13124)
knn_test(num_neigh=1, categorical_scale_factor=5)
# criterion=1 selects the 'entropy' split criterion in both tree-based runners.
d_tree_test(criterion=1, max_depth=13, random_seed=13124)
rf_test(num_estimators=150, criterion=1, max_depth=19, random_seed=13124)