In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# This only works inside Google Colab, where the `google.colab` package exists.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [15]:
# Imports

# Basics
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import randint
import re

# Useful model and scoring imports
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# Random forest
from sklearn.ensemble import RandomForestClassifier

# Put model name here and import
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

# Data scalers
from sklearn.preprocessing import StandardScaler


# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [3]:
%matplotlib inline


# Helper to print with prettier colors
class c:
    """Namespace of ANSI escape sequences for styling printed text.

    Interpolate an attribute into a string to switch a style on, and
    interpolate ``c.END`` to reset the output back to the default style.
    """

    # Reset
    END = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours, ordered by escape code
    FAIL = '\033[91m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

# Demo: print one sample line that exercises several of the styles defined on `c`,
# resetting with c.END so the styling does not carry over into later output.
print(f"Printing {c.PURPLE}with {c.GREEN}pretty {c.FAIL}colors{c.END}{c.BOLD}!{c.END}")


Printing [95mwith [92mpretty [91mcolors[0m[1m![0m


In [7]:
# Loading and cleaning the data
df = pd.read_csv('./train_dataset.csv')

# Normalise every column name to snake_case so columns can be referenced
# consistently: drop a parenthesised part, lower-case, trim surrounding
# whitespace, then turn each remaining whitespace character into an underscore.
columns_old = df.columns
columns_new = []

for col in columns_old:
    # Greedy match: removes everything from the first '(' to the last ')'.
    col = re.sub(r'[(].+[)]', '', col)
    col = col.lower()
    # Strip before replacing whitespace so that the space left in front of a
    # removed "(...)" suffix does not become a trailing underscore.
    col = col.strip()
    # Raw string: in a plain literal '\s' is an invalid escape sequence and
    # makes Python emit a SyntaxWarning when the cell is compiled.
    col = re.sub(r'\s', '_', col)
    columns_new.append(col)

df.columns = columns_new
df.head()

  col = re.sub('\s', '_', col)


Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,7756,7757,7790,8703,8707,8708,8771,8772,8776,forest_cover_type_classes
0,2843,311,19,30,10,2850,167,224,196,2147,...,0,0,0,0,0,0,0,0,0,2
1,3190,358,13,552,57,4287,199,215,153,3355,...,0,0,0,0,0,0,0,0,0,1
2,3288,296,16,67,16,3050,172,233,200,713,...,0,0,0,0,0,0,0,0,0,1
3,3382,8,7,272,19,659,212,227,152,832,...,0,0,0,0,0,0,0,0,1,2
4,3382,258,8,350,47,3561,201,245,182,2305,...,0,0,0,0,0,0,1,0,0,7


# Prune Outliers

In [None]:
def prune_outliers(df, n_columns=10, threshold=1.5):
    """Drop rows that hold an IQR outlier in any of the leading columns.

    Columns are processed from left to right. For each column, rows whose
    value lies more than ``threshold * IQR`` below the first quartile or above
    the third quartile are removed. The quartiles of the next column are then
    computed on the rows that are still left, so the pruning is sequential.

    Args:
        df: Input DataFrame. It is not modified.
        n_columns: How many leading columns to check (default 10, the value
            that used to be hard-coded). Capped at the number of columns
            actually present.
            NOTE(review): presumably the first ten columns are the continuous
            features of this dataset - confirm against the CSV.
        threshold: Multiplier applied to the IQR to get the outlier fences.

    Returns:
        A DataFrame containing only the surviving rows, with their original
        index labels.
    """
    # The body previously read (and reassigned) an undefined local `data`
    # while the parameter is called `df`, so every call raised
    # UnboundLocalError. Bind the working name to the argument explicitly.
    data = df

    for col_pos in range(min(n_columns, data.shape[1])):
        values = data.iloc[:, col_pos]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        is_outlier = ((values < q1 - threshold * iqr) |
                      (values > q3 + threshold * iqr))

        # Boolean filtering returns a new frame (the caller's frame is left
        # untouched) and, unlike drop(index), stays correct when index labels
        # are duplicated. NaN compares False on both sides, so NaN rows stay.
        data = data[~is_outlier]

    return data

# Logistic Regression

In [18]:
def logistic_regression(data, labels, test_size=0.2, random_state=None):
    """Train a logistic regression on standardised features and score it.

    The data is split into a training and a test part, the features are
    standardised, a ``LogisticRegression`` is fitted on the training part and
    its accuracy on the held-out test part is returned.

    Args:
        data: Feature matrix (DataFrame or array-like), one row per sample.
        labels: Class labels aligned with the rows of ``data``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split``. ``None`` (the
            default) gives a different random split on every call, which is
            the previous behaviour; pass an int for a reproducible score.

    Returns:
        The mean accuracy of the fitted model on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=test_size, random_state=random_state)

    # Fit the scaler on the training rows only. Fitting it on the full data
    # before splitting lets the test rows influence the mean/variance used to
    # transform the training data, which leaks information into the model.
    std_scaler = StandardScaler()
    X_train_scaled = std_scaler.fit_transform(X_train)
    X_test_scaled = std_scaler.transform(X_test)

    # `multi_class='multinomial'` is no longer passed: the argument is
    # deprecated in recent scikit-learn releases (1.5+), and the default
    # already fits a multinomial model for multiclass targets.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_scaled, y_train)

    return model.score(X_test_scaled, y_test)

# Random Forest

In [8]:

"""
 # Random forest classifier

Requires:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
and, for the hyperparameter search, RandomizedSearchCV (sklearn.model_selection) and randint (scipy.stats)
"""
def random_forest(data, test_size=0.2, random_state=None, n_trees_to_show=0):
    """Train a default random forest and return its hold-out accuracy.

    The last column of ``data`` is used as the class label and every other
    column as a feature.

    Args:
        data: DataFrame with the features first and the label in the last
            column. Pass e.g. ``data.sample(frac=0.05)`` for a quicker run.
        test_size: Forwarded to ``train_test_split`` (default 0.2, the value
            that used to be hard-coded).
        random_state: Seed used for both the split and the forest. ``None``
            (the default) keeps the previous unseeded behaviour; pass an int
            for a reproducible score.
        n_trees_to_show: If greater than 0, render the top levels of that
            many trees of the fitted forest with graphviz. This replaces the
            visualisation snippet that was parked in a string literal.

    Returns:
        Accuracy of the fitted forest on the test split.
    """
    # Positional slicing keeps the split correct even if column names repeat.
    y = data.iloc[:, -1]
    X = data.iloc[:, :-1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    if n_trees_to_show > 0:
        # Imported here so the default path works outside IPython as well.
        from IPython.display import display

        # export_graphviz expects plain string names, not a pandas Index.
        feature_names = [str(name) for name in X_train.columns]
        for tree in rf.estimators_[:n_trees_to_show]:
            dot_data = export_graphviz(tree,
                                       feature_names=feature_names,
                                       filled=True,
                                       max_depth=2,
                                       impurity=False,
                                       proportion=True)
            display(graphviz.Source(dot_data))

    return accuracy

def random_forest_hyperparameter_tuning(data, n_iter=5, cv=5, test_size=0.2,
                                        random_state=None):
    """Tune a random forest with a randomised search and score the best model.

    The last column of ``data`` is used as the class label and every other
    column as a feature. ``n_estimators`` is sampled from ``randint(50, 500)``
    and ``max_depth`` from ``randint(1, 20)`` (scipy's upper bound is
    exclusive), each candidate is cross-validated on the training split, and
    the best model is evaluated on the held-out test split.

    Args:
        data: DataFrame with the features first and the label in the last
            column. Pass e.g. ``data.sample(frac=0.1)`` for a quicker run.
        n_iter: Number of parameter settings sampled (default 5).
        cv: Number of cross-validation folds (default 5).
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed used for the split, the forest and the search.
            ``None`` (the default) keeps the previous unseeded behaviour.

    Returns:
        Accuracy of the best model on the test split. The best parameters and
        the accuracy are also printed.

    Results noted by the author from earlier runs (sample fraction as noted):
        frac=0.5 : max_depth 10, n_estimators 289 -> 0.7453078407296966
        frac=0.1 : max_depth 18, n_estimators 257 -> 0.8308920270151741
        frac=1?  : max_depth 19, n_estimators 158 -> 0.8806641755697069
    """
    # Positional slicing keeps the split correct even if column names repeat.
    y = data.iloc[:, -1]
    X = data.iloc[:, :-1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Parameter distributions
    param_dist = {'n_estimators': randint(50, 500), 'max_depth': randint(1, 20)}

    # Search for the best hyperparameters
    rand_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state)
    rand_search.fit(X_train, y_train)

    best_params = rand_search.best_params_
    print('Best hyperparameters:',  best_params)
    print('max_depth:', best_params['max_depth'])
    print('n_estimators:', best_params['n_estimators'])

    # With the default refit=True the search has already retrained the best
    # configuration on the whole training split, so use that model directly
    # instead of fitting another forest with the same parameters.
    best_rf = rand_search.best_estimator_
    accuracy = accuracy_score(y_test, best_rf.predict(X_test))
    print('Tuned accuracy:', accuracy)
    return accuracy



#print(random_forest_hyperparameter_tuning(df)) # Latest run 2min 4s 'max_depth': 6, 'n_estimators': 468
#print(random_forest(df))
