In [None]:
# Mount Google Drive into the runtime at /content/drive/ (requires the google.colab package).
# NOTE(review): the data-loading cell reads '/train_dataset.csv.gz' from the filesystem
# root rather than from the mounted drive - confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [1]:
# Imports

# Basics
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import randint
import re

# Useful model and scoring imports
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# Random forest
from sklearn.ensemble import RandomForestClassifier

# Support vector machines
from sklearn import svm

# Put model name here and import

# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
%matplotlib inline


# Helper to print with prettier colors
class c:
    """ANSI escape sequences for coloured / styled console output.

    Prefix text with one of the codes below and finish with ``c.END`` so that
    whatever is printed afterwards goes back to the default style.
    """
    # Reset to the default style
    END = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours, listed by ascending code
    FAIL = '\033[91m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

# Quick visual check that the escape codes render in this front end.
print(f"Printing {c.PURPLE}with {c.GREEN}pretty {c.FAIL}colors{c.END}{c.BOLD}!{c.END}")


Printing [95mwith [92mpretty [91mcolors[0m[1m![0m


In [None]:
# Loading and cleaning the data
# NOTE(review): absolute path at the filesystem root - confirm this is where the
# dataset lives in the target runtime.
df = pd.read_csv('/train_dataset.csv.gz')


def _clean_column_name(name):
    """Turn one raw column header into a lower-case, underscore-separated name.

    Steps, in order: remove everything from the first '(' to the last ')'
    (the match is greedy and needs at least one character between them),
    lower-case, strip surrounding whitespace, then replace every remaining
    whitespace character with '_'.
    """
    # Raw strings keep the regex escapes (e.g. \s) from being read as
    # (invalid) Python string escapes.
    name = re.sub(r'[(].+[)]', '', name)
    name = name.lower().strip()
    return re.sub(r'\s', '_', name)


columns_old = df.columns
columns_new = [_clean_column_name(col) for col in columns_old]

df.columns = columns_new
df.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,7756,7757,7790,8703,8707,8708,8771,8772,8776,forest_cover_type_classes
0,2843,311,19,30,10,2850,167,224,196,2147,...,0,0,0,0,0,0,0,0,0,2
1,3190,358,13,552,57,4287,199,215,153,3355,...,0,0,0,0,0,0,0,0,0,1
2,3288,296,16,67,16,3050,172,233,200,713,...,0,0,0,0,0,0,0,0,0,1
3,3382,8,7,272,19,659,212,227,152,832,...,0,0,0,0,0,0,0,0,1,2
4,3382,258,8,350,47,3561,201,245,182,2305,...,0,0,0,0,0,0,1,0,0,7


In [None]:

"""
 # Random forest classifier

Requires:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
possibly more stuff
"""
def random_forest(data, random_state=42):
  """Train a default random forest on `data` and return its hold-out accuracy.

  The last column of `data` is taken as the target and every preceding column
  as a feature. Rows are split 80/20 into train and test sets.

  Args:
    data: DataFrame whose final column holds the labels to predict.
    random_state: Seed forwarded to both the train/test split and the forest,
      so repeated calls return the same score. Pass None to get a fresh random
      split and model on every call.

  Returns:
    Accuracy of the fitted forest on the 20 % test split.
  """
  # Data (use data.sample(frac=0.05) here for a quick experiment)
  sample = data
  y = sample[sample.columns[-1]]
  X = sample[sample.columns[:-1]]
  # Split
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=random_state)
  # Model
  rf = RandomForestClassifier(random_state=random_state)
  # Training
  rf.fit(X_train, y_train)
  # Testing
  y_pred = rf.predict(X_test)
  # Accuracy
  accuracy = accuracy_score(y_test, y_pred)
  # Disabled: render the top levels of the first three trees.
  # for i in range(3):
  #   tree = rf.estimators_[i]
  #   dot_data = export_graphviz(tree,
  #                              feature_names=X_train.columns,
  #                              filled=True,
  #                              max_depth=2,
  #                              impurity=False,
  #                              proportion=True)
  #   graph = graphviz.Source(dot_data)
  #   display(graph)
  return accuracy

def random_forest_hyperparameter_tuning(data, random_state=42):
  """Tune a random forest with a randomised search and return its test accuracy.

  The last column of `data` is the target, all preceding columns are features.
  After an 80/20 train/test split, 5 parameter settings are sampled and each is
  scored with 5-fold cross-validation on the training part. The search refits
  the best setting on the full training part (scikit-learn's default
  `refit=True`), and that refitted model is scored on the held-out test part.

  Prints the best parameters and the resulting test accuracy.

  Args:
    data: DataFrame whose final column holds the labels to predict.
    random_state: Seed forwarded to the split, the forest and the search so a
      run can be reproduced. Pass None for non-deterministic behaviour.

  Returns:
    Accuracy of the best model found on the 20 % test split.
  """
  # Data (use data.sample(frac=0.1) here for a quick experiment)
  sample = data
  y = sample[sample.columns[-1]]
  X = sample[sample.columns[:-1]]
  # Split
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=random_state)
  # Model
  rf = RandomForestClassifier(random_state=random_state)
  # Parameter distributions (scipy's randint excludes the upper bound)
  param_dist = {'n_estimators': randint(50,500), 'max_depth': randint(1,20)}
  # Search for best hyperparameters
  rand_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=5, cv=5, random_state=random_state)
  # Fit the data
  rand_search.fit(X_train, y_train)
  # The search has already refitted the best configuration on all of X_train,
  # so it is used directly instead of training an identical forest again.
  best_rf = rand_search.best_estimator_
  best_params = rand_search.best_params_
  # Print the best hyperparameters
  print('Best hyperparameters:', best_params)
  print('max_depth:', best_params['max_depth'])
  print('n_estimators:', best_params['n_estimators'])
  y_pred = best_rf.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  print('Tuned accuracy:', accuracy)
  return accuracy
  # Results recorded from earlier (unseeded) runs:
  #   frac=0.5
  #     Best hyperparameters: {'max_depth': 10, 'n_estimators': 289}
  #     Tuned accuracy: 0.7453078407296966
  #   frac=0.1
  #     Best hyperparameters: {'max_depth': 18, 'n_estimators': 257}
  #     Tuned accuracy: 0.8308920270151741
  #   frac=1?
  #     Best hyperparameters: {'max_depth': 19, 'n_estimators': 158}
  #     Tuned accuracy: 0.8806641755697069



#print(random_forest_hyperparameter_tuning(df)) # Latest run 2min 4s 'max_depth': 6, 'n_estimators': 468
#print(random_forest(df))


Best hyperparameters: {'max_depth': 19, 'n_estimators': 158}
max_depth: 19
n_estimators: 158
Tuned accuracy: 0.8806641755697069
0.8806641755697069


In [None]:

#"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
#if features are not separate from target:

def support_vector_machine(data):
    """Fit an SVC on an 80/20 split of `data` and return its test accuracy.

    The last column of `data` is the target, all preceding columns are
    features. Both the split and the classifier use random_state=42.

    Args:
        data: DataFrame whose final column holds the labels to predict.

    Returns:
        Mean accuracy of the fitted classifier on the 20 % test split.
    """
    #"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
    # Use the frame that was passed in rather than the notebook-global `df`.
    features = data[data.columns[:-1]]
    target = data[data.columns[-1]]
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    # NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
    # kernel, so it has no effect with the default kernel used here.
    svc = svm.SVC(random_state=42, C=2, degree=1, gamma=1).fit(X_train, y_train)
    return svc.score(X_test, y_test)
def support_vector_machine_cv(data):
    """Return the mean 5-fold cross-validation score of a default SVC on `data`.

    The last column of `data` is the target, all preceding columns are
    features.

    Args:
        data: DataFrame whose final column holds the labels to predict.

    Returns:
        Mean of the five per-fold scores.
    """
    #"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
    # Use the frame that was passed in rather than the notebook-global `df`.
    features = data[data.columns[:-1]]
    target = data[data.columns[-1]]
    svc = svm.SVC(random_state=42)
    return np.mean(cross_val_score(svc, features, target, cv=5))

def linear_support_vector_machine(data):
    """Fit a LinearSVC on an 80/20 split of `data` and return its test accuracy.

    The last column of `data` is the target, all preceding columns are
    features. The split uses random_state=32, the classifier random_state=42.

    Args:
        data: DataFrame whose final column holds the labels to predict.

    Returns:
        Mean accuracy of the fitted classifier on the 20 % test split.
    """
    # Use the frame that was passed in rather than the notebook-global `df`.
    features = data[data.columns[:-1]]
    target = data[data.columns[-1]]
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=32)
    #max_iter needs to be > default to converge on entire dataset, here 2x
    lsvc = svm.LinearSVC(random_state=42, max_iter=2000, dual="auto").fit(X_train, y_train)
    return lsvc.score(X_test, y_test)

def linear_support_vector_machine_cv(data):
    """Return the mean 5-fold cross-validation score of a LinearSVC on `data`.

    The last column of `data` is the target, all preceding columns are
    features.

    Args:
        data: DataFrame whose final column holds the labels to predict.

    Returns:
        Mean of the five per-fold scores.
    """
    # Use the frame that was passed in rather than the notebook-global `df`.
    features = data[data.columns[:-1]]
    target = data[data.columns[-1]]
    #max_iter needs to be > default to converge on entire dataset, here 2x
    lsvc = svm.LinearSVC(random_state=42, max_iter=2000, dual="auto")
    return np.mean(cross_val_score(lsvc, features, target, cv=5))
""" 
#if features are separate from target:
def support_vector_machine_hyper(features, target):
    #"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
    #could potentially swap to nyström approximation of kernel

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    svc = svm.SVC(random_state=42)
    grids = GridSearchCV(svc, param_grid={'C': [0.1,1,2,10], 'degree' : [1,2,3], 'gamma' : [0.1, 1, 5]})
    grids.fit(X_train,y_train)
    print(grids.best_params_)
    print(grids.best_score_)
    svc = svm.SVC(random_state=42).fit(X_train, y_train)
    return svc.score(X_test, y_test)
def support_vector_machine_hyper(features, target):
    #"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
    #could potentially swap to nyström approximation of kernel
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    svc = svm.SVC(random_state=42).fit(X_train, y_train)
    return svc.score(X_test, y_test)

def support_vector_machine_cv(features, target):
    #"The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples" - Sklearn
    svc = svm.SVC(random_state=42)
    return np.mean(cross_val_score(svc, features, target, cv= 5))

def linear_support_vector_machine(features, target):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=32)
    
    lsvc = svm.LinearSVC(random_state=42, max_iter=2000, dual="auto")#.fit(X_train,y_train) #max_iter needs to be > default to converge on entire dataset, here 2x

    lsvc = svm.LinearSVC(random_state=42, max_iter=2000, dual="auto").fit(X_train, y_train)
    return lsvc.score(X_test, y_test)

def linear_support_vector_machine_cv(features, target):
    lsvc = svm.LinearSVC(random_state=42, max_iter=2000, dual="auto") #max_iter needs to be > default to converge on entire dataset, here 2x
    
    return np.mean(cross_val_score(lsvc, features, target, cv= 5))
 """
# support_vector_machine takes the whole frame (target in the last column) as
# its single argument; there is no top-level `data` variable in this notebook.
print(support_vector_machine(df))