In [26]:
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn import preprocessing

%matplotlib inline


# ANSI escape codes for colouring / styling printed text
class c:
    """Namespace of ANSI escape sequences; embed them in strings passed to print()."""

    # Reset: switches off whatever colour or style is currently active
    END = '\033[0m'

    # Text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    FAIL = '\033[91m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'


print(f"Printing {c.PURPLE}with {c.GREEN}pretty {c.FAIL}colors{c.END}{c.BOLD}!{c.END}")

# Load the training data and strip leading/trailing whitespace from every column name.
# No further renaming (lower-casing, dropping the parenthesised units, ...) is applied:
# later code looks columns up by their original labels, e.g. "Aspect (azimuth)".
df = pd.read_csv('./train_dataset.csv')

columns_old = df.columns
columns_new = [name.strip() for name in columns_old]
df.columns = columns_new
#small_df = df.sample(frac=0.1)
#df = df.drop(columns=df.columns[14:54], axis=1)
# --- Split the raw frame into feature groups -------------------------------------
# NOTE(review): the positional slices assume the column order of train_dataset.csv — confirm if the file changes.
df_soil_types = df.iloc[:, 14:54]   # 40 soil-type indicator columns (presumably one-hot)
df_wilderness = df.iloc[:, 10:14]   # 4 wilderness-area indicator columns (presumably one-hot)
hillshade_raw = df.iloc[:, 6:9]
azimuth_raw = df[["Aspect (azimuth)"]]
df_to_normalize = df.iloc[:, [0, 2, 3, 4, 5, 9]]

# --- Scale the continuous features -----------------------------------------------
# Every feature column is scaled independently to [0, 1]. Per-feature scaling is used
# on purpose: preprocessing.normalize() would instead rescale each *row* to unit norm,
# which makes every feature value depend on the other features of the same sample.
normalized = preprocessing.MinMaxScaler().fit_transform(df_to_normalize)
hillshade = preprocessing.MinMaxScaler().fit_transform(hillshade_raw)
azimuth = preprocessing.MinMaxScaler().fit_transform(azimuth_raw)

# Rebuild labelled frames. Reusing df.index keeps the rows aligned with `df` when the
# frames are later joined column-wise, even if `df` does not have a default RangeIndex.
df_normalized = pd.DataFrame(
    normalized,
    columns=[
        "Elevation (meters)",
        "Slope (degrees)",
        "Horizontal_Distance_To_Hydrology (meters)",
        "Vertical_Distance_To_Hydrology (meters)",
        "Horizontal_Distance_To_Roadways(meters)",
        "Horizontal_Distance_To_Fire_Points (meters)",
    ],
    index=df.index,
)

df_hillshade = pd.DataFrame(
    hillshade,
    columns=[
        "Hillshade_9am (0-255)",
        "Hillshade_Noon (0-255)",
        "Hillshade_3pm (0-255)",
    ],
    index=df.index,
)

df_azimuth = pd.DataFrame(azimuth, columns=["Aspect (azimuth)"], index=df.index)

# --- Collapse the indicator groups into a single integer code per row -------------
# argmax over the columns yields the 0-based position of the (first) largest entry;
# it is taken on the underlying ndarray so the result is always a plain 1-D array.
soil_type_numerical = df_soil_types.to_numpy().argmax(axis=1)
df_soil_type_numerical = pd.DataFrame({"Soil Type": soil_type_numerical}, index=df.index)

wilderness_numerical = df_wilderness.to_numpy().argmax(axis=1)
df_wilderness_numerical = pd.DataFrame({"Wilderness": wilderness_numerical}, index=df.index)



# Target column appended as the last column of every derived frame below
cover_type = df["Forest Cover Type Classes"]

# Normalised continuous features + integer-coded wilderness / soil type
df_preprocessed_numerical = pd.concat(
    [df_normalized, df_azimuth, df_hillshade, df_wilderness_numerical, df_soil_type_numerical, cover_type],
    axis=1,
)

# Normalised continuous features + the original wilderness / soil indicator columns
df_preprocessed = pd.concat(
    [df_normalized, df_azimuth, df_hillshade, df_wilderness, df_soil_types, cover_type],
    axis=1,
)

# Un-normalised `df_to_normalize` columns + integer-coded wilderness / soil type.
# NOTE(review): df_azimuth and df_hillshade were rebuilt from MinMaxScaler output earlier in
# this cell, so this frame mixes scaled and unscaled columns — confirm that is intended.
df_numerical = pd.concat(
    [df_to_normalize, df_azimuth, df_hillshade, df_wilderness_numerical, df_soil_type_numerical, cover_type],
    axis=1,
)

# NOTE(review): the result is discarded — this is not the last expression of the cell.
cover_type.value_counts()

# NOTE(review): despite the name, this frame holds only the `df_to_normalize` columns plus the
# target; it contains none of the soil-type columns.
df_only_soil = pd.concat([df_to_normalize, cover_type], axis=1)


SAMPLES_PER_CLASS = 2000
SAMPLING_SEED = 42


def sample_per_class(frame, n_per_class=SAMPLES_PER_CLASS, label_col="Forest Cover Type Classes",
                     classes=range(1, 8), random_state=SAMPLING_SEED):
    """Draw a class-balanced random subset of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the column ``label_col``.
    n_per_class : int
        Number of rows drawn (without replacement) for every class.
    label_col : str
        Name of the column holding the class labels.
    classes : iterable
        Class labels to draw from; the result lists them in this order.
    random_state : int or None
        Seed for the draw. With a fixed seed the same row positions are picked on every
        run, and frames with identical row order and labels yield the same rows.

    Returns
    -------
    pandas.DataFrame
        ``n_per_class`` rows per class, concatenated class by class.

    Raises
    ------
    ValueError
        If a class has fewer than ``n_per_class`` rows.
    """
    # One generator shared by all classes so the draws are reproducible but not identical per class
    rng = np.random.RandomState(random_state)
    parts = []
    for cls in classes:
        members = frame[frame[label_col] == cls]
        if len(members) < n_per_class:
            raise ValueError(
                f"class {cls!r} has only {len(members)} rows, cannot sample {n_per_class}"
            )
        parts.append(members.sample(n=n_per_class, random_state=rng))
    return pd.concat(parts)


df_sampled = sample_per_class(df)
df_sampled_numerical = sample_per_class(df_numerical)
df_sampled_preprocessed_numerical = sample_per_class(df_preprocessed_numerical)


Printing [95mwith [92mpretty [91mcolors[0m[1m![0m


In [21]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

def under_sample(df, random_state=42):
    """Balance the classes of ``df`` by random under-sampling.

    The last column of ``df`` is used as the target and every other column as a feature.
    All classes except the minority class are resampled
    (``sampling_strategy="not minority"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target as the last column.
    random_state : int or None, default 42
        Seed passed to ``RandomUnderSampler`` so repeated runs select the same rows.
        Pass ``None`` for a different random draw on every call.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` — the resampled features and target.
    """
    # Split positionally so the result does not depend on the target column's label
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    rus = RandomUnderSampler(sampling_strategy="not minority", random_state=random_state)
    X_res, y_res = rus.fit_resample(X, y)

    return X_res, y_res


def over_sample(df, random_state=42):
    """Balance the classes of ``df`` by random over-sampling.

    The last column of ``df`` is used as the target and every other column as a feature.
    All classes except the majority class are resampled
    (``sampling_strategy="not majority"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target as the last column.
    random_state : int or None, default 42
        Seed passed to ``RandomOverSampler`` so repeated runs duplicate the same rows.
        Pass ``None`` for a different random draw on every call.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` — the resampled features and target.
    """
    # Split positionally so the result does not depend on the target column's label
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    ros = RandomOverSampler(sampling_strategy="not majority", random_state=random_state)
    X_res, y_res = ros.fit_resample(X, y)

    return X_res, y_res

In [27]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm, tree, neighbors, ensemble
from sklearn import utils

#kNN = neighbors.KNeighborsClassifier()
#kNN_score = cross_val_score(kNN, X, y, cv=10)
#  df_preprocessed, df_preprocessed_numerical, 

def do(model, mod_str):
    """Cross-validate ``model`` on the benchmark frames and print the mean scores.

    Prints ``mod_str`` as a heading, then one line per frame (``df_only_soil`` first,
    then ``df``) with the mean 10-fold cross-validation score rounded to three decimals,
    followed by an empty line. Each frame is class-balanced with ``under_sample`` before
    it is scored; the scorer is whatever ``cross_val_score`` uses by default for ``model``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    mod_str : str
        Label printed above the scores.
    """
    print(mod_str)
    for frame in (df_only_soil, df):
        features, labels = under_sample(frame)
        fold_scores = cross_val_score(model, features, labels, cv=10)
        print(f"{fold_scores.mean():.3f}")
    print()

# Benchmark a single estimator per run; the calls for the other estimators are kept
# below, disabled, so they can be switched back in.
random_forest = ensemble.RandomForestClassifier(random_state=42)
do(random_forest, "RandomForest")
#do(svm.LinearSVC(random_state=42, dual="auto"), "SVC-linear")
#do(svm.SVC(random_state=42), "SVC")
#do(tree.DecisionTreeClassifier(random_state=42), "DecisionTree")
#do(neighbors.KNeighborsClassifier(), "kNN")
#SVC-linear
#df_sampled:                        0.658
#df_sampled_numerical:              0.567
#df_sampled_preprocessed_numerical: 0.555
#
#SVC
#df_sampled:                        0.641
#df_sampled_numerical:              0.623
#df_sampled_preprocessed_numerical: 0.528
#
#RandomForest
#df_sampled:                        0.866
#df_sampled_numerical:              0.863
#df_sampled_preprocessed_numerical: 0.838
#
#DecisionTree
#df_sampled:                        0.787
#df_sampled_numerical:              0.790
#df_sampled_preprocessed_numerical: 0.762
#
#kNN
#df_sampled:                        0.809
#df_sampled_numerical:              0.779
#df_sampled_preprocessed_numerical: 0.791

# in 3m 54.9s, 14000 samples in 3 datasets with 10 folds and random_state=42
#22.4

RandomForest
0.836
0.875

