In [19]:

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn import preprocessing

%matplotlib inline


# Helper to print with prettier colors
class c:
    PURPLE = '\033[95m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    END = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    
print(f"Printing {c.PURPLE}with {c.GREEN}pretty {c.FAIL}colors{c.END}{c.BOLD}!{c.END}")

# Loading the data
df = pd.read_csv('./train_dataset.csv')
#df = sns.load_dataset("./train_dataset.csv/train_dataset")
columns_old = df.columns
columns_new = []

for i, col in enumerate(columns_old):
    #col = re.sub('[(].+[)]', '', col)
    #col = col.lower()
    col = col.strip()
    #col = re.sub('\s', '_', col)
    columns_new.append(col)

df.columns = columns_new
#small_df = df.sample(frac=0.1)
#df = df.drop(columns=df.columns[14:54], axis=1)
df_soil_types = df.iloc[:, 14:54]
df_wilderness = df.iloc[:, 10:14]
df_hillshade = df.iloc[:, 6:9]
df_azimuth = df[["Aspect (azimuth)"]]

df_vals = df.iloc[:, 0:10]

df_to_normalize = df.iloc[:, [0, 2, 3, 4, 5, 9]]
df.head()


Printing [95mwith [92mpretty [91mcolors[0m[1m![0m


Unnamed: 0,Elevation (meters),Aspect (azimuth),Slope (degrees),Horizontal_Distance_To_Hydrology (meters),Vertical_Distance_To_Hydrology (meters),Horizontal_Distance_To_Roadways(meters),Hillshade_9am (0-255),Hillshade_Noon (0-255),Hillshade_3pm (0-255),Horizontal_Distance_To_Fire_Points (meters),...,7756 (32/40),7757 (33/40),7790 (34/40),8703 (35/40),8707 (36/40),8708 (37/40),8771 (38/40),8772 (39/40),8776 (40/40),Forest Cover Type Classes
0,2843,311,19,30,10,2850,167,224,196,2147,...,0,0,0,0,0,0,0,0,0,2
1,3190,358,13,552,57,4287,199,215,153,3355,...,0,0,0,0,0,0,0,0,0,1
2,3288,296,16,67,16,3050,172,233,200,713,...,0,0,0,0,0,0,0,0,0,1
3,3382,8,7,272,19,659,212,227,152,832,...,0,0,0,0,0,0,0,0,1,2
4,3382,258,8,350,47,3561,201,245,182,2305,...,0,0,0,0,0,0,1,0,0,7


In [48]:
normalized = preprocessing.normalize(df_to_normalize)
hillshade = preprocessing.MinMaxScaler().fit_transform(df_hillshade)
azimuth = preprocessing.MinMaxScaler().fit_transform(df_azimuth)
df_normalized = pd.DataFrame({
                                "Elevation (meters)": normalized[:, 0],
                                "Slope (degrees)": normalized[:, 1],
                                "Horizontal_Distance_To_Hydrology (meters)": normalized[:, 2],
                                "Vertical_Distance_To_Hydrology (meters)": normalized[:, 3],
                                "Horizontal_Distance_To_Roadways(meters)": normalized[:, 4],
                                "Horizontal_Distance_To_Fire_Points (meters)": normalized[:, 5],
                            })

df_hillshade = pd.DataFrame({
                                "Hillshade_9am (0-255)": hillshade[:, 0],
                                "Hillshade_Noon (0-255)": hillshade[:, 1],
                                "Hillshade_3pm (0-255)": hillshade[:, 2]
                            })

df_azimuth = pd.DataFrame({"Aspect (azimuth)": azimuth[:,0]})

soil_type_numerical = np.argmax(df_soil_types, axis=1)
df_soil_type_numerical = pd.DataFrame({"Soil Type": soil_type_numerical})

wilderness_numerical = np.argmax(df_wilderness, axis=1)
df_wilderness_numerical = pd.DataFrame({"Wilderness": wilderness_numerical})

df_preprocessed_numerical = pd.concat([df_normalized, df_azimuth, df_hillshade, df_wilderness_numerical, df_soil_type_numerical, df["Forest Cover Type Classes"]], axis=1)
df_preprocessed = pd.concat([df_normalized, df_azimuth, df_hillshade, df_wilderness, df_soil_types, df["Forest Cover Type Classes"]], axis=1)
df_numerical = pd.concat([df_to_normalize, df_azimuth, df_hillshade, df_wilderness_numerical, df_soil_type_numerical, df["Forest Cover Type Classes"]], axis=1)
df["Forest Cover Type Classes"].value_counts()

df_only_soil = pd.concat([df_to_normalize, df["Forest Cover Type Classes"]], axis=1)

all_normal = pd.DataFrame(data=preprocessing.normalize(df_vals), columns=df_vals.columns)
nws = pd.concat([all_normal, df_wilderness_numerical, df_soil_type_numerical, df["Forest Cover Type Classes"]], axis=1)
nw = pd.concat([all_normal, df_wilderness_numerical, df["Forest Cover Type Classes"]], axis=1)
ns = pd.concat([all_normal, df_soil_type_numerical, df["Forest Cover Type Classes"]], axis=1)
ws = pd.concat([df_wilderness_numerical, df_soil_type_numerical, df["Forest Cover Type Classes"]], axis=1)


df_all_normal = pd.concat([all_normal, ], axis=1)

df.columns

smaller_df = df.drop(['Hillshade_9am (0-255)', 'Hillshade_Noon (0-255)', 'Hillshade_3pm (0-255)', 'Slope (degrees)', 'Aspect (azimuth)'], axis=1)

meter_df = smaller_df.iloc[:, 0:5]
mn_df = pd.DataFrame(data=preprocessing.normalize(meter_df), columns=meter_df.columns)
smn_df = pd.concat([mn_df, smaller_df.iloc[:, 5:55]], axis=1)

smnwst_df = pd.concat([mn_df, df_wilderness_numerical / 4, df_soil_type_numerical / 40, df["Forest Cover Type Classes"]], axis=1)
smnwst_df.head()

Unnamed: 0,Elevation (meters),Horizontal_Distance_To_Hydrology (meters),Vertical_Distance_To_Hydrology (meters),Horizontal_Distance_To_Roadways(meters),Horizontal_Distance_To_Fire_Points (meters),Wilderness,Soil Type,Forest Cover Type Classes
0,0.623133,0.006575,0.002192,0.624667,0.470583,0.0,0.7,2
1,0.503638,0.08715,0.008999,0.676833,0.529688,0.0,0.55,1
2,0.723966,0.014752,0.003523,0.671563,0.156992,0.5,0.55,1
3,0.951308,0.07651,0.005344,0.185367,0.23403,0.25,0.975,2
4,0.622083,0.064379,0.008645,0.655008,0.42398,0.5,0.925,7


In [49]:
aa = pd.DataFrame(data=preprocessing.normalize(df_soil_type_numerical), columns=df_soil_type_numerical.columns)
aa.head()

Unnamed: 0,Soil Type
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [50]:
df_list = []
for i in range(1,8):
    new_df = df[df["Forest Cover Type Classes"] == i].sample(n=2000)
    df_list.append(new_df)
df_sampled = pd.concat(df_list)

df_list = []
for i in range(1,8):
    new_df = df_numerical[df_numerical["Forest Cover Type Classes"] == i].sample(n=2000)
    df_list.append(new_df)
df_sampled_numerical = pd.concat(df_list)


df_list = []
for i in range(1,8):
    new_df = df_preprocessed_numerical[df_preprocessed_numerical["Forest Cover Type Classes"] == i].sample(n=2000)
    df_list.append(new_df)
df_sampled_preprocessed_numerical = pd.concat(df_list)

In [17]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

def under_sample(df):

    X = df.drop(df.columns[-1], axis=1)
    y = df[df.columns[-1]]

    rus = RandomUnderSampler(sampling_strategy="not minority", random_state=69343)

    X_res, y_res = rus.fit_resample(X, y)

    return X_res, y_res


def over_sample(df):

    X = df.drop(df.columns[-1], axis=1)
    y = df[df.columns[-1]]

    ros = RandomOverSampler(sampling_strategy="not majority")

    X_res, y_res = ros.fit_resample(X, y)

    return X_res, y_res

In [52]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm, tree, neighbors, ensemble
from sklearn import utils
import scipy as sp

#kNN = neighbors.KNeighborsClassifier()
#kNN_score = cross_val_score(kNN, X, y, cv=10)
#  df_preprocessed, df_preprocessed_numerical, 

def do(model, mod_str, param_grid):
    print(mod_str)
    for test_df in [df]:
        X, y = under_sample(test_df)
        #y = test_df["Forest Cover Type Classes"]
        #X = test_df.loc[:, test_df.columns != "Forest Cover Type Classes"]
        #X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)
        sh = HalvingGridSearchCV(model, n_jobs=-1, random_state=69343).fit(X, y)
        

        score = cross_val_score(model, X, y)
        #clf = ensemble.RandomForestClassifier(random_state=42)
        #model.fit(X_train, y_train)
        #y_pred = model.predict(X_test)
        #acc = accuracy_score(y_test, y_pred)
        print("%.3f" %score.mean())
        return sh
        #feature_scores = pd.Series(model.feature_importances_, index = X.columns).sort_values(ascending = False)
    print()

#do(svm.LinearSVC(random_state=69343, dual="auto"), "SVC-linear")
#do(svm.SVC(random_state=69343), "SVC")

#do(tree.DecisionTreeClassifier(random_state=69343), "DecisionTree")
#do(neighbors.KNeighborsClassifier(), "kNN")

#SVC-linear
#df_sampled:                        0.658
#df_sampled_numerical:              0.567
#df_sampled_preprocessed_numerical: 0.555
#
#SVC
#df_sampled:                        0.641
#df_sampled_numerical:              0.623
#df_sampled_preprocessed_numerical: 0.528
#
#RandomForest
#df_sampled:                        0.866
#df_sampled_numerical:              0.863
#df_sampled_preprocessed_numerical: 0.838
#
#DecisionTree
#df_sampled:                        0.787
#df_sampled_numerical:              0.790
#df_sampled_preprocessed_numerical: 0.762
#
#kNN
#df_sampled:                        0.809
#df_sampled_numerical:              0.779
#df_sampled_preprocessed_numerical: 0.791

# in 3m 54.9s, 14000 samples in 3 datasets with 10 folds and random_state=42
#22.4

# kNN
# nw 0.934
# ns 0.946
# ws 0.595
# df 0.958
# nws 0.949

#RandomForest
# smaller_df 0.961
# df 0.955
#
#kNN
# smaller_df 0.934
# df 0.958


In [11]:
do(ensemble.RandomForestClassifier(random_state=69343, n_jobs=-1), "RandomForest")

RandomForest
0.866



In [53]:
param_grid = {"max_depth": [1, 5, 10, 15, 20, 50, 54,55, 100], "criterion": ["gini", "entropy"]}
sh = do(tree.DecisionTreeClassifier(random_state=69343), "DecisionTree", param_grid)
print(sh.best_estimator_, sh.best_params_, sh.best_score_)

DecisionTree


TypeError: HalvingGridSearchCV.__init__() missing 1 required positional argument: 'param_grid'

In [13]:
do(neighbors.KNeighborsClassifier(n_neighbors = 20, weights= "distance", algorithm= "kd_tree", n_jobs=-1), "kNN")

kNN
0.792



In [14]:
do(svm.LinearSVC(random_state=69343, dual="auto"), "SVC-linear")

SVC-linear
0.656



In [8]:
do(svm.SVC(random_state=69343), "SVC")

SVC
0.642

