In [None]:
#Based on tutorial: https://machinelearningmastery.com/random-forest-ensemble-in-python/
#Run this code before you can classify

# Use numpy to convert to arrays
import numpy as np
from numpy import mean, std

# Pandas is used for data manipulation
import pandas as pd

# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the models we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

# Import matplotlib for plotting
import matplotlib.pyplot as plt

#load in the dataset
heloc_frame = pd.read_csv('heloc_dataset_v1.csv')

#the column that stores the labels
labelDimension = "RiskPerformance"

# Fixed seed for the train/test split, so that a fresh "Restart & Run All"
# reproduces the same partition (and therefore the same trained models).
SPLIT_RANDOM_STATE = 42

# Labels are the values we want to predict
labels = np.array(heloc_frame[labelDimension])

# Remove the labels from the features
# axis 1 refers to the columns
feature_frame = heloc_frame.drop(labelDimension, axis = 1)

# Column names of the remaining features (used later to label the importance plot)
feature_names = feature_frame.columns

# Convert to numpy array
features = np.array(feature_frame)

# Split the data into training and testing sets
# (original author's note: heavily overfit on provided dataset to get as close as possible to the original model)
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size = 0.30, random_state = SPLIT_RANDOM_STATE
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


def build_rf_model(train_features, train_labels, features, labels,
                   n_estimators=1500, random_state=None):
    """Fit a random forest classifier and print its cross-validated accuracy.

    The returned forest is fitted on ``train_features`` / ``train_labels``.
    The printed accuracy comes from a separate 10-fold stratified
    cross-validation over ``features`` / ``labels``.

    Parameters
    ----------
    train_features, train_labels
        Data the returned forest is fitted on.
    features, labels
        Data used for the cross-validated accuracy estimate.
    n_estimators : int, default 1500
        Number of decision trees in the forest.
    random_state : int or None, default None
        Seed passed to both the forest and the cross-validation splitter.
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    RandomForestClassifier
        The forest fitted on the training data.
    """
    # Instantiate the forest and train it on the training data
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(train_features, train_labels)
    print("done!")

    # Evaluate the model with stratified 10-fold cross-validation
    print("evaluating:")
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=random_state)
    n_scores = cross_val_score(rf, features, labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # Report performance. np.mean / np.std are used rather than the bare
    # `mean` / `std` names so that a notebook cell rebinding `std` to an
    # array cannot break this function.
    print(n_scores)
    print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

    return rf

In [None]:
# Train the random forest classifier (this also prints its cross-validated accuracy)
rf_model = build_rf_model(
    train_features=train_features,
    train_labels=train_labels,
    features=features,
    labels=labels,
)

In [None]:
# Impurity-based importance of every feature, as reported by the fitted forest
importances = rf_model.feature_importances_

# Spread of each feature's importance across the individual trees (drawn as error bars).
# Deliberately not named `std`: that would overwrite the `std` function imported
# from numpy, which build_rf_model / build_kNN_model call when reporting accuracy.
importances_std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Example (disabled): classify a single instance with the trained forest.
# NOTE: `features` is already a NumPy array with the label column removed,
# so the row is indexed directly and no further column has to be dropped.

# #get the first datarow of the dataset
# instance = features[0]

# # Use the forest's predict method on that single row
# prediction = rf_model.predict(instance.reshape(1,-1))

# #print prediction
# print(prediction)

In [None]:
def build_kNN_model(train_features, train_labels, features, labels, n_neighbors=5):
    """Fit a k-nearest-neighbours classifier and print its cross-validated accuracy.

    The returned classifier is fitted on ``train_features`` / ``train_labels``.
    The printed accuracy comes from a separate 10-fold stratified
    cross-validation over ``features`` / ``labels``.

    Parameters
    ----------
    train_features, train_labels
        Data the returned classifier is fitted on.
    features, labels
        Data used for the cross-validated accuracy estimate.
    n_neighbors : int, default 5
        Number of neighbours that vote on each prediction.

    Returns
    -------
    KNeighborsClassifier
        The classifier fitted on the training data.
    """
    # Instantiate the kNN classifier. It is configured through `n_neighbors`;
    # it has no `n_estimators` argument (that belongs to ensemble models).
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    # Train the model on training data
    knn.fit(train_features, train_labels)
    print("done!")

    # Evaluate the model with stratified 10-fold cross-validation
    print("evaluating:")
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1)
    n_scores = cross_val_score(knn, features, labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # Report performance. np.mean / np.std are used rather than the bare
    # `mean` / `std` names so that a notebook cell rebinding `std` to an
    # array cannot break this function.
    print(n_scores)
    print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

    return knn

In [None]:
#build a kNN
# Stored under its own name so the random forest held in `rf_model` is not overwritten.
knn_model = build_kNN_model(train_features, train_labels, features, labels)