# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Split data

from sklearn.model_selection import train_test_split

# Load the prepared dataset; the first CSV column is used as the row index.
df = pd.read_csv('./ready_to_train.csv', index_col=0)

# Hold out 30% of the rows for testing.  A fixed seed keeps the partition
# identical between runs, so scores from different runs are computed on the
# same train/test rows.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

# "Pick" is the target column; every remaining column is used as a feature.
train_y = np.array(train_data["Pick"])
test_y = np.array(test_data["Pick"])

train_X = train_data.drop(labels=["Pick"], axis=1)
test_X = test_data.drop(labels=["Pick"], axis=1)

# Notebook preview of the training features (no effect when run as a script).
train_X.head()

### Scale data

from sklearn.preprocessing import StandardScaler

def scale_data(data):
    """Standardize a one-dimensional sequence of values.

    A fresh ``StandardScaler`` is fitted on ``data`` itself and immediately
    applied to it.

    Args:
        data: 1-D array-like (for example a DataFrame column).

    Returns:
        A 1-D NumPy array holding the standardized values.
    """
    # StandardScaler expects a 2-D (n_samples, n_features) input, so the
    # values are temporarily turned into a single-column matrix.
    column = np.expand_dims(np.array(data), axis=1)
    standardized = StandardScaler().fit_transform(column)
    # Drop the helper axis again to hand back a flat array.
    return standardized[:, 0]

# Continuous feature columns that are standardized before training.
SCALED_COLUMNS = [
    "WaitingTime",
    "Outflow",
    "AvgProfit",
    "AvgTime",
    "AvgDistance",
    "TotalTrips",
    "AirportRatio",
    "CabNum",
]

# Fit each column's scaler on the training split ONLY and reuse that same
# scaler for the test split.  Fitting a second scaler on the test data would
# put train and test features on different scales and would let test-set
# statistics influence preprocessing.
for column in SCALED_COLUMNS:
    column_scaler = StandardScaler().fit(train_X[[column]])
    train_X[column] = column_scaler.transform(train_X[[column]])[:, 0]
    test_X[column] = column_scaler.transform(test_X[[column]])[:, 0]

# Notebook preview of the scaled training features.
train_X.head()

# Stack train and test back together (train rows first, then test rows).
all_X = np.concatenate((train_X, test_X), axis=0)
all_y = np.concatenate((train_y, test_y), axis=0)

### Build models

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### Draw learning curves

from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), verbose=0):
    """Plot training and cross-validation scores against training-set size.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target values.
        ylim: Optional ``(ymin, ymax)`` pair applied to the y axis.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Number of parallel jobs forwarded to ``learning_curve``.
        train_sizes: Training-set sizes (or fractions) to evaluate.
        verbose: Verbosity level forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as the current
        figure, so the caller can e.g. call ``.show()`` on the result.
    """
    # Compute the scores first so that a failure here does not leave an
    # empty figure behind.  ``verbose`` is forwarded; previously it was
    # accepted but silently ignored.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)

    # Aggregate along axis 1 to get one mean/std per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()

    # Shaded bands: one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

# Base estimator and the hyper-parameter grid searched over it.
RF = RandomForestClassifier()
RF_grid = {
    "n_estimators": [5, 10, 20, 50],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [0.25, 0.5, 0.75, 1.0],
    "oob_score": [True, False],
    "n_jobs": [-1],
}

# Exhaustive grid search with 5-fold cross-validation on the training split.
gsRF = GridSearchCV(estimator=RF, param_grid=RF_grid, n_jobs=-1, cv=5)
gsRF.fit(train_X, train_y)

# From here on RF refers to the best estimator found by the search.
RF = gsRF.best_estimator_
RF_best = gsRF.best_score_

plot_learning_curve(
    estimator=RF,
    title="RF learning_curve",
    X=train_X,
    y=train_y,
    cv=10,
)

print("The best model's CV score is {}".format(RF_best))

# 10-fold cross-validation of the selected model, run separately on the
# training split and on the test split.
RF_CV_score_train = cross_val_score(estimator=RF, X=train_X, y=train_y, cv=10)
RF_CV_score_test = cross_val_score(estimator=RF, X=test_X, y=test_y, cv=10)

print("Random forest cv train mean score: ", np.mean(RF_CV_score_train))
print("Random forest cv test mean score: ", np.mean(RF_CV_score_test))

# Notebook display of the selected estimator.
RF

# Feature importances of the selected forest.
imt = RF.feature_importances_

# Zero-row slice of the original frame (shows only the column headers).
df.iloc[0:0]

imt.shape


