In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Seed NumPy's global random generator so the np.random draws that build the
# synthetic dataset below are repeatable from run to run.
np.random.seed(10)

In [4]:
# Build a synthetic table of runners: weekly mileage, longest run, and a
# binary "qualified" label. The three np.random calls are made in a fixed
# order (mileage, longest run, label) so a seeded run always yields the
# same table.
num_samples = 500

# Weekly mileage: normal draw, limited to [30, 120], rounded to whole miles.
mean1, stddev1 = 55, 10
column1_numbers = (
    np.random.normal(mean1, stddev1, num_samples)
    .clip(30, 120)
    .round()
    .astype(int)
)

# Longest single run: normal draw, limited to [12, 26], rounded to whole miles.
mean2, stddev2 = 18, 3
column2_numbers = (
    np.random.normal(mean2, stddev2, num_samples)
    .clip(12, 26)
    .round()
    .astype(int)
)

# Label: start from a fair 0/1 draw per runner, then force the label to 1 for
# every runner whose weekly mileage is strictly above the mean. Runners at or
# below the mean keep their random draw.
coin_flips = np.random.randint(2, size=num_samples)
column3_numbers = np.where(column1_numbers > mean1, 1, coin_flips)

data = {
    'Miles_per_week': column1_numbers,
    'Farthest_run': column2_numbers,
    'Qualified_Boston_Marathon': column3_numbers,
}
df = pd.DataFrame(data)

In [5]:
# Show the generated frame; pandas abbreviates the display to the first and
# last rows.
df

Unnamed: 0,Miles_per_week,Farthest_run,Qualified_Boston_Marathon
0,68,22,1
1,62,22,1
2,40,21,0
3,55,16,0
4,61,22,1
...,...,...,...
495,60,15,1
496,56,17,1
497,60,22,1
498,48,17,0


In [6]:
# First ten rows, as a quick sanity check of the generated values.
df.head(10)

Unnamed: 0,Miles_per_week,Farthest_run,Qualified_Boston_Marathon
0,68,22,1
1,62,22,1
2,40,21,0
3,55,16,0
4,61,22,1
5,48,17,0
6,58,18,1
7,56,21,1
8,55,18,1
9,53,18,1


In [7]:
# Class balance of the target. Label 1 dominates because every runner above
# the mean weekly mileage is forced to 1 on top of the random 0/1 draw.
df['Qualified_Boston_Marathon'].value_counts()

Qualified_Boston_Marathon
1    371
0    129
Name: count, dtype: int64

In [8]:
# Feature matrix: the two numeric predictors (the first two columns of df),
# selected by name rather than by position.
X = df[['Miles_per_week', 'Farthest_run']]

In [9]:
# Target vector: the binary qualification label (the third column of df),
# selected by name rather than by position.
y = df['Qualified_Boston_Marathon']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the classes are imbalanced (371 vs 129 in the value_counts
# output above), so passing stratify=y may be worth considering.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=26
)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Base estimator shared by both hyperparameter searches.
# The forest is given its own seed because bootstrap sampling and feature
# selection are random, and relying on the global np.random seed is not a
# reliable way to make estimators reproducible once the searches clone and
# fit them in parallel workers (n_jobs=-1). 26 matches the random_state used
# for the train/test split.
rf = RandomForestClassifier(random_state=26)

In [18]:
# Search space for the exhaustive grid search: 3 x 3 x 3 = 27 combinations.
# 'criterion' and 'max_depth' are not searched here, so the estimator's
# defaults apply for them.
param_grid = [
    {
        'n_estimators': [500, 1000, 1500],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [1, 2, 4],
    }
]

In [19]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Exhaustive search: every combination in param_grid is cross-validated and
# scored by accuracy, using all available cores.
# NOTE(review): cv=2 gives only two folds per candidate, so the scores are
# noisy and not directly comparable with a search run at a different cv.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
)

In [23]:
# Run the grid search on the training split only; X_test / y_test are not
# used here.
grid_search.fit(X_train,y_train)

In [24]:
# Mean cross-validated accuracy of the best-scoring parameter combination.
grid_search.best_score_

np.float64(0.7971428571428572)

In [25]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}

In [27]:
# Randomized search: evaluate a random sample of hyperparameter combinations instead of the full grid.

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# Search space for the randomized search: 3 x 2 x 3 x 3 x 3 = 162 possible
# combinations, of which only a sample is evaluated.
# NOTE: this rebinds param_grid, replacing the smaller grid-search space
# defined earlier; re-running the grid-search cell after this one would pick
# up this larger space instead.
param_grid = [
    {
        'n_estimators': [500, 1000, 1500],
        'criterion': ['entropy', 'gini'],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [1, 2, 4],
        'max_depth': [10, 20, 30],
    }
]

In [33]:
# Randomized search over param_grid, scored by 5-fold cross-validated
# accuracy on all available cores. random_state fixes which parameter
# combinations get sampled.
random_grid_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10,  # scikit-learn's default, spelled out: 10 candidates are sampled
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=26,
)

In [34]:
# Run the randomized search on the training split only; X_test / y_test are
# not used here.
random_grid_search.fit(X_train,y_train)

In [35]:
# Mean cross-validated accuracy of the best sampled candidate.
# NOTE(review): this search uses cv=5 while the grid search uses cv=2, so the
# two best_score_ values are not directly comparable.
random_grid_search.best_score_

np.float64(0.76)

In [36]:
# Best of the sampled parameter combinations (not necessarily the best of the
# whole search space, since only a sample is evaluated).
random_grid_search.best_params_

{'n_estimators': 1500,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_depth': 30,
 'criterion': 'gini'}