In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_excel("SampleDonorData.xlsx")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 25 columns):
TARGET_B                       19372 non-null int64
TARGET_D                       19372 non-null int64
CONTROL_NUMBER                 19372 non-null int64
MONTHS_SINCE_ORIGIN            19372 non-null int64
DONOR_AGE                      14577 non-null float64
IN_HOUSE                       19372 non-null int64
URBANICITY                     19372 non-null object
CLUSTER_CODE                   18918 non-null float64
HOME_OWNER                     19372 non-null object
DONOR_GENDER                   19372 non-null object
INCOME_GROUP                   14980 non-null float64
PUBLISHED_PHONE                19372 non-null int64
WEALTH_RATING                  10562 non-null float64
MEDIAN_HOME_VALUE              19372 non-null int64
MEDIAN_HOUSEHOLD_INCOME        19372 non-null int64
PCT_OWNER_OCCUPIED             19372 non-null int64
PEP_STAR                       19372 non-null in

In [4]:
columns_as_category = ['TARGET_B','CONTROL_NUMBER','IN_HOUSE','CLUSTER_CODE','INCOME_GROUP','PUBLISHED_PHONE','WEALTH_RATING','RECENT_STAR_STATUS','PEP_STAR']

In [5]:
data[columns_as_category] = data[columns_as_category].astype('object')

In [6]:
num_var = data.select_dtypes(exclude='object')
cat_var = data.select_dtypes(include='object')

In [7]:
num_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 12 columns):
TARGET_D                       19372 non-null int64
MONTHS_SINCE_ORIGIN            19372 non-null int64
DONOR_AGE                      14577 non-null float64
MEDIAN_HOME_VALUE              19372 non-null int64
MEDIAN_HOUSEHOLD_INCOME        19372 non-null int64
PCT_OWNER_OCCUPIED             19372 non-null int64
RECENT_CARD_RESPONSE_PROP      19372 non-null float64
MONTHS_SINCE_LAST_PROM_RESP    19126 non-null float64
LAST_GIFT_AMT                  19372 non-null int64
NUMBER_PROM_12                 19372 non-null int64
MONTHS_SINCE_LAST_GIFT         19372 non-null int64
MONTHS_SINCE_FIRST_GIFT        19372 non-null int64
dtypes: float64(3), int64(9)
memory usage: 1.8 MB


In [8]:
for col in num_var.columns:
    num_var[col].fillna(num_var[col].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
cat_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 13 columns):
TARGET_B               19372 non-null object
CONTROL_NUMBER         19372 non-null object
IN_HOUSE               19372 non-null object
URBANICITY             19372 non-null object
CLUSTER_CODE           18918 non-null object
HOME_OWNER             19372 non-null object
DONOR_GENDER           19372 non-null object
INCOME_GROUP           14980 non-null object
PUBLISHED_PHONE        19372 non-null object
WEALTH_RATING          10562 non-null object
PEP_STAR               19372 non-null object
RECENT_STAR_STATUS     19372 non-null object
recency_freq_status    19372 non-null object
dtypes: object(13)
memory usage: 1.9+ MB


In [10]:
for col in cat_var.columns:
    cat_var[col].fillna(cat_var[col].mode()[0],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [11]:
cat_var=cat_var.astype('category')

In [12]:
data_join = pd.concat([num_var,cat_var],axis=1)

In [13]:
data_join.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 25 columns):
TARGET_D                       19372 non-null int64
MONTHS_SINCE_ORIGIN            19372 non-null int64
DONOR_AGE                      19372 non-null float64
MEDIAN_HOME_VALUE              19372 non-null int64
MEDIAN_HOUSEHOLD_INCOME        19372 non-null int64
PCT_OWNER_OCCUPIED             19372 non-null int64
RECENT_CARD_RESPONSE_PROP      19372 non-null float64
MONTHS_SINCE_LAST_PROM_RESP    19372 non-null float64
LAST_GIFT_AMT                  19372 non-null int64
NUMBER_PROM_12                 19372 non-null int64
MONTHS_SINCE_LAST_GIFT         19372 non-null int64
MONTHS_SINCE_FIRST_GIFT        19372 non-null int64
TARGET_B                       19372 non-null category
CONTROL_NUMBER                 19372 non-null category
IN_HOUSE                       19372 non-null category
URBANICITY                     19372 non-null category
CLUSTER_CODE                   19372 non-

# Question 5 - Build the Best Classification Model using Machine Learning Methods

In [44]:
#Checing the class imbalance to understand the type of scoring metrics to be used for determining the best model 
print('Proportion of 0',len(data[data.TARGET_B == 0])/ len(data))
print("Proportion of 1",len(data[data.TARGET_B == 1])/ len(data))

Proportion of 0 0.75
Proportion of 1 0.25


In [14]:
#check correlation for dropping variables which are highly correlated
corr_matrix = data_join.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

In [15]:
to_drop

['MONTHS_SINCE_FIRST_GIFT']

In [16]:
data_join = data_join.drop(to_drop, axis=1)

In [17]:
X_data = data_join.drop(columns=['TARGET_B','TARGET_D','CONTROL_NUMBER'])
y_data = pd.DataFrame(data_join['TARGET_B'])

In [18]:
y_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 1 columns):
TARGET_B    19372 non-null category
dtypes: category(1)
memory usage: 19.1 KB


In [19]:
X_data_dum = pd.DataFrame(pd.get_dummies(X_data,drop_first= True))

In [20]:
from sklearn.model_selection import train_test_split
X_data_train,X_data_test, y_data_train, y_data_test = train_test_split(X_data_dum, y_data, shuffle = True, test_size = 0.3, random_state=42)

In [21]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_data_train = scaler.fit_transform(X_data_train)
X_data_test = scaler.fit_transform(X_data_test)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


##### Logistic Regression

In [32]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [33]:
from sklearn import linear_model

logistic = linear_model.LogisticRegression()
penalty = ['l1', 'l2']
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
hyperparameters = dict(C=C, penalty=penalty)
grid_lg = GridSearchCV(logistic, hyperparameters, cv=5, scoring='recall')
grid_lg.fit(X_data_train,y_data_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [34]:
print(grid_lg.best_params_)
print(grid_lg.best_score_)

{'C': 10, 'penalty': 'l2'}
0.016909330047290363


In [35]:
print(grid_lg.score(X_data_train, y_data_train))
print(grid_lg.score(X_data_test, y_data_test))

0.017798872738059923
0.015625


In [39]:
from sklearn.metrics import classification_report
lg_predict = grid_lg.predict(X_data_test)
print("\nLG scores:")
print(classification_report(y_data_test, lg_predict, target_names=["non-donors", "donors"]))


LG scores:
              precision    recall  f1-score   support

  non-donors       0.75      0.99      0.85      4340
      donors       0.45      0.02      0.03      1472

   micro avg       0.75      0.75      0.75      5812
   macro avg       0.60      0.50      0.44      5812
weighted avg       0.67      0.75      0.65      5812



##### kNN Classifier

In [43]:
from sklearn.neighbors import KNeighborsClassifier
k_range = [1,5,10,15,20]
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier(n_neighbors=k_range)
grid_kNN = GridSearchCV(knn, param_grid, cv=5, scoring='recall')
grid_kNN.fit(X_data_train, y_data_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=[1, 5, 10, 15, 20],
           p=2, weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [44]:
print(grid_kNN.best_score_)
print(grid_kNN.best_params_)

0.2678734602752223
{'n_neighbors': 1}


In [45]:
print(grid_kNN.score(X_data_train, y_data_train))
print(grid_kNN.score(X_data_test, y_data_test))

1.0
0.264945652173913


In [46]:
from sklearn.metrics import classification_report
knn_predict = grid_kNN.predict(X_data_test)
print("\nKNN scores:")
print(classification_report(y_data_test, knn_predict, target_names=["non-donors", "donors"]))


KNN scores:
              precision    recall  f1-score   support

  non-donors       0.76      0.77      0.76      4340
      donors       0.28      0.26      0.27      1472

   micro avg       0.64      0.64      0.64      5812
   macro avg       0.52      0.52      0.52      5812
weighted avg       0.64      0.64      0.64      5812



##### Linear SVC

In [40]:
from sklearn.svm import LinearSVC

svc_lin = LinearSVC()
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}

grid_svc_lin = GridSearchCV(svc_lin, param_grid, cv = 5, scoring='recall')
grid_svc_lin.fit(X_data_train, y_data_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [41]:
print(grid_svc_lin.best_score_)
print(grid_svc_lin.best_params_)

0.17557073567356912
{'C': 100}


In [42]:
from sklearn.metrics import classification_report
lin_svc_predict = grid_svc_lin.predict(X_data_test)
print("\nLinSVC scores:")
print(classification_report(y_data_test, lin_svc_predict, target_names=["non-donors", "donors"]))


LinSVC scores:
              precision    recall  f1-score   support

  non-donors       0.75      0.90      0.82      4340
      donors       0.29      0.12      0.17      1472

   micro avg       0.70      0.70      0.70      5812
   macro avg       0.52      0.51      0.49      5812
weighted avg       0.64      0.70      0.66      5812



##### Decision Tree

In [47]:
from sklearn.tree import DecisionTreeClassifier
param_grid = {'max_depth': [2, 6, 10, 15, 20]}
grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring = 'recall')
grid_dt.fit(X_data_train,y_data_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 6, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [48]:
print(grid_dt.best_params_)
print(grid_dt.best_score_)

{'max_depth': 20}
0.22486160315843973


In [49]:
from sklearn.metrics import classification_report
dt_predict = grid_dt.predict(X_data_test)
print("\nDT scores:")
print(classification_report(y_data_test, dt_predict, target_names=["non-donor", "donor"]))


DT scores:
              precision    recall  f1-score   support

   non-donor       0.76      0.83      0.79      4340
       donor       0.30      0.22      0.25      1472

   micro avg       0.68      0.68      0.68      5812
   macro avg       0.53      0.52      0.52      5812
weighted avg       0.64      0.68      0.66      5812



##### Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators':[200,350,500,700], 'max_depth':[2,6,10,15,20]}
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring = 'recall')
grid_rf.fit(X_data_train, y_data_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 350, 500, 700], 'max_depth': [2, 6, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [51]:
print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'max_depth': 20, 'n_estimators': 200}
0.018986037397217954


In [52]:
from sklearn.metrics import classification_report
rf_predict = grid_rf.predict(X_data_test)
print("\nRandomForest scores:")
print(classification_report(y_data_test, rf_predict, target_names=["non-donor", "donor"]))


RandomForest scores:
              precision    recall  f1-score   support

   non-donor       0.75      1.00      0.86      4340
       donor       0.64      0.01      0.02      1472

   micro avg       0.75      0.75      0.75      5812
   macro avg       0.70      0.50      0.44      5812
weighted avg       0.72      0.75      0.64      5812

