## fine tuning decision tree

In [13]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

# train_test_split lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [2]:
# Load the CSV (path is resolved relative to the current working directory).
data = pd.read_csv(filepath_or_buffer='sales_loss_win_data.csv')

In [3]:
# Discard the 'Opportunity Number' column; all other columns are kept.
# NOTE: `data` is rebound here, so re-running only this cell raises KeyError
# because the column is already gone.
data = data.drop(labels='Opportunity Number', axis='columns')

In [4]:
# Stand-alone label encoder instance; no later cell shown here refers to `le`.
le = LabelEncoder()

In [5]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Every selected column is replaced by the integer codes returned by a
    fresh ``LabelEncoder().fit_transform`` on that column.

    Caveat: ``fit`` stores nothing and the per-column encoders created in
    ``transform`` are discarded, so each ``transform`` call derives its codes
    from the frame it receives. Codes produced by separate calls on different
    frames are therefore not guaranteed to agree.
    """

    def __init__(self, columns=None):
        """Remember which columns to encode.

        Parameters
        ----------
        columns : iterable of column names, or None
            Columns to encode. ``None`` means every column of the frame
            passed to ``transform``.
        """
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for a fit/transform-style API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is copied first, so the caller's object is not
            modified.

        Returns
        -------
        pandas.DataFrame
            The copy, with each column in ``self.columns`` (or every column
            when ``self.columns`` is None) replaced by integer codes.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent (column name, Series) iterator.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` then ``transform`` on ``X`` and return the encoded copy."""
        return self.fit(X, y).transform(X)

In [6]:
# Replace the listed columns with integer label codes; other columns pass through.
categorical_encoder = MultiColumnLabelEncoder(
    columns=[
        'Supplies Subgroup',
        'Supplies Group',
        'Region',
        'Route To Market',
        'Opportunity Result',
        'Competitor Type',
    ]
)
final_data = categorical_encoder.fit_transform(data)

In [7]:
# Features: every column except 'Opportunity Result'.
xVar = final_data.drop('Opportunity Result', axis=1)
# Target: the encoded 'Opportunity Result' column.
yVar = final_data['Opportunity Result']

In [8]:
# Hold out 20% of the rows for testing. The split is seeded so that repeated
# runs produce the same partition (an unseeded split reshuffles every time,
# which changes every score computed from it).
X_train, X_test, y_train, y_test = train_test_split(
    xVar, yVar, test_size=0.2, random_state=100
)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(62420, 17) (62420,)
(15605, 17) (15605,)


## base model with hand-picked hyperparameter values (max_depth=3, min_samples_leaf=5)

In [10]:
# Baseline tree with fixed settings: Gini criterion, depth capped at 3,
# at least 5 samples per leaf, seeded for repeatable fits.
# The class is reached through the imported `tree` module, because the bare
# name DecisionTreeClassifier is never imported in this notebook.
clf_gini = tree.DecisionTreeClassifier(criterion = "gini", random_state = 100,
                                       max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
# Accuracy on the held-out rows, reported as a percentage.
y_pred_gini = clf_gini.predict(X_test)
print ("Gini accuracy is ", accuracy_score(y_test,y_pred_gini)*100)

Gini accuracy is  81.6661326498


In [11]:
# Fit on the training split again; as the cell's last expression, the returned
# estimator is displayed, which shows its full parameter set.
clf_gini.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')

## tune hyperparameters using RandomizedSearchCV
Randomized search on hyper parameters.

RandomizedSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings.

In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter.

Source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [14]:
# Search space for the randomized search:
#   max_depth        - chosen from the explicit list [3, None]
#   max_features     - integer drawn from randint(1, 9)
#   min_samples_leaf - integer drawn from randint(1, 9)
# scipy's randint excludes the upper bound, so draws fall in 1..8.
hyperparameters = dict(
    max_depth=[3, None],
    max_features=randint(1, 9),
    min_samples_leaf=randint(1, 9),
)

In [17]:
# Randomized search over `hyperparameters` with 5-fold cross-validation on the
# training split. The search is seeded so the sampled candidate settings are
# the same on every run; the number of candidates is left at the library
# default (n_iter).
clf = RandomizedSearchCV(clf_gini, hyperparameters, cv=5, random_state=100)

clf.fit(X_train, y_train)

# Score of the search object on the held-out test split.
print("DT Score after CV: %s" % clf.score(X_test, y_test))

DT Score after CV: 0.850560717719
