In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn import model_selection

In [28]:
import numpy as np
import pandas as pd

In [29]:
class Selector():
        
    '''Non-operational bag of Methods for selecting between model types and feature inputs given a model.  Parent of the
    operant classes GenericClassifier and GenericRegressor'''    
    
    def ModelSelection(self, folds=10, rstate=420):
        
        cv_scores, cv_summary = {}, {}
        
        
        for name, model in self.Models.items():
            
            try:
            
                kfold = model_selection.KFold(n_splits=folds, random_state=rstate) 
                cv_result = model_selection.cross_val_score(model, self.X, self.y, cv=kfold, scoring='accuracy')
                cv_summary = "%s: %f (%f)" % (name, cv_result.mean(), cv_result.std())
                cv_scores[name] = cv_result       
                
            
            except Exception as e:
                
                cv_scores[name] = e
                cv_summary[name] = e
        
        self.cv_scores = cv_scores
        
        # Print Summary
        for k, v in Model.cv_scores.items():
    
           msg = "%s: %f (%f)" % (k, v.mean(), v.std())
           print(msg)
            
        # We could return a 'best model' for ease of use, but it will require us to be explicit about our selection criteria
        # (MSE, std errors, priors) up front -> seems exceptionally black boxy; we should probably just look at the results
        # and decide manually (What else would anyone pay us for?).

        
    def FeatureSelection(self, folds=10, rstate=420):
        
        '''This section is considerably more sketchy than the model selection component; needs work
        before results are to be trusted'''
        
        feature_cols = self.X.columns
        scores = {}
        kfold = model_selection.KFold(n_splits=folds, random_state=rstate)
        model = self.best_model
        model.fit(self.X, self.y)
        mse_scores = -model_selection.cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
        scores[None] = mse_scores
        
        for dropped_x in feature_cols:
    
            feature_subset = [item for item in feature_cols if item != dropped_x]
            X2 = self.X[feature_subset]
            model = self.best_model
            model.fit(X2, y)
            mse_scores = -model_selection.cross_val_score(model, X2, y, cv=kfold, scoring='neg_mean_squared_error')
            scores[dropped_x] = mse_scores
        
        self.feature_scores = scores
        
        summary = {key: {'MEAN MSE': value.mean(), 'MEAN RMSE': np.sqrt(value).mean()} for key, value in scores.items()}
        self.feature_summary = summary 
        
class GenericClassifier(Selector):
    
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
        
        self.Models = {
                       
            'LR': LogisticRegression(),
            'KNN': KNeighborsClassifier(),
            'GBT': GradientBoostingClassifier(),
            'NB': GaussianNB(),
            'SVM': SVC(),
            'DT': DecisionTreeClassifier()
        
        }
        
class GenericRegressor(Selector):
    
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
        
        self.Models = {
                       
            # 'OLS': LinearRegression(),
            # etc..
        
        }

In [74]:
cable_training = pd.read_csv('./Data/training.csv', na_values=(-999, 6)) # value = 6 corresponds to refusal to answer, 6 nowhere else in data
cable_holdout = pd.read_csv('./Data/holdout.csv', na_values=(-999, 6))
cable = pd.concat([cable_training, cable_holdout])

In [75]:
def CleanCableData(df):
    
    # Since this is a purely exploratory excercise in ML, we have no priors about inappropriate information, beyond the obvious (ID)
    
    #drop = ['YES', 'ID', 'age', 'class', 'tele_have']
    #df['value'] = [(i - 3) for i in df['value']] # Normalize (-2 to +2)
    #df = df[[col for col in df.columns if col not in drop]]
    
    drop = ['ID', 'tele_have']
    df = df[[col for col in df.columns if col not in drop]]
    df = df.dropna()
    
    return df

In [76]:
cable = CleanCableData(cable)

cable['constant'] = [1 for i in range(len(cable))]

y = pd.DataFrame(cable['buy'])
X = cable[[col for col in cable.columns if col != 'buy']]
X = np.array(X)

In [77]:
Model = GenericClassifier(X, y)

# Model Selection

In [78]:
import warnings
warnings.filterwarnings('ignore')

Model.ModelSelection(folds=5)

SVM: 0.871505 (0.006737)
GBT: 0.849268 (0.005532)
LR: 0.763249 (0.015521)
DT: 0.998402 (0.001082)
NB: 0.743675 (0.017478)
KNN: 0.998535 (0.000883)


In [38]:
# What are these cv_scores telling me?

# Per documentation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
# Returns: Array of scores of the estimators prediction for each run of the cross validation.
# So a cv score of '1' in a classification problem means perfect prediction over the holdout? -> ok, seems fine

# STD of cv scores gives an indication of how sensitive my MSE (generalization error) is w.r.t. my sampling procedure.
# Do I pick a classifier based on lowest mean error (holdout accuracy) or lowest variance of that error?

# Highest Reported Accuracy -> KNN
# Lowest Variance -> Still KNN

# An MSE and standard error of ~zero seems peculiar; its' difficult to believe that the model can predict consumer
# behavior with almost 100% accuracy.

In [70]:
# What is KNN doing?  
# Retrain 'best model' on the entire sample

Model = KNeighborsClassifier()
Model.fit(X, y)

#...

# Double checked the model to make sure predictions were in the correct dimensions, etc. No glaring problems
# detected.  Still, I find this result very odd.  Use LIME as a tie-breaker?

# For a discussion on hyperparameter selection, please see the 'Proof of Concept' notebook for automatic
# Model selection; we're a bit fuzzy on this and could use an in person discussion to clean some stuff up.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Feature Selection

In [80]:
# We have an opinion on the believability of our 'Applied' like model (see: ), but there's no equivalent
# testing framework to evaulate the claims of the KNN model -> Use Linear Approximations

# LIME