In [1]:
# Remember to merge the training and holdout sets

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn import model_selection

In [3]:
cable_1 = pd.read_csv('./Data/training.csv', na_values=[-999, 6])
cable_2 = pd.read_csv('./Data/holdout.csv', na_values=[-999, 6])

In [4]:
cable = pd.concat([cable_1, cable_2])

In [5]:
def CleanCableData(df):
    
    # omitting irrelevant/redundant columns and singular dummies (age=1, class=poor(d and e))
    
    drop = ['YES', 'ID', 'age', 'class', 'tele_have', 'd', 'de', 'emp_ft', 'emp_hwife', 'prog_qual', 'race', 'sex']
    df['value'] = [(i - 3) for i in df['value']] # Normalize (-2 to +2)
    df = df[[col for col in df.columns if col not in drop]]
    df = df.dropna()
    
    return df

In [6]:
cable = CleanCableData(cable)

In [11]:
class Selector():
        
    '''Non-operational bag of Methods for selecting between model types and feature inputs given a model.  Parent of the
    operant classes GenericClassifier and GenericRegressor'''    
    
    def ModelSelection(self, folds=10, rstate=420):
        
        cv_scores, cv_summary = {}, {}
        
        
        for name, model in self.Models.items():
            
            try:
            
                kfold = model_selection.KFold(n_splits=folds, random_state=rstate) 
                cv_result = model_selection.cross_val_score(model, self.X, self.y, cv=kfold, scoring='accuracy')
                cv_summary = "%s: %f (%f)" % (name, cv_result.mean(), cv_result.std())
                cv_scores[name] = cv_result       
                
            
            except Exception as e:
                
                cv_scores[name] = e
                cv_summary[name] = e
        
        self.cv_scores = cv_scores
        
        # Print Summary
        for k, v in self.cv_scores.items():
    
           msg = "%s: %f (%f)" % (k, v.mean(), v.std())
           print(msg)
            
        # We could return a 'best model' for ease of use, but it will require us to be explicit about our selection criteria
        # (MSE, std errors, priors) up front -> seems exceptionally black boxy; we should probably just look at the results
        # and decide manually (What else would anyone pay us for?).

        
    def FeatureSelection(self, folds=10, rstate=420):
        
        '''This section is considerably more sketchy than the model selection component; needs work
        before results are to be trusted'''
        
        feature_cols = self.X.columns
        scores = {}
        kfold = model_selection.KFold(n_splits=folds, random_state=rstate)
        model = self.best_model
        model.fit(self.X, self.y)
        mse_scores = -model_selection.cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
        scores[None] = mse_scores
        
        for dropped_x in feature_cols:
    
            feature_subset = [item for item in feature_cols if item != dropped_x]
            X2 = self.X[feature_subset]
            model = self.best_model
            model.fit(X2, y)
            mse_scores = -model_selection.cross_val_score(model, X2, y, cv=kfold, scoring='neg_mean_squared_error')
            scores[dropped_x] = mse_scores
        
        self.feature_scores = scores
        
        summary = {key: {'MEAN MSE': value.mean(), 'MEAN RMSE': np.sqrt(value).mean()} for key, value in scores.items()}
        self.feature_summary = summary 
        
class GenericClassifier(Selector):
    
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
        
        self.Models = {
                       
            'LR': LogisticRegression(),
            'KNN': KNeighborsClassifier(),
            'GBT': GradientBoostingClassifier(),
            'NB': GaussianNB(),
            'SVM': SVC(),
            'DT': DecisionTreeClassifier()
        
        }
        
class GenericRegressor(Selector):
    
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
        
        self.Models = {
                       
            # 'OLS': LinearRegression(),
            # etc..
        
        }

In [12]:
y = pd.DataFrame(cable['buy'])
X = cable[[col for col in cable.columns if col != 'buy']]

feature_names = X.columns
X = np.array(X)

# add a constant column

X_c = np.c_[np.ones(X.shape[0]), X]
feature_names = feature_names.insert(0, 'constant')

In [13]:
Model = GenericClassifier(X, y)

In [15]:
y

Unnamed: 0,buy
1,0.0
2,1.0
3,0.0
4,0.0
5,1.0
6,0.0
8,0.0
9,1.0
11,0.0
12,0.0


In [14]:
Model.ModelSelection()

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, war

KNN: 0.997067 (0.001145)
SVM: 0.862511 (0.011037)
DT: 0.996700 (0.001309)
LR: 0.749863 (0.011237)
NB: 0.677360 (0.006217)
GBT: 0.813566 (0.014707)


In [28]:
Model.cv_scores['LR']

array([0.74335472, 0.74610449, 0.74610449, 0.74610449, 0.74060495,
       0.73877177, 0.74610449, 0.74885426, 0.76718607, 0.77543538])

In [30]:
X_c

array([[1., 0., 0., ..., 1., 1., 1.],
       [1., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])