# Brainless MSE Search

In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import metrics

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

In [3]:
class Selector():
        
        
    def ModelSelection(self, folds=10, rstate=420):
        
        cv = {}
        
        for name, model in self.Models.items():
            
            try:
            
                kfold = model_selection.KFold(n_splits=folds, random_state=rstate) 
                cv_result = model_selection.cross_val_score(model, self.X, self.y, cv=kfold, scoring='accuracy')
                cv[name] = cv_result
            
            except Exception as e:
                
                cv[name] = e
            

        summary = {name: results.mean() for name, results in cv.items()}
        
        best_model = max(summary, key=summary.get)
        best_model = self.Models[best_model]
        
        self.cv_results = cv
        self.model_summary = summary
        self.best_model = best_model
        
    def FeatureSelection(self, folds=10, rstate=420):
        
        feature_cols = self.X.columns
        scores = {}
        kfold = model_selection.KFold(n_splits=folds, random_state=rstate)
        model = self.best_model
        model.fit(self.X, self.y)
        mse_scores = -model_selection.cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
        scores[None] = mse_scores
        
        for dropped_x in feature_cols:
    
            feature_subset = [item for item in feature_cols if item != dropped_x]
            X2 = self.X[feature_subset]
            model = self.best_model
            model.fit(X2, y)
            mse_scores = -model_selection.cross_val_score(model, X2, y, cv=kfold, scoring='neg_mean_squared_error')
            scores[dropped_x] = mse_scores
        
        self.feature_scores = scores
        
        summary = {key: {'MEAN MSE': value.mean(), 'MEAN RMSE': np.sqrt(value).mean()} for key, value in scores.items()}
        self.feature_summary = summary 
        

In [27]:
class GenericClassifier(Selector):
    
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
        
        self.Models = {
                       
            'LR': LogisticRegression(),
            'KNN': KNeighborsClassifier(),
            'GBT': GradientBoostingClassifier(),
            'NB': GaussianNB(),
            'SVM': SVC(),
            'DT': DecisionTreeClassifier()
        
        }

In [28]:
# K-Fold CV claims to not require a holdout, so concat to avoid throwing away data

cable_training = pd.read_csv('./Data/training.csv', na_values=(-999, 6)) # value = 6 corresponds to refusal to answer, 6 nowhere else in data
cable_holdout = pd.read_csv('./Data/holdout.csv', na_values=(-999, 6))
cable = pd.concat([cable_training, cable_holdout])

In [29]:
def CleanCableData(df):
    
    # Since this is a purely exploratory excercise in ML, we have no priors about inappropriate information, beyond the obvious (ID)
    
    #drop = ['YES', 'ID', 'age', 'class', 'tele_have']
    #df['value'] = [(i - 3) for i in df['value']] # Normalize (-2 to +2)
    #df = df[[col for col in df.columns if col not in drop]]
    
    drop = ['ID', 'tele_have']
    df = df[[col for col in df.columns if col not in drop]]
    df = df.dropna()
    
    return df

In [30]:
cable = CleanCableData(cable)

cable['constant'] = [1 for i in range(len(cable))]

y = pd.DataFrame(cable['buy'])
X = cable[[col for col in cable.columns if col != 'buy']]

In [31]:
Model = GenericClassifier(X, y)

In [32]:
import warnings
warnings.filterwarnings('ignore')


Model.ModelSelection(folds=5)

In [40]:
Model.best_model.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [41]:
predictions = [i[0] for i in Model.best_model.predict_proba(X)] # Model Prediction over training set

In [44]:
sum(predictions) / len(predictions), y.mean()[0] # Far too high

(0.6928362183754991, 0.30758988015978694)

In [37]:
Model.FeatureSelection() # Not working properly

In [None]:
# We are unsure wether the MSE minimization approach has caused this issue, or if we have simply straw-manned ML by not doing
# rigorous data analysis up front; lets try a more reasonable X

In [45]:
def CleanCableData(df):
    
    
    drop = ['YES', 'ID', 'age', 'class', 'tele_have']
    df['value'] = [(i - 3) for i in df['value']] # Normalize (-2 to +2)
    df = df[[col for col in df.columns if col not in drop]]
    
    df = df[[col for col in df.columns if col not in drop]]
    df = df.dropna()
    
    return df

In [47]:
cable = pd.concat([cable_training, cable_holdout])
cable = CleanCableData(cable)

In [48]:
Model2 = GenericClassifier(X, y)

In [49]:
Model2.ModelSelection(folds=5)

In [51]:
Model2.best_model.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [52]:
predictions2 = [i[1] for i in Model2.best_model.predict_proba(X)]

In [53]:
sum(predictions2) / len(predictions2), y.mean()[0] # More in line with expectations

(0.30716378162450075, 0.30758988015978694)

# LIME