# Running Different Permutations of Models
- The example below uses the car data set and `cross_val_score` to test different Decision Tree models on different sets of X-variables
- first section is doing it manually, second section I've automated with a for loop
- I've also automated looking at different scoring metrics

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (KFold, StratifiedKFold, GroupKFold)

#Codes for logistic
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [9]:
# Load the car purchase data, indexed by 'User ID'.
# A forward slash is used in the path: the backslash form relied on "\c" being
# an unrecognised escape sequence and only resolved on Windows.
df = pd.read_csv("ExData/car_data.csv", index_col=['User ID'])

# Append one indicator column per distinct 'Gender' value next to the original columns.
df = pd.concat([df, pd.get_dummies(df['Gender'])], axis=1)

# Standardised copy of 'AnnualSalary'.
# NOTE(review): the scaler is fit on every row before the train/test split made
# later in the notebook, so test rows influence the scaling -- confirm this is acceptable.
df['AnnualSalaryStd'] = StandardScaler().fit_transform(df[['AnnualSalary']])

https://scikit-learn.org/stable/modules/model_evaluation.html

# Permutate Models

In [33]:
import itertools

# Candidate predictor columns; every non-empty subset becomes one feature set.
allcols = ['Age', 'AnnualSalaryStd', 'Female']

# Enumerate the subsets by size (single columns first, then pairs, ...).
# Each subset is stored as a list so it can be used directly as a column selector.
allXs = [
    list(subset)
    for size in range(1, len(allcols) + 1)
    for subset in itertools.combinations(allcols, size)
]
len(allXs), allXs

(7,
 [['Age'],
  ['AnnualSalaryStd'],
  ['Female'],
  ['Age', 'AnnualSalaryStd'],
  ['Age', 'Female'],
  ['AnnualSalaryStd', 'Female'],
  ['Age', 'AnnualSalaryStd', 'Female']])

In [34]:
# The two candidate estimators, each created with a fixed random_state.
clf1 = tree.DecisionTreeClassifier(random_state=42)
clf2 = LogisticRegression(random_state=42)

# Label -> estimator; the label is what shows up in the 'Model' column of the results.
typesModels = dict(Decision=clf1, LogReg=clf2)

# Scorer names to evaluate, and the target column.
metrics = [
    'accuracy',
    'recall',
    'precision',
    'f1',
    'roc_auc',
]
y = df['Purchased']

In [35]:
# Hold out 20% of the rows for the final check; the split is seeded so it is repeatable.
# Per the original note this yields 800 training rows and 200 testing rows.
X_train, X_test, y_train, y_test = train_test_split(
    df[allcols],
    y,
    test_size=0.2,
    random_state=0,
)

In [55]:
# Cross-validate every (model, metric, feature subset) combination and collect
# one summary table per (metric, model) pair in `results`.
results = {}
# model_name is 'Decision' or 'LogReg', clf is the matching estimator.
# (Named model_name rather than `type` so the builtin is not shadowed for later cells.)
for model_name, clf in typesModels.items():
    for metric in metrics:
        models = []
        for cols in allXs:
            # Only the training rows are used here. cv=5 cuts them into 5 folds and
            # runs 5 rounds, each fitting on 4 folds and scoring on the remaining one
            # (with 800 training rows that is about 640 train / 160 validation per round).
            scores = cross_val_score(clf, X_train[cols], y_train, cv=5, scoring=metric)
            models.append({'Model': model_name, 'Scores': scores, 'Mean': scores.mean(),
                           'STD': scores.std(), 'Min': scores.min(), 'Cols': cols})
        resultsTable = pd.DataFrame(models)
        results[(metric, model_name)] = resultsTable

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
from pathlib import Path

# Stack every per-(metric, model) table into one frame; the dict keys become the
# outer index levels, so each row stays tagged with its metric and model.
allModelsDF = pd.concat(results)

# to_excel does not create missing folders, so make sure Output/ exists first.
Path('Output').mkdir(parents=True, exist_ok=True)
allModelsDF.to_excel('Output/Models.xlsx')
allModelsDF.head(50)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Model,Scores,Mean,STD,Min,Cols
accuracy,Decision,0,Decision,"[0.8, 0.85, 0.76875, 0.83125, 0.8375]",0.8175,0.029422,0.76875,[Age]
accuracy,Decision,1,Decision,"[0.70625, 0.71875, 0.70625, 0.725, 0.7125]",0.71375,0.007289,0.70625,[AnnualSalaryStd]
accuracy,Decision,2,Decision,"[0.6, 0.6, 0.59375, 0.59375, 0.59375]",0.59625,0.003062,0.59375,[Female]
accuracy,Decision,3,Decision,"[0.8, 0.90625, 0.85625, 0.85, 0.84375]",0.85125,0.033866,0.8,"[Age, AnnualSalaryStd]"
accuracy,Decision,4,Decision,"[0.78125, 0.825, 0.7625, 0.80625, 0.8375]",0.8025,0.027557,0.7625,"[Age, Female]"
accuracy,Decision,5,Decision,"[0.6875, 0.725, 0.71875, 0.7375, 0.69375]",0.7125,0.018957,0.6875,"[AnnualSalaryStd, Female]"
accuracy,Decision,6,Decision,"[0.8, 0.91875, 0.88125, 0.85, 0.85]",0.86,0.039251,0.8,"[Age, AnnualSalaryStd, Female]"
recall,Decision,0,Decision,"[0.671875, 0.71875, 0.5846153846153846, 0.7230...",0.69351,0.062564,0.584615,[Age]
recall,Decision,1,Decision,"[0.546875, 0.53125, 0.5076923076923077, 0.5230...",0.544856,0.037467,0.507692,[AnnualSalaryStd]
recall,Decision,2,Decision,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0,[Female]


In [57]:
# Final model: refit the decision tree on all training rows, using only the
# 'Age' and 'AnnualSalaryStd' columns.
finalX = X_train.loc[:, ['Age', 'AnnualSalaryStd']]
clf1.fit(X=finalX, y=y_train)

In [58]:
# Score of the fitted tree on the same rows it was trained on.
clf1.score(X=finalX, y=y_train)

0.9925

In [59]:
# Score on the held-out test rows, restricted to the two columns the model was fit on.
testX = X_test[['Age', 'AnnualSalaryStd']]
clf1.score(testX, y_test)

0.905