# Running Different Permutations of Models
- The example below uses the car data set and `cross_val_score` to test Decision Tree models built on different sets of X-variables
- first section is doing it manually, second section I've automated with a for loop
- I've also automated looking at different scoring metrics

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (KFold, StratifiedKFold, GroupKFold)
# Show the working directory; the data file below is read via a relative path.
working_dir = os.getcwd()
print(working_dir)

H:\jpmDesk\Desktop\DS Oct 2023\Python Project


In [29]:
# Load the car data set and index it by "User ID".
# The path is assembled with os.path.join rather than a backslash literal:
# "\c" is not a valid escape sequence (newer Python versions warn about it)
# and a hard-coded backslash separator only resolves on Windows.
car_data_path = os.path.join("ExData", "car_data.csv")
df = pd.read_csv(car_data_path).set_index(["User ID"])

# One-hot encode Gender and append the indicator columns to the frame
# (the 'Female' indicator is used as a feature in later cells).
df = pd.concat([df, pd.get_dummies(df['Gender'])], axis=1)

Reference for the available `scoring` metric names: https://scikit-learn.org/stable/modules/model_evaluation.html

In [30]:
# Baseline: 5-fold cross-validation of a decision tree on all three features.
# No `scoring` is passed, so the estimator's own default score is reported.
clf = tree.DecisionTreeClassifier(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[['Age', 'AnnualSalary', 'Female']],
    y=df['Purchased'],
    cv=5,
)
scores

array([0.88 , 0.875, 0.905, 0.875, 0.87 ])

In [31]:
# Feature matrix reused by the per-metric cross-validation cells below.
X = df.loc[:, ['Age', 'AnnualSalary', 'Female']]

In [32]:
# Per-fold precision over 5 folds.
cross_val_score(estimator=clf, X=X, y=df['Purchased'], scoring='precision', cv=5)

array([0.81111111, 0.83950617, 0.8961039 , 0.84146341, 0.86666667])

In [33]:
# Per-fold recall over 5 folds.
cross_val_score(estimator=clf, X=X, y=df['Purchased'], scoring='recall', cv=5)

array([0.9125    , 0.85      , 0.8625    , 0.85185185, 0.80246914])

In [34]:
# Per-fold F1 score over 5 folds.
cross_val_score(estimator=clf, X=X, y=df['Purchased'], scoring='f1', cv=5)

array([0.85882353, 0.8447205 , 0.87898089, 0.84662577, 0.83333333])

In [35]:
# Per-fold ROC AUC over 5 folds.
cross_val_score(estimator=clf, X=X, y=df['Purchased'], scoring='roc_auc', cv=5)

array([0.88541667, 0.87083333, 0.89791667, 0.87130408, 0.85755784])

In [36]:
# Fit the classifier on every row of the data set (no hold-out here).
clf.fit(X=X, y=df['Purchased'])

# Permutate Models

In [56]:
# Reload the car data set for the automated section, indexed by "User ID".
# os.path.join replaces the backslash literal: "\c" is not a valid escape
# sequence (newer Python versions warn about it) and a hard-coded backslash
# separator only resolves on Windows.
df = pd.read_csv(os.path.join("ExData", "car_data.csv"), index_col=['User ID'])

# One-hot encode Gender and append the indicator columns to the frame.
df = pd.concat([df, pd.get_dummies(df['Gender'])], axis=1)

In [82]:
# Candidate estimators. Each is bound to its own name: the previous chained
# assignment `clf2 = clf = LogisticRegression(...)` replaced the decision tree
# held in `clf`, so both names (and both entries of typesModels) pointed at the
# same logistic-regression object.
clf = tree.DecisionTreeClassifier(random_state=42)
clf2 = LogisticRegression(random_state=0)

# Feature subsets to compare, from the full set down to single columns.
X1_cols = ['Age', 'AnnualSalary', 'Female'] #all cols
X2_cols = ['Age', 'AnnualSalary']
X3_cols = ['Age']
X4_cols = ['AnnualSalary']
allXs = [X1_cols, X2_cols, X3_cols, X4_cols]
typesModels = [clf, clf2]

# Target column and the scoring metrics evaluated for every feature subset.
y = df['Purchased']
metrics = ['accuracy','recall','precision','f1','roc_auc']

In [83]:
# Hold out 20% of the rows for the final test and keep 80% for training
# (i.e. 800 training rows / 200 test rows if the file has 1,000 rows).
X_train, X_test, y_train, y_test = train_test_split(
    df[X1_cols],
    y,
    random_state=0,
    test_size=0.2,
)

In [96]:
def _cv_summary(feature_cols, metric_name):
    """Cross-validate `clf` on one feature subset and summarise the fold scores.

    Only the training rows are used. With cv=5 each fold is scored on one
    fifth of those rows after fitting on the remaining four fifths.
    """
    fold_scores = cross_val_score(clf, X_train[feature_cols], y_train, cv=5, scoring=metric_name)
    return {
        'Scores': fold_scores,
        'Mean': fold_scores.mean(),
        'STD': fold_scores.std(),
        'Min': fold_scores.min(),
        'Cols': feature_cols,
    }


# One summary table per metric, with one row per feature subset in allXs.
results = {
    metric_name: pd.DataFrame([_cv_summary(feature_cols, metric_name) for feature_cols in allXs])
    for metric_name in metrics
}

In [85]:
# Stack the per-metric tables; the metric name becomes the outer index level.
pd.concat(list(results.values()), keys=list(results.keys()))

Unnamed: 0,Unnamed: 1,Scores,Mean,STD,Min,Cols
accuracy,0,"[0.80625, 0.91875, 0.88125, 0.85625, 0.85]",0.8625,0.037081,0.80625,"[Age, AnnualSalary, Female]"
accuracy,1,"[0.80625, 0.90625, 0.85625, 0.85, 0.84375]",0.8525,0.032016,0.80625,"[Age, AnnualSalary]"
accuracy,2,"[0.8, 0.85, 0.76875, 0.83125, 0.8375]",0.8175,0.029422,0.76875,[Age]
accuracy,3,"[0.70625, 0.71875, 0.70625, 0.73125, 0.7125]",0.715,0.009354,0.70625,[AnnualSalary]
recall,0,"[0.71875, 0.859375, 0.8153846153846154, 0.8461...",0.817163,0.051286,0.71875,"[Age, AnnualSalary, Female]"
recall,1,"[0.765625, 0.859375, 0.7846153846153846, 0.784...",0.801923,0.032859,0.765625,"[Age, AnnualSalary]"
recall,2,"[0.671875, 0.71875, 0.5846153846153846, 0.7230...",0.69351,0.062564,0.584615,[Age]
recall,3,"[0.546875, 0.53125, 0.5076923076923077, 0.5230...",0.544856,0.037467,0.507692,[AnnualSalary]
precision,0,"[0.7796610169491526, 0.9322033898305084, 0.883...",0.840225,0.058021,0.779661,"[Age, AnnualSalary, Female]"
precision,1,"[0.7538461538461538, 0.9016393442622951, 0.85,...",0.828916,0.049173,0.753846,"[Age, AnnualSalary]"


In [93]:
# Final model: refit on the training rows using Age and AnnualSalary only.
finalX = X_train.loc[:, ['Age', 'AnnualSalary']]
clf.fit(X=finalX, y=y_train)

In [94]:
# Score on the same rows the model was fitted on (optimistic by construction).
clf.score(X=finalX, y=y_train)

0.9925

In [95]:
# Score on the held-out test rows, restricted to the final model's features.
X_test_final = X_test[['Age', 'AnnualSalary']]
clf.score(X_test_final, y_test)

0.905

In [86]:
from sklearn.model_selection import cross_validate

In [88]:
# Evaluate all metrics in a single cross-validation pass over the training rows.
# cross_validate returns a dict of per-fold arrays (timings plus one entry per metric).
scoring = ['accuracy','recall','precision', 'f1','roc_auc']
scores = cross_validate(
    estimator=clf,
    X=X_train[X1_cols],
    y=y_train,
    scoring=scoring,
)
scores

{'fit_time': array([0.00400209, 0.004498  , 0.00400019, 0.00600338, 0.00400543]),
 'score_time': array([0.01351666, 0.01251268, 0.0150032 , 0.01399398, 0.01199627]),
 'test_accuracy': array([0.80625, 0.91875, 0.88125, 0.85625, 0.85   ]),
 'test_recall': array([0.71875   , 0.859375  , 0.81538462, 0.84615385, 0.84615385]),
 'test_precision': array([0.77966102, 0.93220339, 0.88333333, 0.80882353, 0.79710145]),
 'test_f1': array([0.74796748, 0.89430894, 0.848     , 0.82706767, 0.82089552]),
 'test_roc_auc': array([0.79166667, 0.90812174, 0.8708502 , 0.85384615, 0.85595142])}