In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (KFold, StratifiedKFold, GroupKFold)
# print(os.getcwd())

# Model Selection Assignment

## Car Purchase Dataset Loading

In [2]:
# Load the car-purchase data and index it by user.
# The path uses a forward slash: the previous "ExData\car_data.csv" relied on
# "\c" being an unrecognised escape sequence (a SyntaxWarning on recent Python)
# and only resolved on Windows. "/" works on every platform.
df = pd.read_csv("ExData/car_data.csv").set_index("User ID")

# Standardised salary (zero mean, unit variance).
# NOTE: the scaler is fit on the full dataset, before any CV or holdout split,
# so validation rows contribute to the mean/std. For a leak-free estimate,
# scale inside a Pipeline passed to cross_val_score instead.
df['AnnualSalaryStd'] = StandardScaler().fit_transform(df[['AnnualSalary']]).ravel()

# One-hot encode Gender into integer 'Female' / 'Male' indicator columns.
df = pd.concat([df, pd.get_dummies(df['Gender'], dtype=int)], axis=1)

# Age bands used as groups for GroupKFold:
#   0: <=17, 1: 18-26, 2: 27-35, 3: 36-44, 4: 45-53, 5: 54-62, 6: >=63
# pd.cut uses right-closed intervals, so integer ages land in the same bands as
# the original chain of range checks, without leaving gaps between bands.
age_edges = [-np.inf, 17, 26, 35, 44, 53, 62, np.inf]
df['AgeGroup'] = pd.cut(df['Age'], bins=age_edges, labels=False)

df.head()

Unnamed: 0_level_0,Gender,Age,AnnualSalary,Purchased,AnnualSalaryStd,Female,Male,AgeGroup
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
385,Male,35,20000,0,-1.528498,0,1,2
681,Male,40,43500,0,-0.846767,0,1,3
353,Male,49,74000,0,0.038032,0,1,4
895,Male,40,107500,1,1.009861,0,1,3
661,Male,25,79000,0,0.183081,0,1,1


## Question 1 Cross-validation

Perform cross validation using Logistic Regression on the Car Purchase dataset.
1. Create a classifier named `clf`
2. Create a variable named `scores` to store the output from the `cross_val_score()` function. Use five folds.
   1. Use different combinations of Age, AnnualSalaryStd and Female as the independent (predictor) variables to find the best model; `Purchased` is the dependent variable.

In [29]:
# Q1 candidate: all three predictors.
feature_cols = ['Age', 'AnnualSalaryStd', 'Female']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.82 0.82 0.89 0.84 0.79]


(0.79, 0.8320000000000001, 0.89)

In [22]:
# Q1 candidate: Age + standardised salary (Gender dropped).
feature_cols = ['Age', 'AnnualSalaryStd']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.8   0.825 0.89  0.835 0.795]


(0.795, 0.8290000000000001, 0.89)

In [23]:
# Q1 candidate: standardised salary + Female indicator (Age dropped).
feature_cols = ['AnnualSalaryStd', 'Female']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.755 0.755 0.78  0.76  0.795]


(0.755, 0.7689999999999999, 0.795)

In [24]:
# Q1 candidate: Age + Female indicator (salary dropped).
feature_cols = ['Age', 'Female']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.795 0.825 0.85  0.79  0.805]


(0.79, 0.8130000000000001, 0.85)

In [7]:
# Q1 candidate: Age only.
feature_cols = ['Age']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.795 0.825 0.83  0.79  0.815]


(0.79, 0.8109999999999999, 0.83)

In [25]:
# Q1 candidate: standardised salary only.
feature_cols = ['AnnualSalaryStd']
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=df[feature_cols],
    y=df['Purchased'],
    cv=5,
)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.755 0.76  0.78  0.765 0.795]


(0.755, 0.771, 0.795)

## Question 2 Cross-validation

Perform cross validation using Logistic Regression on the Car Purchase dataset with holdout data.
1. Use `train_test_split()` to generate X_train, X_test, y_train, y_test DataFrames
2. Create a variable named `scores` to store the output from the `cross_val_score()` function. Use five folds.
   1. Use different combinations of Age, AnnualSalaryStd and Female as the independent (predictor) variables to find the best model; `Purchased` is the dependent variable.
3. Fit the classifier `clf` using the best performing variable combination.
4. How well does the model work on unseen data?

In [42]:
# Q2 candidate: all three predictors.
# Hold out 20% of the rows, then cross-validate on the remaining 80% only so
# the test split stays unseen during model selection.
predictors = df[['Age', 'AnnualSalaryStd', 'Female']]
target = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=42
)

clf = LogisticRegression(random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.85    0.80625 0.83125 0.83125 0.825  ]


(0.80625, 0.82875, 0.85)

In [43]:
# Q2 candidate: Age + standardised salary.
# Same 80/20 split parameters as the other candidate; cross-validation only
# sees the training portion.
predictors = df[['Age', 'AnnualSalaryStd']]
target = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=42
)

clf = LogisticRegression(random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
# Worst fold, mean, best fold.
scores.min(), scores.mean(), scores.max()

[0.85625 0.79375 0.825   0.825   0.83125]


(0.79375, 0.8262499999999999, 0.85625)

In [44]:
# Final model: refit on the whole training split, then compare train vs.
# holdout performance.
#
# The split and classifier are rebuilt here with the chosen features spelled
# out, so this cell no longer depends on which candidate cell happened to run
# last (previously it silently reused whatever X_train / clf were in scope).
# Age + AnnualSalaryStd is used: its CV mean (~0.826) is essentially tied with
# the three-feature model (~0.829) while needing one fewer predictor.
final_features = ['Age', 'AnnualSalaryStd']
X_train, X_test, y_train, y_test = train_test_split(
    df[final_features], df['Purchased'], test_size=0.2, random_state=42
)

clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Similar train and test scores indicate the model generalises to unseen rows.
print("All Train Data:\n", classification_report(y_train, clf.predict(X_train)))
print("Test Data:\n", classification_report(y_test, clf.predict(X_test)))

All Train Data:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       486
           1       0.81      0.73      0.77       314

    accuracy                           0.83       800
   macro avg       0.82      0.81      0.82       800
weighted avg       0.83      0.83      0.83       800

Test Data:
               precision    recall  f1-score   support

           0       0.79      0.93      0.86       112
           1       0.88      0.69      0.78        88

    accuracy                           0.82       200
   macro avg       0.84      0.81      0.82       200
weighted avg       0.83      0.82      0.82       200



## Question 3 - Grouped KFolds

Perform GroupKFold cross-validation with 6 splits, using `cross_val_score()` instead of a manual loop.
1. Create a variable named `cv` for a `GroupKFold()` generator with 6 splits.
2. Create a logistic regression classifier `clf`.
3. Use `cross_val_score()` to score the classifier with `AnnualSalaryStd` and `Female`.
   1. Pass the group column to the `groups=` argument.
   2. Pass the generator to the argument `cv`. 


In [45]:
# GroupKFold keeps each group inside a single fold, so the number of splits
# must be less than or equal to the number of distinct groups.
n_group_splits = 6
cv = GroupKFold(n_splits=n_group_splits)

In [46]:
# Q3: score salary + Female with folds defined by age band, so rows sharing an
# AgeGroup are never split between the training and validation side of a fold.
clf = LogisticRegression(random_state=42)
scores = cross_val_score(
    clf,
    X=df[['AnnualSalaryStd', 'Female']],
    y=df['Purchased'],
    groups=df['AgeGroup'],
    cv=cv,
)
scores

array([0.83823529, 0.42253521, 0.78571429, 0.4137931 , 0.83333333,
       0.28571429])