In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
%matplotlib inline

# Warmup Exercise
1. Obtain the cars.csv file from the google classroom and read it into python with pandas.

In [2]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


2. Create a feature named gt_avg, which should be either 1 or 0. The value should indicate whether or not a given price is greater than the average price for that car's combination of year, make, and model.

In [3]:
df['YMM_avg_price'] = df.groupby(['Year','Make', 'Model']).Price.transform('mean') 
df[['Year', 'Price', 'YMM_avg_price']].head()

Unnamed: 0,Year,Price,YMM_avg_price
0,2015,16472,17291.768786
1,2015,15749,16721.350598
2,2015,16998,19080.632911
3,2015,15777,16721.350598
4,2015,16784,17291.768786


In [4]:
df['gt_avg'] = (df['Price'] > df['YMM_avg_price']).astype(int)

In [5]:
df.head(2)

Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model,YMM_avg_price,gt_avg
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0


3. Drop the Id, City, and Vin columns.

In [6]:
cars = df.copy()

In [7]:
cars.columns

Index(['Id', 'Price', 'Year', 'Mileage', 'City', 'State', 'Vin', 'Make',
       'Model', 'YMM_avg_price', 'gt_avg'],
      dtype='object')

In [8]:
cars.drop(['Id', 'City', 'Vin', 'Price', 'YMM_avg_price'], axis=1, inplace=True)

In [9]:
cars.head(3)

Unnamed: 0,Year,Mileage,State,Make,Model,gt_avg
0,2015,18681,MO,Buick,EncoreConvenience,0
1,2015,27592,IN,Buick,EncoreFWD,0
2,2015,13650,NC,Buick,EncoreLeather,0


4. Encode the categorical features as necessary. You might wish to use a sklearn.preprocessing.LabelEncoder.

In [10]:
for col in ['Year', 'State', 'Make', 'Model']:
    encoder = LabelEncoder()
    encoder.fit(cars[col])
    cars[col] = encoder.transform(cars[col])

In [11]:
cars.head()

Unnamed: 0,Year,Mileage,State,Make,Model,gt_avg
0,18,18681,28,7,523,0
1,18,27592,19,7,525,0
2,18,13650,32,7,526,0
3,18,25195,22,7,525,0
4,18,22800,38,7,523,0


5. Split the data into training and test sets.

In [12]:
X, y = cars.drop(columns='gt_avg'), cars.gt_avg
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

Create a validation data set from the training data so that we can select models without touching the test data.

In [13]:
# This is an example of the manual way of creating validate set...
x_ttrain, X_validate, y_ttrain, y_validate = train_test_split(X_train, y_train)

Split training data into k sets so that we can train on 1 and 2, predict 3, then train on 1 and 3, predict 2, then train on 2 and 3, predict 1.

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=4)

cross_val_score(tree, X_train, y_train, cv=3) # choosing k=3, usually pick 3 or 4

array([0.64409617, 0.63872105, 0.64158663])

In [15]:
cross_val_score(tree, X_train, y_train, cv=3).mean() # this gives accuracy by default

0.6414679489200495

In [16]:
tree = DecisionTreeClassifier(max_depth=2)
cross_val_score(tree, X_train, y_train, cv=3, scoring='precision')
# choosing precision score to compare, could use f1 or any of the others

array([0.59185407, 0.58847722, 0.64550527])

In [17]:
cross_val_score(tree, X_train, y_train, cv=3, scoring='recall')

array([0.43113837, 0.43108457, 0.28007317])

In [18]:
cross_val_score(tree, X_train, y_train, cv=3, scoring='accuracy')

array([0.59465005, 0.5926863 , 0.59110764])

In [19]:
cross_val_score(tree, X_train, y_train, cv=3).mean() # this gives accuracy

0.5928146656077642

# What happens in a multiclass scenario?

In [20]:
from pydataset import data
from sklearn.metrics import classification_report

iris = data('iris')
iris.columns = [c.lower().replace('.','_') for c in iris]
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [21]:
X, y = iris.drop(columns='species'), iris.species

In [22]:
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X,y)
actual = y
predictions = tree.predict(X)
print(classification_report(actual, predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        50
  versicolor       0.98      0.94      0.96        50
   virginica       0.94      0.98      0.96        50

   micro avg       0.97      0.97      0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150



In [23]:
cross_val_score(tree, X, y, scoring='precision_macro')



array([0.98148148, 0.92156863, 0.98039216])

## Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV

hyperparameters = {
    'max_depth': [3, 4, 5],
    'max_features':[None, 2, 3],
    }


grid = GridSearchCV(DecisionTreeClassifier(), param_grid=hyperparameters, cv=3)
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [3, 4, 5], 'max_features': [None, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
results = grid.cv_results_

results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])

In [26]:
scores = results['mean_test_score']

scores

array([0.63345768, 0.60026687, 0.60850373, 0.64146795, 0.59371682,
       0.61732804, 0.65489533, 0.60852471, 0.64628502])

In [27]:
params = results['params'] # sets of hyperparameters, correspond to scores
params

[{'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 2},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 2},
 {'max_depth': 4, 'max_features': 3},
 {'max_depth': 5, 'max_features': None},
 {'max_depth': 5, 'max_features': 2},
 {'max_depth': 5, 'max_features': 3}]

In [28]:
for s, p in zip(scores, params):
    p['score'] = s
    
pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
4,4,2.0,0.593717
1,3,2.0,0.600267
2,3,3.0,0.608504
7,5,2.0,0.608525
5,4,3.0,0.617328
0,3,,0.633458
3,4,,0.641468
8,5,3.0,0.646285
6,5,,0.654895


In [29]:
params

[{'max_depth': 3, 'max_features': None, 'score': 0.6334576764756482},
 {'max_depth': 3, 'max_features': 2, 'score': 0.6002668691963293},
 {'max_depth': 3, 'max_features': 3, 'score': 0.6085037281962412},
 {'max_depth': 4, 'max_features': None, 'score': 0.6414679484220729},
 {'max_depth': 4, 'max_features': 2, 'score': 0.5937168249279328},
 {'max_depth': 4, 'max_features': 3, 'score': 0.6173280351126011},
 {'max_depth': 5, 'max_features': None, 'score': 0.6548953293694586},
 {'max_depth': 5, 'max_features': 2, 'score': 0.6085247084789714},
 {'max_depth': 5, 'max_features': 3, 'score': 0.6462850213369475}]