###### Model Exercises

In [1]:
# Setup environment

In [48]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


import matplotlib.pyplot as plt
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_iris_data
from prepare import prep_iris
from split_scale import split_my_data

## Logistic Regression Exercises

In [3]:
# Acquire data

In [4]:
df = get_iris_data()

In [5]:
# Prep data

In [6]:
def prep_iris(iris_df):
    iris_df.drop(columns=['species_id', 'measurement_id'], inplace=True)
    iris_df.rename(columns={'species_name':'species'}, inplace=True)
    # encoder = LabelEncoder()
    # encoder.fit(iris_df.species)
    # iris_df.species = encoder.transform(iris_df.species)
    return iris_df

In [7]:
df = prep_iris(df)

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [8]:
# Create a dataframe for my independent variables
# Create a dataframe for my target

In [9]:
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df[['species']]

In [10]:
# Split my data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.30,random_state=123)

In [12]:
# Train Model

In [13]:
# Create the logistic regression object

In [14]:
logit = LogisticRegression()

In [15]:
# Fit the model to the training data

In [16]:
logit.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Print the coefficients and intercept of the model

In [18]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.38421538  1.32718255 -2.11307588 -0.94269552]
 [ 0.43099717 -1.34596217  0.4506587  -1.07117492]
 [-1.517952   -1.52141607  2.26046444  2.12613123]]
Intercept: 
 [ 0.25726194  0.58107381 -0.87235291]


In [19]:
# Estimate whether or not a passenger would survive, using the training data

In [20]:
y_train.head()

Unnamed: 0,species
114,virginica
136,virginica
53,versicolor
19,setosa
38,setosa


In [21]:
y_pred = logit.predict(X_train)

In [22]:
# Estimate the probability of a passenger surviving, using the training data

In [23]:
y_pred_proba = logit.predict_proba(X_train)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [24]:
# Evaluate the accuracy model score

In [25]:
print('Accuracy of Logistic regression on training set: {:.2f}'.format(logit.score(X_train,y_train)))

Accuracy of Logistic regression on training set: 0.95


In [26]:
# Create a confusion matrix

In [27]:
cm = confusion_matrix(y_train,y_pred)

labels = sorted(y_train.species.unique())
pretty_cr = pd.DataFrame(confusion_matrix(y_train,y_pred),index=labels,columns=labels)

In [28]:
pretty_cr

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,36,4
virginica,0,1,32


In [29]:
# classification report

In [30]:
print(classification_report(y_train,y_pred,output_dict=False))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



3. Print and clearly label the following:

In [31]:
print('Accuracy of Logistic regression on training set: {:.2f}'.format(logit.score(X_train,y_train)))

Accuracy of Logistic regression on training set: 0.95


In [32]:
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, RECALL, or TRUE POSITIVE rate
TPR = TP/(TP+FN)
# Specificity or TRUE NEGATIVE rate
TNR = TN/(TN+FP) 
# PRECISION or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or FALSE POSITIVE rate
FPR = FP/(FP+TN)
# FALSE NEGATIVE rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall ACCURACY
ACC = (TP+TN)/(TP+FP+FN+TN)

- Accuracy

In [33]:
print('Accuracy of Logistic regression on training set: {:.2f}'.format(logit.score(X_train,y_train)))

Accuracy of Logistic regression on training set: 0.95


- True positive rate

In [34]:
TPR

array([1.        , 0.9       , 0.96969697])

- False positive rate

In [35]:
FPR

array([0.        , 0.01538462, 0.05555556])

- True negative rate

In [36]:
TNR

array([1.        , 0.98461538, 0.94444444])

- False negative rate

In [37]:
FNR

array([0.        , 0.1       , 0.03030303])

- Precision

In [38]:
PPV

array([1.        , 0.97297297, 0.88888889])

- Recall

In [39]:
TPR

array([1.        , 0.9       , 0.96969697])

- F1-score

In [40]:
print(classification_report(y_train,y_pred,output_dict=False))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



In [41]:
f1_score = 2*TP/(2*TP+FP+FN)

- Support

In [42]:
print(classification_report(y_train,y_pred,output_dict=False))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

In [43]:
# Viewed the options available for the solver: newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’)

5. Run through steps 2-4 using another solver (from question 5)

In [44]:
logit_fit = LogisticRegression(solver='saga')
logit_fit.fit(X_train,y_train)
pred_logit = logit_fit.predict(X_train)

6. Which performs better on your in-sample data?

In [45]:
print(pred_logit.score(X_train, ))
print()

AttributeError: 'numpy.ndarray' object has no attribute 'score'

## Decision Tree Exercises

In [None]:
# Setup environment 

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

- Acquire and Prep Data

In [None]:
# Acquire from pydata set
df = data('iris')
# Clean data, rename columns
df.columns = [col.lower().replace('.', '_') for col in df]

df.head()

In [None]:
# Training/Test split

X = df.drop(['species'],axis=1)
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .70, random_state = 123)

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# Create the Decision Tree Object

# For classification you can change the algorithm to gini or entropy (information gain).  Default is gini.

clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [None]:
# Fit the model to the training data

clf.fit(X_train, y_train)

In [None]:
# Predict - Estimate species

y_pred = clf.predict(X_train)

# Estimate the probability of a species

y_pred_proba = clf.predict_proba(X_train)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# Accuracy: number of correct predictions over the number of total instances that have been evaluated.

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

In [None]:
# Create a confusion matrix

cm = confusion_matrix(y_train, y_pred)
cm

In [None]:
import pandas as pd

labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, RECALL, or TRUE POSITIVE rate
TPR = TP/(TP+FN)
# Specificity or TRUE NEGATIVE rate
TNR = TN/(TN+FP) 
# PRECISION or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or FALSE POSITIVE rate
FPR = FP/(FP+TN)
# FALSE NEGATIVE rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall ACCURACY
ACC = (TP+TN)/(TP+FP+FN+TN)

- Accuracy

In [49]:
ACC

array([1.        , 0.95238095, 0.95238095])

- True positive rate

In [None]:
TPR

- False positive rate

In [None]:
FPR

- True negative rate

In [None]:
TNR

- False negative rate

In [None]:
FNR

- Precision

In [None]:
PPV

- Recall

In [None]:
TPR

- f1-score

In [None]:
f1_score = 2*TP/(2*TP+FP+FN)
f1_score

- Support

In [50]:
cm.sum()

105

4. Run through steps 2-4 using entropy as your measure of impurity.

In [None]:
tree_fit = DecisionTreeClassifier(criterion='entropy')
tree_fit.fit(X_train,y_train)
pred

5. Which performs better on your in-sample data?

## Random Forest Exercises

In [None]:
#setup environment

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pydataset import data


import matplotlib.pyplot as plt
import seaborn as sns

- Acquire and Prep Data

In [None]:
# Acquire from pydata set
df = data('iris')
# Clean data, rename columns
df.columns = [col.lower().replace('.', '_') for col in df]

df.head()

In [None]:
# Training/Test split

X = df[['sepal_length','sepal_width', 'petal_length', 'petal_width']]
y = df[['species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .70, random_state = 123)

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [None]:
# Train Model
# Create the Random Forest Object

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [None]:
# Fit the model to the training data

rf.fit(X_train, y_train)

- Print Feature Importances

In [None]:
print(rf.feature_importances_)

- Estimate whether or not a passenger would survive, using the training data

In [None]:
y_pred = rf.predict(X_train)

- Estimate the probability of a passenger surviving, using the training data

In [None]:
y_pred_proba = rf.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

- Compute the Accuracy

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

- Create a confusion matrix

In [None]:
cm = (confusion_matrix(y_train, y_pred))
cm

In [None]:
import pandas as pd

labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

- Create a classificaiton report

In [None]:
print(classification_report(y_train, y_pred))

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, RECALL, or TRUE POSITIVE rate
TPR = TP/(TP+FN)
# Specificity or TRUE NEGATIVE rate
TNR = TN/(TN+FP) 
# PRECISION or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or FALSE POSITIVE rate
FPR = FP/(FP+TN)
# FALSE NEGATIVE rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall ACCURACY
ACC = (TP+TN)/(TP+FP+FN+TN)

- Accuracy

In [None]:
ACC

- True positive rate

In [None]:
TPR

- False positive rate

In [None]:
FPR

- True negative rate

In [None]:
TNR

- False negative rate

In [None]:
FNR

- Precision

In [None]:
PPV

- Recall

In [None]:
TPR

- f1-score

In [None]:
f1_score = 2*TP/(2*TP+FP+FN)
f1_score

- Support

In [46]:
cm.sum()

105

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

## KNN

1. Fit the K-Nearest Neighbors classifier to your training sample and transform make predictions on the training sample

In [54]:
knn = KNeighborsClassifier()

knn.fit(X_train,y_train)

pred_knn = knn.predict(X_train)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [53]:
print(knn.score(X_train,y_train))

print(pd.DataFrame(confusion_matrix(y_train, pred_knn), index = labels,columns = labels))

print(classification_report(y_train, pred_knn))

0.9809523809523809
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           1         32
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.97      0.97        40
   virginica       0.97      0.97      0.97        33

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



3. Print and clearly label the following: Accuracy ,true positive rate, false positive rate, true negative rate, false negative rate, false negative rate,precision, recall,f1-score, and support.

In [56]:
confusion_matrix_knn = pd.DataFrame(confusion_matrix(y_train,pred_knn), index = labels, columns = labels)
confusion_matrix_knn

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,39,1
virginica,0,1,32


In [57]:
FP = confusion_matrix_knn.sum(axis=0) - np.diag(confusion_matrix_knn)  
FN = confusion_matrix_knn.sum(axis=1) - np.diag(confusion_matrix_knn)
TP = np.diag(confusion_matrix_knn)
TN = confusion_matrix_knn.values.sum() - (FP + FN + TP)

#True positive rate/recall
TPR = TP/(TP+FN)
# True negative rate
TNR = TN/(TN+FP) 
# Precision
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# False positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

# F1-score
f1_score=2*TP/(2*TP+FP+FN)

#support
support=len(y_train)

4. Run through steps 2-4 setting k to 10

In [61]:
knn_10 = KNeighborsClassifier(n_neighbors = 10)

knn_10.fit(X_train,y_train)

pred_10 = knn_10.predict(X_train)


print(knn_10.score(X_train, y_train))
print(pd.DataFrame(confusion_matrix(y_train, pred_10), index = labels,columns = labels))
print(classification_report(y_train, pred_10))

0.9714285714285714
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           2         31
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      0.97      0.96        40
   virginica       0.97      0.94      0.95        33

    accuracy                           0.97       105
   macro avg       0.97      0.97      0.97       105
weighted avg       0.97      0.97      0.97       105



5. Run through setps 2-4 setting k to 20

In [62]:
knn_20 = KNeighborsClassifier(n_neighbors = 20)

knn_20.fit(X_train, y_train)

pred_20 = knn_20.predict(X_train)

print(knn_20.score(X_train, y_train))
print(pd.DataFrame(confusion_matrix(y_train, pred_20), index = labels,columns = labels))
print(classification_report(y_train, pred_20))

0.9619047619047619
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           3         30
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.93      0.97      0.95        40
   virginica       0.97      0.91      0.94        33

    accuracy                           0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The KNN test with the higher value of K performs a little worse. I think it has to do with how far the k = 20 is reaching out too far and performing worse.  The model with k = 10 doesn't need to reach out as far since the plots are closer together so it is not necessary to reach out as far to its closer neighbors.