## Model Exercises

In [1]:
# Setup environment

In [37]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_iris_data
from prepare import prep_iris
from split_scale import split_my_data

## Logistic Regression Exercises

In [3]:
# Acquire data

In [4]:
# Load the raw iris table through the project's `acquire` helper.
# NOTE(review): the underlying data source is not visible in this notebook — confirm in acquire.py.
df = get_iris_data()

In [5]:
# Prep data

In [6]:
def prep_iris(iris_df):
    """Return a cleaned copy of the raw iris frame.

    Drops the ``species_id`` and ``measurement_id`` columns and renames
    ``species_name`` to ``species``. The species values are left untouched
    (no label encoding is applied).

    The input frame is NOT modified: a new DataFrame is returned, so the
    function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Raw iris data containing the ``species_id`` and ``measurement_id``
        columns.

    Returns
    -------
    pandas.DataFrame
        New frame without the two id columns and with ``species_name``
        renamed to ``species`` (when that column is present).

    Raises
    ------
    KeyError
        If ``species_id`` or ``measurement_id`` is missing
        (raised by ``DataFrame.drop``).

    Notes
    -----
    NOTE(review): this definition shadows the ``prep_iris`` imported from
    ``prepare`` in the imports cell — confirm which one is intended.
    """
    return (
        iris_df
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )

In [7]:
# Rebinds `df` to the prepared frame. Re-running this cell without first
# re-running the acquire cell raises KeyError, because the id columns that
# prep_iris drops are already gone from `df`.
df = prep_iris(df)

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [8]:
# Create a dataframe for my independent variables
# Create a dataframe for my target

In [9]:
# The four measurement columns are the model features.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_cols]

# Double brackets keep the target as a one-column DataFrame (not a Series);
# later cells access it as `y_train.species`.
y = df[['species']]

In [10]:
# Split my data

In [11]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

In [12]:
# Train Model

In [13]:
# Create the logistic regression object

In [14]:
# All hyperparameters are left at the library defaults; the repr printed by the
# fit cell below shows which values that means for the installed version
# (e.g. C=1.0, penalty='l2', solver='warn', multi_class='warn').
logit = LogisticRegression()

In [15]:
# Fit the model to the training data

In [16]:
# Train on the training split only. The call is the last expression in the
# cell, so its return value (the estimator's repr) is shown as the output.
logit.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Print the coefficients and intercept of the model

In [18]:
# Show the fitted parameters: one row of coefficients (and one intercept)
# per class, one coefficient column per feature.
fitted_params = (
    ('Coefficient: \n', logit.coef_),
    ('Intercept: \n', logit.intercept_),
)
for heading, values in fitted_params:
    print(heading, values)

Coefficient: 
 [[ 0.38421538  1.32718255 -2.11307588 -0.94269552]
 [ 0.43099717 -1.34596217  0.4506587  -1.07117492]
 [-1.517952   -1.52141607  2.26046444  2.12613123]]
Intercept: 
 [ 0.25726194  0.58107381 -0.87235291]


In [19]:
# Predict the species of each flower, using the training data

In [20]:
# Peek at the first few training labels; the index shows the original row
# labels carried over from `df` by the split.
y_train.head()

Unnamed: 0,species
114,virginica
136,virginica
53,versicolor
19,setosa
38,setosa


In [33]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = logit.predict(X_train)

In [22]:
# Estimate the probability of each species for every flower, using the training data

In [35]:
# Predicted class probabilities for the training rows.
# NOTE(review): y_pred_proba is not referenced by any later cell shown here.
y_pred_proba = logit.predict_proba(X_train)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [24]:
# Evaluate the model's accuracy score

In [25]:
# Mean accuracy of the fitted model on the training split, shown to two decimals.
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic regression on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic regression on training set: 0.95


In [26]:
# Create a confusion matrix

In [27]:
# Fix the class order once, so the matrix rows/columns and the DataFrame
# labels below are guaranteed to line up.
labels = sorted(y_train.species.unique())

# Raw confusion matrix: rows are actual species, columns are predicted species.
cm = confusion_matrix(y_train, y_pred, labels=labels)

# Labelled view of the same matrix (reuses `cm` instead of recomputing it).
pretty_cr = pd.DataFrame(cm, index=labels, columns=labels)

In [28]:
# Display the labelled confusion matrix (rows: actual species, columns: predicted species).
pretty_cr

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,36,4
virginica,0,1,32


In [29]:
# Create a classification report

In [30]:
# Text report of per-class precision, recall, f1-score and support on the training split.
train_report = classification_report(y_train, y_pred, output_dict=False)
print(train_report)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



3. Print and clearly label the following:

In [31]:
# In-sample accuracy, repeated here as the first of the labelled metrics.
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic regression on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic regression on training set: 0.95


In [45]:
# One-vs-rest counts for every class, read off the confusion matrix `cm`
# (row i = actual class i, column j = predicted class j).
TP = np.diag(cm)                # predicted as the class and actually the class
FP = cm.sum(axis=0) - TP        # predicted as the class, actually another
FN = cm.sum(axis=1) - TP        # actually the class, predicted as another
TN = cm.sum() - (FP + FN + TP)  # every sample that does not involve the class

# Denominators shared by several of the rates below
actual_pos = TP + FN
actual_neg = TN + FP
predicted_pos = TP + FP
predicted_neg = TN + FN

TPR = TP / actual_pos      # sensitivity, hit rate, RECALL, or TRUE POSITIVE rate
TNR = TN / actual_neg      # specificity or TRUE NEGATIVE rate
PPV = TP / predicted_pos   # PRECISION or positive predictive value
NPV = TN / predicted_neg   # negative predictive value
FPR = FP / actual_neg      # fall out or FALSE POSITIVE rate
FNR = FN / actual_pos      # FALSE NEGATIVE rate
FDR = FP / predicted_pos   # false discovery rate

# Per-class (one-vs-rest) ACCURACY: one value per class, not a single overall
# score; the denominator is the total number of samples.
ACC = (TP + TN) / (actual_pos + actual_neg)

- Accuracy

In [41]:
# Overall in-sample accuracy for the "Accuracy" item of the labelled list.
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic regression on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic regression on training set: 0.95


- True positive rate

In [47]:
# Per-class true positive rate, TP / (TP + FN); entries follow the class order of the confusion matrix.
TPR

array([1.        , 0.9       , 0.96969697])

- False positive rate

In [40]:
# Per-class false positive rate, FP / (FP + TN); entries follow the class order of the confusion matrix.
FPR

array([0.        , 0.01538462, 0.05555556])

- True negative rate

In [44]:
# Per-class true negative rate, TN / (TN + FP); entries follow the class order of the confusion matrix.
TNR

array([1.        , 0.98461538, 0.94444444])

- False negative rate

In [46]:
# Per-class false negative rate, FN / (TP + FN); entries follow the class order of the confusion matrix.
FNR

array([0.        , 0.1       , 0.03030303])

- Precision

In [48]:
# Per-class precision (positive predictive value), TP / (TP + FP).
PPV

array([1.        , 0.97297297, 0.88888889])

- Recall

In [49]:
# Recall is the same quantity as the true positive rate, so TPR is shown again here.
TPR

array([1.        , 0.9       , 0.96969697])

- F1-score

In [None]:
# Show only the per-class F1-score, labelled by species, instead of printing
# the whole classification report again.
report = classification_report(y_train, y_pred, output_dict=True)
class_names = sorted(y_train.species.unique())
pd.Series(
    {name: report[name]['f1-score'] for name in class_names},
    name='f1-score',
)

- Support

In [None]:
# Show only the per-class support (number of actual training rows of each
# species), labelled by species, instead of printing the whole report again.
report = classification_report(y_train, y_pred, output_dict=True)
class_names = sorted(y_train.species.unique())
pd.Series(
    {name: report[name]['support'] for name in class_names},
    name='support',
)

4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

5. Run through steps 1-3 using another solver (from question 4)

6. Which performs better on your in-sample data?

## Decision Tree Exercises

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps 1-3 using entropy as your measure of impurity.

5. Which performs better on your in-sample data?