# Data Modeling
Do your work for these exercises in either a notebook or a python script named model.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
%matplotlib inline

from acquire import get_titanic_data
from prepare import prep_titanic_data

# Acquire the raw Titanic data and pass it straight through the project's
# prep step, then eyeball five random rows.
df = prep_titanic_data(get_titanic_data())
df.sample(5)

## Logistic Regression
1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [None]:
# Drop every row that has a missing value in any column. This was written
# with `age` in mind, but dropna() with these settings is not limited to it.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Numeric features used by the model; `survived` is the target.
feature_cols = ['pclass', 'age', 'fare', 'sibsp', 'parch']
X = df[feature_cols]
y = df[['survived']]  # kept as a one-column frame: later cells read y_train.survived

# 70/30 split with a fixed seed so the results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

X_train.head()

In [None]:
# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing (on the training rows only, so the test rows play no part
#    in choosing the min/max)
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
# X_train / X_test are derived from X, and pandas may flag column writes to
# such frames with SettingWithCopyWarning. Take explicit copies first so the
# assignments below clearly target independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])

### Train Model
#### Create the logistic regression object

In [None]:
# Logistic regression using the saga solver. class_weight={1: 2} gives
# class 1 twice the weight of the other class when fitting.
logit = LogisticRegression(
    solver='saga',
    C=1,
    class_weight={1: 2},
    random_state=123,
)

#### Fit the model to the training data

In [None]:
# Fit on the five feature columns explicitly: a later cell adds a
# `prediction` column to X_train, and re-running this cell afterwards would
# otherwise train on the model's own predictions. The target is passed as a
# 1-D Series because scikit-learn warns about column-vector targets.
logit.fit(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)

#### Print the coefficients and intercept of the model

In [None]:
# One coefficient per feature (in column order) plus the fitted intercept.
coefficients, intercept = logit.coef_, logit.intercept_

print('Coefficient: \n', coefficients)
print()
print('Intercept: \n', intercept)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

#### Estimate whether or not a passenger would survive, using the training data

In [None]:
# Select the feature columns explicitly so this cell still works when it is
# re-run after the `prediction` column has been added to X_train.
y_pred = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])

#### Estimate the probability of a passenger surviving, using the training data

In [None]:
# One row per passenger, one column per class. The feature columns are
# selected explicitly so a `prediction` column added to X_train by a later
# cell cannot break a re-run.
y_pred_proba = logit.predict_proba(X_train[['pclass','age','fare','sibsp','parch']])

In [None]:
# Attach the predicted class to the training frame. assign() builds a new
# frame rather than writing into X_train in place, which avoids pandas'
# SettingWithCopyWarning and makes the cell safe to re-run.
X_train = X_train.assign(
    prediction=logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
)

In [None]:
# Accuracy by hand: the mean of a boolean "prediction matches truth" mask is
# the share of correct predictions.
(y_train.survived == X_train.prediction).mean()

In [None]:
# Same accuracy via the estimator's own score() method.
train_features = X_train[['pclass','age','fare','sibsp','parch']]
logit.score(train_features, y_train.survived)

### Evaluate Model
#### Compute the accuracy

In [None]:
# Score on the features only: the helper `prediction` column is dropped first.
train_accuracy = logit.score(X_train.drop(columns='prediction'), y_train)
print(f'Accuracy of Logistic Regression classifier on training set: {train_accuracy:.2f}')

#### Create a confusion matrix

In [None]:
# Rows are the actual classes, columns the predicted classes.
train_cm_array = confusion_matrix(y_train, y_pred)
print(train_cm_array)

In [None]:
# Labelled view of the training confusion matrix. It gets its own name
# instead of `df`, so the prepared Titanic frame is not overwritten by a
# 2x2 table and the earlier cells can still be re-run.
train_confusion = pd.DataFrame(
    confusion_matrix(y_train.survived, X_train.prediction),
    columns=['Pred -', 'Pred +'],
    index=['Actual -', 'Actual +'],
)

train_confusion

#### Compute Precision, Recall, F1-score, and Support

In [None]:
# Per-class precision, recall, f1-score and support for the training data.
train_report = classification_report(y_train, y_pred)
print(train_report)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### Test Model
#### Compute the accuracy of the model when run on the test data

In [None]:
# Out-of-sample accuracy on the held-out 30%.
test_accuracy = logit.score(X_test, y_test)
print(f'Accuracy of Logistic Regression classifier on test set: {test_accuracy:.2f}')

In [None]:
# Confusion matrix of the TRAINING predictions as a labelled frame.
# NOTE(review): this sits under "Test Model" but is built from the training
# split. Confirm whether the test split was intended.
# The name `df` is kept because the next cell reads the counts from it.
cm_values = confusion_matrix(y_train.survived, X_train.prediction)
df = pd.DataFrame(
    cm_values,
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)

df

In [None]:
# Pull the four cells of the confusion frame out by label. The frame's index
# is ('Actual -', 'Actual +'), so integer keys such as [0] only worked through
# pandas' deprecated positional fallback on a label-indexed Series.
TN = df.loc['Actual -', 'Pred -']
FP = df.loc['Actual -', 'Pred +']
FN = df.loc['Actual +', 'Pred -']
TP = df.loc['Actual +', 'Pred +']
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

In [None]:
# Accuracy: fraction of all predictions that were correct
#          = (true positive + true negative) / total
correct_predictions = TP + TN
accuracy = correct_predictions / total
print('Accuracy = ', accuracy)

In [None]:
# Recall (a.k.a. sensitivity, true positive rate): share of the actual
# positives that the model found
#      = true positive / (true positive + false negative)
actual_positives = TP + FN
recall = TP / actual_positives
print('Recall = ', recall)

In [None]:
# False positive rate (fall-out)
#      = false positive / (false positive + true negative)
# This is the value the cell used to print under the label "Specificity".
false_positive_rate = FP / (FP + TN)
print('False Positive Rate = ', false_positive_rate)

# Specificity is the complement of the false positive rate, i.e. the
# true negative rate
#      = true negative / (true negative + false positive)
specificity = TN / (TN + FP)
print('Specificity = ', specificity)

In [None]:
# True negative rate: share of the actual negatives predicted as negative
#      = true negative / (true negative + false positive)
actual_negatives = TN + FP
trueneg = TN / actual_negatives
print('True Negative Rate = ', trueneg)

In [None]:
# False negative rate: share of the actual positives the model missed
#      = false negative / (false negative + true positive)
missed_or_found = FN + TP
falseneg = FN / missed_or_found
print('False Negative Rate = ', falseneg)

In [None]:
# Precision: share of the positive predictions that were right
#      = true positive / (true positive + false positive)
predicted_positives = TP + FP
precision = TP / predicted_positives
print('Precision = ', precision)

In [None]:
# F1 is the harmonic mean of precision and recall, not their arithmetic mean:
#      f1 = 2 * precision * recall / (precision + recall)
f1 = 2 * precision * recall / (precision + recall)
print('f1-score is ', f1)

In [None]:
# Support: how many passengers actually belong to each class.
lived, died = TP + FN, TN + FP
print(died, 'people died and', lived, 'people lived.')

4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

class sklearn.linear_model.LogisticRegression(
    penalty=’l2’, 
    dual=False, 
    tol=0.0001, 
    C=1.0, 
    fit_intercept=True, 
    intercept_scaling=1, 
    class_weight=None, 
    random_state=None, 
    solver=’warn’, 
    max_iter=100, 
    multi_class=’warn’, 
    verbose=0, 
    warm_start=False, 
    n_jobs=None)
    
solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’.
Algorithm to use in the optimization problem.

- For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
- For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.

‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.
Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing.

We just want the default!

5. Run through steps 2-4 using another solver (from question 4)

In [None]:
# for saga solver:
X_train = []
df = get_titanic_data()
df = prep_titanic_data(df)
# Handle missing values in the `age` column.
df.dropna(inplace=True)
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])


# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
X_train['prediction'] = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
# (y_train.survived == X_train.prediction).sum() / y_train.shape[0]
# logit.score(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns='prediction'), y_train)))
print(confusion_matrix(y_train, y_pred))
df = pd.DataFrame(confusion_matrix(y_train.survived, X_train.prediction),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
print(classification_report(y_train, y_pred))
TN = df['Pred -'][0] # 190
FP = df['Pred +'][0] #103
FN = df['Pred -'][1] # 50
TP = df['Pred +'][1] # 156
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

# Accuracy = # correct / total 
#          = (true positive + true negative) / total
accuracy = (TP + TN) / total
print('Accuracy = ', accuracy)

# Recall = Sensitivity
#      = true positive rate 
#      = true positive / (true positive + false negative) 
recall = TP / (TP + FN)
print('Recall = ', recall)

# Specificity = false positive rate
#      = false positive / (false positive + true negative)
specificity = FP / (FP + TN)
print('Specificity = ', specificity)

# true negative rate = true negative / (true negative + false positive)
trueneg = TN / (TN + FP)
print('True Negative Rate = ', trueneg)

# false negative rate = false negative / (false negaitve + true positive)
falseneg = FN / (FN + TP)
print('False Negative Rate = ', falseneg)

# precision = true positive / (true positive + false positive)
precision = TP / (TP + FP)
print('Precision = ', precision)

f1 = (precision + recall) / 2
print('f1-score is ', f1)

died = TN + FP
lived = TP + FN
print(died, 'people died and', lived, 'people lived.')

In [None]:
# liblinear solver
X_train = []
df = get_titanic_data()
df = prep_titanic_data(df)
# Handle missing values in the `age` column.
df.dropna(inplace=True)
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])



# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='liblinear')
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
X_train['prediction'] = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
# (y_train.survived == X_train.prediction).sum() / y_train.shape[0]
# logit.score(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns='prediction'), y_train)))
print(confusion_matrix(y_train, y_pred))
df = pd.DataFrame(confusion_matrix(y_train.survived, X_train.prediction),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
print(classification_report(y_train, y_pred))
TN = df['Pred -'][0] # 190
FP = df['Pred +'][0] #103
FN = df['Pred -'][1] # 50
TP = df['Pred +'][1] # 156
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

# Accuracy = # correct / total 
#          = (true positive + true negative) / total
accuracy = (TP + TN) / total
print('Accuracy = ', accuracy)

# Recall = Sensitivity
#      = true positive rate 
#      = true positive / (true positive + false negative) 
recall = TP / (TP + FN)
print('Recall = ', recall)

# Specificity = false positive rate
#      = false positive / (false positive + true negative)
specificity = FP / (FP + TN)
print('Specificity = ', specificity)

# true negative rate = true negative / (true negative + false positive)
trueneg = TN / (TN + FP)
print('True Negative Rate = ', trueneg)

# false negative rate = false negative / (false negaitve + true positive)
falseneg = FN / (FN + TP)
print('False Negative Rate = ', falseneg)

# precision = true positive / (true positive + false positive)
precision = TP / (TP + FP)
print('Precision = ', precision)

f1 = (precision + recall) / 2
print('f1-score is ', f1)

died = TN + FP
lived = TP + FN
print(died, 'people died and', lived, 'people lived.')

In [None]:
# newton-cg solver
X_train = []
df = get_titanic_data()
df = prep_titanic_data(df)
# Handle missing values in the `age` column.
df.dropna(inplace=True)
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])



# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='newton-cg')
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
X_train['prediction'] = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
# (y_train.survived == X_train.prediction).sum() / y_train.shape[0]
# logit.score(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns='prediction'), y_train)))
print(confusion_matrix(y_train, y_pred))
df = pd.DataFrame(confusion_matrix(y_train.survived, X_train.prediction),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
print(classification_report(y_train, y_pred))
TN = df['Pred -'][0] # 190
FP = df['Pred +'][0] #103
FN = df['Pred -'][1] # 50
TP = df['Pred +'][1] # 156
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

# Accuracy = # correct / total 
#          = (true positive + true negative) / total
accuracy = (TP + TN) / total
print('Accuracy = ', accuracy)

# Recall = Sensitivity
#      = true positive rate 
#      = true positive / (true positive + false negative) 
recall = TP / (TP + FN)
print('Recall = ', recall)

# Specificity = false positive rate
#      = false positive / (false positive + true negative)
specificity = FP / (FP + TN)
print('Specificity = ', specificity)

# true negative rate = true negative / (true negative + false positive)
trueneg = TN / (TN + FP)
print('True Negative Rate = ', trueneg)

# false negative rate = false negative / (false negaitve + true positive)
falseneg = FN / (FN + TP)
print('False Negative Rate = ', falseneg)

# precision = true positive / (true positive + false positive)
precision = TP / (TP + FP)
print('Precision = ', precision)

f1 = (precision + recall) / 2
print('f1-score is ', f1)

died = TN + FP
lived = TP + FN
print(died, 'people died and', lived, 'people lived.')

In [None]:
# sag solver
X_train = []
df = get_titanic_data()
df = prep_titanic_data(df)
# Handle missing values in the `age` column.
df.dropna(inplace=True)
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])



# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='sag')
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
X_train['prediction'] = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
# (y_train.survived == X_train.prediction).sum() / y_train.shape[0]
# logit.score(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns='prediction'), y_train)))
print(confusion_matrix(y_train, y_pred))
df = pd.DataFrame(confusion_matrix(y_train.survived, X_train.prediction),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
print(classification_report(y_train, y_pred))
TN = df['Pred -'][0] # 190
FP = df['Pred +'][0] #103
FN = df['Pred -'][1] # 50
TP = df['Pred +'][1] # 156
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

# Accuracy = # correct / total 
#          = (true positive + true negative) / total
accuracy = (TP + TN) / total
print('Accuracy = ', accuracy)

# Recall = Sensitivity
#      = true positive rate 
#      = true positive / (true positive + false negative) 
recall = TP / (TP + FN)
print('Recall = ', recall)

# Specificity = false positive rate
#      = false positive / (false positive + true negative)
specificity = FP / (FP + TN)
print('Specificity = ', specificity)

# true negative rate = true negative / (true negative + false positive)
trueneg = TN / (TN + FP)
print('True Negative Rate = ', trueneg)

# false negative rate = false negative / (false negaitve + true positive)
falseneg = FN / (FN + TP)
print('False Negative Rate = ', falseneg)

# precision = true positive / (true positive + false positive)
precision = TP / (TP + FP)
print('Precision = ', precision)

f1 = (precision + recall) / 2
print('f1-score is ', f1)

died = TN + FP
lived = TP + FN
print(died, 'people died and', lived, 'people lived.')

In [None]:
# lbfgs solver
X_train = []
df = get_titanic_data()
df = prep_titanic_data(df)
# Handle missing values in the `age` column.
df.dropna(inplace=True)
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(X_train[['age', 'fare']])

# 3. use the thing
X_train[['age', 'fare']] = scaler.transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])



# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='lbfgs')
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
X_train['prediction'] = logit.predict(X_train[['pclass','age','fare','sibsp','parch']])
# (y_train.survived == X_train.prediction).sum() / y_train.shape[0]
# logit.score(X_train[['pclass','age','fare','sibsp','parch']], y_train.survived)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns='prediction'), y_train)))
print(confusion_matrix(y_train, y_pred))
df = pd.DataFrame(confusion_matrix(y_train.survived, X_train.prediction),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
print(classification_report(y_train, y_pred))
TN = df['Pred -'][0] # 190
FP = df['Pred +'][0] #103
FN = df['Pred -'][1] # 50
TP = df['Pred +'][1] # 156
total = TN + FP + FN + TP

print('True Negative = ', TN)
print('False Positive = ', FP)
print('False Negative = ', FN)
print('True Positive = ', TP)
print('Total = ', total)

# Accuracy = # correct / total 
#          = (true positive + true negative) / total
accuracy = (TP + TN) / total
print('Accuracy = ', accuracy)

# Recall = Sensitivity
#      = true positive rate 
#      = true positive / (true positive + false negative) 
recall = TP / (TP + FN)
print('Recall = ', recall)

# Specificity = false positive rate
#      = false positive / (false positive + true negative)
specificity = FP / (FP + TN)
print('Specificity = ', specificity)

# true negative rate = true negative / (true negative + false positive)
trueneg = TN / (TN + FP)
print('True Negative Rate = ', trueneg)

# false negative rate = false negative / (false negaitve + true positive)
falseneg = FN / (FN + TP)
print('False Negative Rate = ', falseneg)

# precision = true positive / (true positive + false positive)
precision = TP / (TP + FP)
print('Precision = ', precision)

f1 = (precision + recall) / 2
print('f1-score is ', f1)

died = TN + FP
lived = TP + FN
print(died, 'people died and', lived, 'people lived.')

6. Which performs better on your in-sample data?

I got the same results for all of the solvers.

7. Save the best model in logit_fit

In [None]:
# Keep a reference to the chosen model. `logit` is whatever estimator was
# bound most recently, i.e. the model from the last solver cell that ran.
logit_fit = logit
# Bare last expression so the notebook displays the saved estimator.
logit_fit

## Decision Tree
1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline

from acquire import get_titanic_data
from prepare import prep_titanic_data

# Load iris from pydataset and normalise the column names
# (lower case, dots replaced by underscores).
df = data('iris').rename(columns=lambda name: name.lower().replace('.', '_'))

# Every column except the label is a feature.
X = df.drop(columns=['species'])
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.30, random_state=123)

# For classification the split criterion can be gini or entropy
# (information gain); gini is the default.
clf = DecisionTreeClassifier(max_depth=3, criterion='gini', random_state=123)

clf.fit(X_train, y_train)

print("for features...")
print(X_train.columns)
print(clf.feature_importances_)
print()

# Predicted class for each training row.
y_pred = clf.predict(X_train)

# Predicted class probabilities for each training row.
y_pred_proba = clf.predict_proba(X_train)
print(y_pred_proba)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# In-sample accuracy, followed by the raw confusion matrix
# (rows: actual species, columns: predicted species).
train_score = clf.score(X_train, y_train)
print(f'Accuracy of Decision Tree classifier on training set: {train_score:.2f}')
cm = confusion_matrix(y_train, y_pred)
cm

In [None]:
# Label the confusion matrix with the species names in sorted order.
# (The two bare expressions that used to open this cell were never displayed
# and had no effect, so they are gone.)
labels = sorted(y_train.species.unique())

pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    columns=labels,
    index=labels,
)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Per-class metrics on the training data, then accuracy on the held-out data.
print(classification_report(y_train, y_pred))
test_score = clf.score(X_test, y_test)
print(f'Accuracy of Decision Tree classifier on test set: {test_score:.2f}')

In [None]:
## need to install graphviz to anaconda
## example: 

from sklearn.datasets import load_iris

import graphviz

from graphviz import Graph

# The demo tree gets its own name. Assigning it to `clf` would overwrite the
# depth-limited tree that was fitted and evaluated on the training split.
iris = load_iris()
iris_demo_tree = tree.DecisionTreeClassifier()
iris_demo_tree = iris_demo_tree.fit(iris.data, iris.target)

# Export the fitted tree as DOT source and render it to a file;
# view=True also opens the result in the default viewer.
dot_data = tree.export_graphviz(iris_demo_tree, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('iris_decision_tree2', view=True)

4. Run through steps 2-4 using entropy as your measure of impurity.

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_iris_data
from prepare import prep_iris_data

# Same workflow as the gini tree, with entropy as the impurity measure.
df = data('iris')

df.columns = [col.lower().replace('.', '_') for col in df]

X = df.drop(columns=['species'])
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# For classification the split criterion can be gini or entropy
# (information gain); gini is the default.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

# Each result is reported once. The original printed the classification
# report twice and built the confusion matrix three times.
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print(classification_report(y_train, y_pred))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

# Confusion matrix labelled with the species names. `labels` is passed to
# confusion_matrix as well so rows and columns are guaranteed to line up.
labels = sorted(y_train.species.unique())
cm = pd.DataFrame(confusion_matrix(y_train.species, y_pred, labels=labels),
                  index=labels, columns=labels)
cm

In [None]:
## need to install graphviz to anaconda
## example: 

from sklearn.datasets import load_iris

import graphviz

from graphviz import Graph

# The demo tree gets its own name. Rebinding `clf` here would replace the
# evaluated entropy tree, and the later `tree_fit = clf` would then save an
# unconstrained tree fitted on the full load_iris data instead.
iris = load_iris()
iris_demo_tree = tree.DecisionTreeClassifier()
iris_demo_tree = iris_demo_tree.fit(iris.data, iris.target)

# Export the fitted tree as DOT source and render it to a file;
# view=True also opens the result in the default viewer.
dot_data = tree.export_graphviz(iris_demo_tree, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('iris_decision_tree2', view=True)

5. Which performs better on your in-sample data?

They are the same.

6. Save the best model in tree_fit

In [None]:
# Keep a reference to the chosen tree. This saves whatever `clf` currently
# refers to, so make sure no cell has rebound `clf` since the model you mean
# to keep was fitted and evaluated.
tree_fit = clf
# Bare last expression so the notebook displays the saved estimator.
tree_fit

## KNN
1. Fit the K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_iris_data
from prepare import prep_iris_data

# Acquire the iris data, run the project's prep step, and list the columns.
raw_iris = get_iris_data()
df = prep_iris_data(raw_iris)
df.columns

Index(['species', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species_encode'],
      dtype='object')

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from acquire import get_iris_data
from prepare import prep_iris_data

# This exercise models Titanic survival: the feature and target columns
# selected below exist in the Titanic data, not in the iris frame this cell
# used to load (that raised a KeyError on the column selection).
from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())

# Drop rows with a missing value in any column (written with `age` in mind).
df = df.dropna()

X = df[['pclass','age','fare','sibsp','parch']]
# 1-D target, matching the random forest section; scikit-learn warns when it
# is handed a column vector.
y = df.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# weights can be 'uniform' (every neighbour votes equally) or 'distance'
# (closer neighbours count more).
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
# n_neighbors=5: each prediction is a vote among the five nearest training rows.

knn.fit(X_train, y_train)

y_pred = knn.predict(X_train)

y_pred_proba = knn.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
# In-sample accuracy, confusion matrix (rows: actual, columns: predicted)
# and per-class metrics.
knn_train_score = knn.score(X_train, y_train)
print(f'Accuracy of KNN classifier on training set: {knn_train_score:.2f}')
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Training metrics again, followed by accuracy on the held-out data.
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)

print(f'Accuracy of KNN classifier on training set: {knn_train_score:.2f}')
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print(f'Accuracy of KNN classifier on test set: {knn_test_score:.2f}')
# A test-set confusion matrix or report would need predictions made on X_test;
# y_pred holds the training predictions.

4. Run through steps 1-3 setting k to 10

In [None]:
# k = 10: each prediction is an unweighted vote among the ten nearest
# training rows.
knn = KNeighborsClassifier(weights='uniform', n_neighbors=10).fit(X_train, y_train)

y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

knn_train_score = knn.score(X_train, y_train)
print(f'Accuracy of KNN classifier on training set: {knn_train_score:.2f}')
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

knn_test_score = knn.score(X_test, y_test)
print(f'Accuracy of KNN classifier on test set: {knn_test_score:.2f}')

5. Run through steps 1-3 setting k to 20

In [None]:
# k = 20: each prediction is an unweighted vote among the twenty nearest
# training rows.
knn = KNeighborsClassifier(weights='uniform', n_neighbors=20).fit(X_train, y_train)

y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

knn_train_score = knn.score(X_train, y_train)
print(f'Accuracy of KNN classifier on training set: {knn_train_score:.2f}')
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

knn_test_score = knn.score(X_test, y_test)
print(f'Accuracy of KNN classifier on test set: {knn_test_score:.2f}')

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The K-Nearest Neighbors model with k = 5 was the best fit on my in-sample data with 76% accuracy, but the test data only yielded 67% accuracy. The rest of the metrics look comparable. I guess I'll save k = 5.

7. Save the best model in knn_fit

In [None]:
# Save the best model (k = 5) as a FITTED estimator. Constructing the
# classifier alone stores an untrained object that cannot predict. fit()
# returns the estimator itself, so the call can be chained.
knn_fit = KNeighborsClassifier(n_neighbors=5, weights='uniform').fit(X_train, y_train)

## Random Forest
1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())

# Handle missing age values
df.dropna(inplace=True)
print('number of nulls = ')
print(df.isnull().sum())
print()

X = df[['pclass','age','fare','sibsp','parch']]
y = df.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# setting the random_state accordingly and 
# setting min_samples_leaf = 1 and max_depth = 20.
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)
# min_samples_leaf is set to only 3 because dataset is small

rf.fit(X_train, y_train)

print('this shows gini-index, shows you the importance of each feature in order')
print('shows that fare is biggest indicator of survival')
print("for features ['pclass','age','fare','sibsp','parch']:")
print(rf.feature_importances_)


y_pred = rf.predict(X_train)

y_pred_proba = rf.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
# In-sample accuracy of the fitted forest.
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf.score(X_train, y_train)))
print()

# Confusion matrix layout: rows are the actual labels (y_train),
# columns are the predicted labels (y_pred).
#
#                        predicted died   predicted survived
#   actually died             TN                 FP
#   actually survived         FN                 TP
#
# Worked example using the counts from the lesson (NOT this model's output):
#   TN = 248, FP = 45, FN = 79, TP = 127
#   accuracy                   = (248 + 127) / (248 + 45 + 79 + 127)
#   recall (sensitivity, TPR)  = 127 / (127 + 79)
#   specificity (TNR)          = 248 / (248 + 45)
#   precision, survived        = 127 / (127 + 45)
#   precision, not survived    = 248 / (248 + 79)
#   false negative rate        = 79 / (79 + 127)
print(confusion_matrix(y_train, y_pred))

print()
print(classification_report(y_train, y_pred))

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Step 3: print every requested in-sample metric with an explicit label.
print('Accuracy of random forest classifier on TRAIN datat: {:.2f}'
     .format(rf.score(X_train, y_train)))
print()

# Rows of the confusion matrix are actual labels and columns are predictions,
# so flattening the 2x2 binary matrix row by row yields TN, FP, FN, TP
# (class 1 = survived is treated as the positive class).
train_confusion = confusion_matrix(y_train, y_pred)
tn, fp, fn, tp = train_confusion.ravel()
print(train_confusion)
print()

print('True positive rate (sensitivity / recall): {:.2f}'.format(tp / (tp + fn)))
print('False positive rate:                       {:.2f}'.format(fp / (fp + tn)))
print('True negative rate (specificity):          {:.2f}'.format(tn / (tn + fp)))
print('False negative rate:                       {:.2f}'.format(fn / (fn + tp)))
print()

# Precision, recall, f1-score and support for each class.
print(classification_report(y_train, y_pred))

# Out-of-sample accuracy for comparison with the training figure.
print('Accuracy of output when model is run on TEST data:')
print(rf.score(X_test, y_test))

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [None]:
# Step 4 forest: min_samples_leaf raised to 5 and max_depth lowered to 3,
# with a fixed random_state so the fitted trees are repeatable.
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

rf.fit(X_train, y_train)

# Pair each importance with its column name and rank them, so the reported
# "most important feature" comes from the fitted model instead of being a
# hard-coded statement.
ranked_importances = sorted(zip(X_train.columns, rf.feature_importances_),
                            key=lambda pair: pair[1], reverse=True)
print('feature importances from the fitted forest, highest first:')
for feature_name, importance in ranked_importances:
    print('{:>8}: {:.4f}'.format(feature_name, importance))
print('most important feature: {}'.format(ranked_importances[0][0]))
print()

# In-sample class predictions and per-class probabilities.
y_pred = rf.predict(X_train)

y_pred_proba = rf.predict_proba(X_train)

# Confusion matrix layout: rows are the actual labels (y_train),
# columns are the predicted labels (y_pred).
#
#                        predicted died   predicted survived
#   actually died             TN                 FP
#   actually survived         FN                 TP
#
# Worked example using the counts from the lesson (NOT this model's output):
#   TN = 248, FP = 45, FN = 79, TP = 127
#   accuracy                   = (248 + 127) / (248 + 45 + 79 + 127)
#   recall (sensitivity, TPR)  = 127 / (127 + 79)
#   specificity (TNR)          = 248 / (248 + 45)
#   precision, survived        = 127 / (127 + 45)
#   precision, not survived    = 248 / (248 + 79)
#   false negative rate        = 79 / (79 + 127)

print('Accuracy of random forest classifier on TRAIN datat: {:.2f}'
     .format(rf.score(X_train, y_train)))
print()
print(confusion_matrix(y_train, y_pred))
print()
print(classification_report(y_train, y_pred))

# Out-of-sample accuracy for comparison with the training figure.
print('Accuracy of output when model is run on TEST data:')
print(rf.score(X_test, y_test))

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The first random forest classifier (min_samples_leaf = 1, max_depth = 20) gave MUCH better in-sample results with an accuracy of 98%, but its accuracy on the test data was only 71%, suggesting the model was overfit to the training data. So I'll go with the second classifier, with min_samples_leaf increased to 5 and max_depth decreased to 3.

6. Save the best model in forest_fit

In [None]:
# Best random forest configuration chosen in this section (min_samples_leaf = 5,
# max_depth = 3). fit() returns the estimator itself, so forest_fit holds a
# trained model that can predict immediately rather than an unfitted estimator.
forest_fit = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123).fit(X_train, y_train)

K-Nearest Neighbor:

Accuracy of KNN classifier on training set: 0.76
[[239  54]
 [ 65 141]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.80       293
           1       0.72      0.68      0.70       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.75      0.75       499
weighted avg       0.76      0.76      0.76       499

Accuracy of KNN classifier on test set: 0.67

Random Forest Classifier:

for features ['pclass','age','fare','sibsp','parch']:
[0.31756957 0.13479889 0.39019831 0.07086815 0.08656508]

Accuracy of random forest classifier on TRAIN datat: 0.75

[[247  46]
 [ 79 127]]

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       293
           1       0.73      0.62      0.67       206

   micro avg       0.75      0.75      0.75       499
   macro avg       0.75      0.73      0.73       499
weighted avg       0.75      0.75      0.75       499

Accuracy of output when model is run on TEST data:
0.7441860465116279

Going with K-Nearest Neighbor model setting k = 5.

## Test
Once you have determined which algorithm (with metaparameters) performs the best, try reducing the number of features to the top 4 features in terms of information gained for each feature individually. That is, how close do we get to predicting accurately the survival with each feature?

1. Compute the information gained.

2. Create a new dataframe with top 4 features (train_df_reduced).

3. Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results. Compare your evaluation metrics with those from the original model (with all the features). Select the best model.

4. Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.