# Preprocessing Notebook

## In this notebook we will use FeatureTools to generate additional features that help us capture relationships within our main dataframe.  We will then fit our baseline model, a Logistic Regression, here.

### Let's get started by once again importing the tools we will need for these tasks.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split


### Next, let's read in our main dataframe and take a quick look.

In [None]:
# CSV file holding the main dataframe.
# NOTE(review): this is an absolute path on a single machine -- point it at
# your own copy of df-mod.csv before running the notebook elsewhere.
DATA_PATH = '/Users/ryanm/Desktop/df-mod.csv'

# Load the data, report its (rows, columns) shape, and preview the first five rows.
df_mod = pd.read_csv(DATA_PATH)
print(df_mod.shape)
df_mod.head(5)

In [None]:
# TODO: FeatureTools feature-generation work goes here.

In [None]:
# Separate the `reordered` target column from the remaining feature columns.
y = df_mod['reordered']
X = df_mod.drop(columns='reordered')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training split and used to predict labels for the test split.
lr = LogisticRegression().fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

# Compare the test-set predictions against the true labels.
lr_cls_report = classification_report(y_true=y_test, y_pred=lr_y_pred)
lr_conf_matrix = confusion_matrix(y_true=y_test, y_pred=lr_y_pred)
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_y_pred)

# Report the results.
print("Metrics for Logistic Regression Model")
print("Accuracy: ", lr_acc)
print("Confusion Matrix: ", lr_conf_matrix)
print(lr_cls_report)

In [None]:
# The same pair of tick labels is used on both axes of the heatmap.
class_labels = ['Negative (0)', 'Positive (1)']

# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    lr_conf_matrix,
    annot=True,
    fmt='d',  # cell annotations are formatted as integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix Outcomes')
plt.show()

In [None]:
# Take the second column of predict_proba as the score for each test row,
# then derive the ROC curve points and the area under that curve.
probs = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probs)
lr_roc_auc = auc(fpr, tpr)

# Plot the model's ROC curve together with a dashed diagonal from (0, 0) to (1, 1).
fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC Curve (area = %0.2f)' % lr_roc_auc,
)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc='lower right')
plt.show()