**Run the following two cells before you begin.**

In [1]:
%autosave 10

Autosaving every 10 seconds


In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-x}}$.
</details>

In [3]:
# Load the data set from a CSV file; the relative path is resolved against
# the current working directory.
data_path = 'cleaned_data.csv'
df = pd.read_csv(data_path)

In [4]:
# Define the sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), evaluated element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); anything accepted by ``np.asarray``.

    Returns
    -------
    NumPy scalar or ndarray
        Values in [0, 1] with the same shape as ``x``.

    Notes
    -----
    The naive form ``1 / (1 + np.exp(-x))`` overflows inside ``np.exp`` for
    large negative ``x``. This version only ever exponentiates ``-|x|``, which
    lies in (0, 1], and selects the algebraically equivalent expression for
    each sign of ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) is at most 1, so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0:  1 / (1 + exp(-x))
    # x <  0:  exp(x) / (1 + exp(x))   (same value, rewritten to avoid exp(+large))
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as the response variable. Use a random state of 24.**

In [5]:
# Create a train/test split: 20% of the rows are held out for testing, and
# the fixed random_state makes the shuffle reproducible.
feature_columns = ['PAY_1', 'LIMIT_BAL']
response_column = 'default payment next month'
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_columns].values,
    df[response_column].values,
    test_size=0.2,
    random_state=24,
)

______________________________________________________________________
**Next, instantiate `LogisticRegression` (already imported above) with the default options, but set the solver to `'liblinear'`.**

In [6]:
# Instantiate the classifier; every option except the solver is left at its default.
lr_model = LogisticRegression(
    solver='liblinear',
)

______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [7]:
# Train the logistic regression model on the training split.
# The call is the last expression, so the fitted estimator is displayed.
lr_model.fit(X=X_train, y=Y_train)

LogisticRegression(solver='liblinear')

In [8]:
# Predicted class labels for the held-out test features, via `.predict()`
pred_y = lr_model.predict(X=X_test)

In [9]:
# Class probabilities for the held-out test features, via `.predict_proba()`
pred_proba_y = lr_model.predict_proba(X=X_test)

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [10]:
# Build a column of 1s with one row per test sample; it is stacked onto the
# features later so that it multiplies the intercept.
n_test_samples = X_test.shape[0]
ones_col = np.ones(shape=(n_test_samples, 1))

In [11]:
# Pull the fitted feature coefficients and the intercept out of the trained model
coefs, intercepts = lr_model.coef_, lr_model.intercept_


In [12]:
# Manually calculate predicted probabilities
# Prepend the column of 1s to the test features so the intercept can be
# handled as one more coefficient.
ones_and_features = np.hstack([ones_col, X_test])
# Put the intercept in front of the feature coefficients, matching the
# column order of `ones_and_features`.
intercept_and_coefs = np.concatenate([intercepts.reshape(1, 1), coefs], axis=1)

# Linear combination for every test sample. The parameter row is the left
# operand, so the result keeps a leading axis of length 1.
X_lin = np.dot(intercept_and_coefs, np.transpose(ones_and_features))
# Pass the linear combination through the sigmoid to obtain probabilities.
pred_proba_y_manual = sigmoid(X_lin)


______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [13]:
# Manually calculate predicted classes: a sample is labelled positive when
# its predicted probability is at least the threshold.
threshold = 0.5
pred_Y_manual = pred_proba_y_manual >= threshold


In [14]:
# Compare to scikit-learn's predicted classes.
# `np.array_equal` is True only when both arrays have the same shape AND the
# same elements, so scikit-learn's predictions are first reshaped into a
# single row to match the layout of the manual predictions.
pred_y_row = pred_y.reshape(1, -1)
np.array_equal(pred_y_row, pred_Y_manual)

True

______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [15]:
# ROC AUC from scikit-learn's predicted probabilities.
# Only the second column (index 1) of the `predict_proba` output is scored.
second_class_proba = pred_proba_y[:, 1]
roc_auc_score1 = metrics.roc_auc_score(Y_test, second_class_proba)
roc_auc_score1

0.627207450280691

In [16]:
# Use manually calculated predicted probabilities to calculate ROC AUC.
# The manual probabilities carry an extra leading axis, so flatten them to
# 1-D directly instead of borrowing the length from another array's shape.
roc_auc_score2 = metrics.roc_auc_score(Y_test, pred_proba_y_manual.ravel())
roc_auc_score2

0.627207450280691

In [17]:
# Compare the two ROC AUC scores. A tolerance-based check is used so that
# floating-point rounding differences do not register as a mismatch.
np.isclose(roc_auc_score1, roc_auc_score2)