In [1]:
# data manipulation:
import pandas as pd
import numpy as np

# stats:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from scipy import stats
import math

# plotting and images:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image

---
## About the data

In this chapter (using simulated data) we look at how Target is able to predict whether a woman is in her second trimester of pregnancy given her past purchases.

---
### Import the data:
Columns "oct" to "dec" give the number of bottles of unscented lotion purchased in each of those months.

In [2]:
# Load the 'Logit' sheet of the workbook. header=2 takes the column names from
# the third row of the sheet, and .iloc[:, 4:13] keeps only the nine columns at
# positions 4 through 12 (displayed below as 'oct' ... 'duedate').
target_df = (
    pd.read_excel('Target.xlsx', sheet_name='Logit', header=2)
    .iloc[:, 4:13]
)
target_df

Unnamed: 0,oct,nov,dec,pregnant,buy1,amt1,buy2,amt2,duedate
0,0,4,0,1,11,1,11,3,2021-05-28
1,0,6,0,1,11,4,11,2,2021-05-15
2,0,0,2,1,12,2,12,0,2021-04-15
3,4,0,0,1,11,0,10,4,2021-06-20
4,0,0,3,1,12,3,12,0,2021-05-24
...,...,...,...,...,...,...,...,...,...
8995,0,0,0,0,12,0,11,0,NaT
8996,0,0,0,0,11,0,10,0,NaT
8997,1,0,0,0,12,0,10,1,NaT
8998,3,0,0,0,10,3,12,0,NaT


---
### Fit a logistic regression model using months Oct-Dec as independent variables, and pregnant (binary) as the dependent variable:

In [3]:
# import split module:
from sklearn.model_selection import train_test_split

# Features: the three monthly purchase-count columns; label: the 'pregnant' flag.
X = target_df[['oct', 'nov', 'dec']]
y = target_df['pregnant']

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [4]:
# import log reg package:
from sklearn.linear_model import LogisticRegression

# Build the classifier with scikit-learn's default settings.
logreg = LogisticRegression()

# Train on the training split only. As the cell's last expression, the fitted
# estimator returned by fit() is also what the notebook displays.
logreg.fit(X=X_train, y=y_train)

LogisticRegression()

#### Predict and get probabilities:

In [12]:
# Score every row of X (training and test rows alike) with the fitted model.
# Column 1 of predict_proba is taken as the probability of the class labelled 1
# (given the 0/1 'pregnant' labels); it is stored rounded to two decimals.
target_df['prediction_probability'] = logreg.predict_proba(X)[:, 1].round(2)

# Hard class label predicted for the same rows.
target_df['prediction'] = logreg.predict(X)
target_df

Unnamed: 0,oct,nov,dec,pregnant,buy1,amt1,buy2,amt2,duedate,prediction_probability,prediction
0,0,4,0,1,11,1,11,3,2021-05-28,0.71,1
1,0,6,0,1,11,4,11,2,2021-05-15,0.96,1
2,0,0,2,1,12,2,12,0,2021-04-15,0.19,0
3,4,0,0,1,11,0,10,4,2021-06-20,0.63,1
4,0,0,3,1,12,3,12,0,2021-05-24,0.42,0
...,...,...,...,...,...,...,...,...,...,...,...
8995,0,0,0,0,12,0,11,0,NaT,0.02,0
8996,0,0,0,0,11,0,10,0,NaT,0.02,0
8997,1,0,0,0,12,0,10,1,NaT,0.07,0
8998,3,0,0,0,10,3,12,0,NaT,0.37,0


### Get prediction metrics:

In [20]:
from sklearn import metrics

# Evaluate on the held-out test split only.
y_pred = logreg.predict(X_test)

# Each entry pairs the printed label with the scoring function to apply;
# every score is rounded to two decimals before printing.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(_metric_label, round(_metric_fn(y_test, y_pred), 2))

Accuracy: 0.89
Precision: 0.92
Recall: 0.83


In [21]:
Accuracy: 0.89
Precision: 0.92
Recall: 0.83

---
### Get prediction pivot table based on probabilities:

In [57]:
# Bin edges for the predicted probabilities: ten equal-width bins covering [0, 1].
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# One text label per bin, e.g. '0.0 - 0.1' ... '0.9 - 1.0'. Building the labels
# from the edges yields the same text as stripping the brackets from the
# interval repr, without depending on how pandas prints intervals.
bin_labels = [f'{float(lo)} - {float(hi)}' for lo, hi in zip(bins[:-1], bins[1:])]

# include_lowest=True keeps a probability of exactly 0 (reachable once the
# probabilities have been rounded to two decimals) in the first bin; without it
# the value lies outside the right-closed interval (0, 0.1] and becomes NaN,
# silently dropping the row from any later grouping.
target_df['probability_binned'] = pd.cut(
    target_df['prediction_probability'],
    bins=bins,
    labels=bin_labels,
    include_lowest=True,
)

# 1 when the predicted class equals the true 'pregnant' label, else 0.
# NOTE: the column name 'correc_pred' (sic) is kept as-is because the
# pivot-table cell reads the column under that name.
target_df['correc_pred'] = np.where(target_df['prediction'] == target_df['pregnant'], 1, 0)
target_df

Unnamed: 0,oct,nov,dec,pregnant,buy1,amt1,buy2,amt2,duedate,prediction_probability,prediction,probability_binned,correc_pred
0,0,4,0,1,11,1,11,3,2021-05-28,0.71,1,0.7 - 0.8,1
1,0,6,0,1,11,4,11,2,2021-05-15,0.96,1,0.9 - 1.0,1
2,0,0,2,1,12,2,12,0,2021-04-15,0.19,0,0.1 - 0.2,0
3,4,0,0,1,11,0,10,4,2021-06-20,0.63,1,0.6 - 0.7,1
4,0,0,3,1,12,3,12,0,2021-05-24,0.42,0,0.4 - 0.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,0,0,0,0,12,0,11,0,NaT,0.02,0,0.0 - 0.1,1
8996,0,0,0,0,11,0,10,0,NaT,0.02,0,0.0 - 0.1,1
8997,1,0,0,0,12,0,10,1,NaT,0.07,0,0.0 - 0.1,1
8998,3,0,0,0,10,3,12,0,NaT,0.37,0,0.3 - 0.4,1


In [59]:
# Share of correct predictions (mean of the 0/1 'correc_pred' flag) for each
# probability bin (rows) and predicted class (columns).
pd.pivot_table(
    target_df,
    values='correc_pred',
    index='probability_binned',
    columns='prediction',
    aggfunc='mean',
)

prediction,0,1
probability_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0 - 0.1,0.959657,
0.1 - 0.2,0.762876,
0.3 - 0.4,0.756983,
0.4 - 0.5,0.656331,
0.6 - 0.7,,0.717213
0.7 - 0.8,,0.859296
0.8 - 0.9,,0.862928
0.9 - 1.0,,0.980073


As we can see above, when the model assigns a predicted probability between 0.9 and 1.0, its prediction that a woman is pregnant (in the second trimester) is correct about $98\%$ of the time.

---