# Targeted marketing predictions with Logistic Regression
- https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

In [12]:
import pandas as pd
import numpy as np
raw_data = pd.read_csv("https://raw.githubusercontent.com/madmashup/targeted-marketing-predictive-engine/master/banking.csv")
display(raw_data.head(5))
print(len(raw_data.columns))

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


21


In [19]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp_var_rate      41188 non-null float64
cons_price_idx    41188 non-null float64
cons_conf_idx     41188 non-null float64
euribor3m         41188 non-null float64
nr_employed       41188 non-null float64
y                 41188 non-null int64
dtypes: float64(5), int64(6), object(10)
memory usag

In [20]:
from ml_helpers.engineering import *
numericalize_all(raw_data)
display(raw_data.head(5))
print(raw_data.info())

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,2,2,1,2,3,1,1,2,3,...,1,999,0,2,1.4,93.444,-36.1,4.963,5228.1,0
1,53,10,2,6,1,1,1,1,8,1,...,1,999,0,2,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,5,3,5,1,3,1,1,5,3,...,3,6,2,3,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,8,2,2,1,1,1,1,1,1,...,2,999,0,2,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,6,2,1,1,3,1,1,2,1,...,1,3,1,3,-2.9,92.201,-31.4,0.869,5076.2,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null int8
marital           41188 non-null int8
education         41188 non-null int8
default           41188 non-null int8
housing           41188 non-null int8
loan              41188 non-null int8
contact           41188 non-null int8
month             41188 non-null int8
day_of_week       41188 non-null int8
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null int8
emp_var_rate      41188 non-null float64
cons_price_idx    41188 non-null float64
cons_conf_idx     41188 non-null float64
euribor3m         41188 non-null float64
nr_employed       41188 non-null float64
y                 41188 non-null int64
dtypes: float64(5), int64(6), int8(10)
memory usage: 3.8 MB
None


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X = raw_data.drop("y", axis=1)
y = raw_data.y
X_train, X_test, y_train, y_test = train_test_split(X, y)
lr = LogisticRegression().fit(X_train, y_train)
print(f"Accuracy on the test set: {lr.score(X_test, y_test)}")



Accuracy on the test set: 0.907060308827814


## Evaluation
- confusion matrix
- precision, recall, f-score, support

In [28]:
from sklearn.metrics import confusion_matrix
y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[8863  253]
 [ 704  477]]


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      9116
           1       0.65      0.40      0.50      1181

   micro avg       0.91      0.91      0.91     10297
   macro avg       0.79      0.69      0.72     10297
weighted avg       0.90      0.91      0.90     10297



## Feature engineering
- group values of `education` column together

In [14]:
print(raw_data.education.unique())
raw_data.education = np.where(raw_data.education == 'basic.9y', 'Basic', raw_data.education)
raw_data.education = np.where(raw_data.education == 'basic.6y', 'Basic', raw_data.education)
raw_data.education = np.where(raw_data.education == 'basic.4y', 'Basic', raw_data.education)
print(raw_data.education.unique())

['basic.4y' 'unknown' 'university.degree' 'high.school' 'basic.9y'
 'professional.course' 'basic.6y' 'illiterate']
['Basic' 'unknown' 'university.degree' 'high.school' 'professional.course'
 'illiterate']


In [None]:
raw_data.education.unique()

In [5]:
X = raw_data.drop("y", axis=1)
y = raw_data.y
print(len(X.columns))

20


## LogReg specifics
- little to no correlation between indep. variables

## Feature Selection

## Evaluation
- Accuracy
- Confusion matrix