In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.set_option('display.max_columns',500)

In [2]:
df = pd.read_csv('Heart3.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,ca,thal,MEDV
0,63,1,3,145,233,0,150,0,2.3,0,0,1,s
1,37,1,2,130,250,1,187,0,3.5,0,0,2,s
2,41,0,1,130,204,0,172,0,1.4,2,0,2,s
3,56,1,1,120,236,1,178,0,0.8,2,0,2,s
4,57,0,0,120,354,1,163,1,0.6,2,0,2,s


In [4]:
df.shape

(303, 13)

In [5]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal', 'MEDV'],
      dtype='object')

In [6]:
var = ['oldpeak', 'chol', 'thalach', 'trestbps','age', 'cp', 'restecg', 'exang','slope', 'ca', 'thal', 'sex']

In [7]:
X = df[var].copy()

In [8]:
X.head()

Unnamed: 0,oldpeak,chol,thalach,trestbps,age,cp,restecg,exang,slope,ca,thal,sex
0,2.3,233,150,145,63,3,0,0,0,0,1,1
1,3.5,250,187,130,37,2,1,0,0,0,2,1
2,1.4,204,172,130,41,1,0,0,2,0,2,0
3,0.8,236,178,120,56,1,1,0,2,0,2,1
4,0.6,354,163,120,57,0,1,1,2,0,2,0


In [9]:
d = dict(zip(var,['x%d'%i for i in range(1,len(var)+1)]))

In [10]:
X.rename(columns=d,inplace=True)

In [11]:
df['MEDV'].value_counts()

s    165
n    138
Name: MEDV, dtype: int64

In [12]:
y = (df['MEDV'] == 's').astype(int)

In [13]:
X['target'] = y

In [14]:
for i in range(1,4):
    X['x%d'%i] = pd.cut(X['x%d'%i],bins=3,include_lowest=True).astype(str)

In [15]:
for i in range(1,13):
    print(X['x%d'%i].value_counts(normalize=True))
    print('\n')

(-0.0072, 2.067]    0.834983
(2.067, 4.133]      0.148515
(4.133, 6.2]        0.016502
Name: x1, dtype: float64


(125.56099999999999, 272.0]    0.732673
(272.0, 418.0]                 0.264026
(418.0, 564.0]                 0.003300
Name: x2, dtype: float64


(114.667, 158.333]    0.504950
(158.333, 202.0]      0.405941
(70.868, 114.667]     0.089109
Name: x3, dtype: float64


120    0.122112
130    0.118812
140    0.105611
110    0.062706
150    0.056106
138    0.042904
128    0.039604
125    0.036304
160    0.036304
112    0.029703
132    0.026403
118    0.023102
135    0.019802
108    0.019802
124    0.019802
145    0.016502
134    0.016502
152    0.016502
122    0.013201
170    0.013201
100    0.013201
142    0.009901
115    0.009901
136    0.009901
105    0.009901
180    0.009901
126    0.009901
102    0.006601
94     0.006601
144    0.006601
178    0.006601
146    0.006601
148    0.006601
129    0.003300
165    0.003300
101    0.003300
174    0.003300
104    0.003300
172    0.00

In [16]:
for i in range(1,13):
    aux=X[['x%d'%i,'target']].copy()
    aux['n'] = 1
    aux = aux.pivot_table(columns='target',
                          index='x%d'%i,
                          aggfunc='count',
                          fill_value=0)
    aux.columns = aux.columns.droplevel()
    aux.reset_index(inplace=True)
    aux['pne'] = aux[0]/aux[0].sum()
    aux['pe'] = aux[1]/aux[1].sum()
    aux['woe'] = np.log(aux['pne']/aux['pe'])
    X = X.merge(aux[['x%d'%i,'woe']],on='x%d'%i,how='inner')
    X.rename(columns={'woe':'w_x%d'%i},inplace=True)

  if sys.path[0] == '':


In [17]:
var_woe = [x for x in X.columns if x[:2]=='w_']

In [18]:
X[var_woe].describe()

Unnamed: 0,w_x1,w_x2,w_x3,w_x4,w_x5,w_x6,w_x7,w_x8,w_x9,w_x10,w_x11,w_x12
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,0.054787,-inf,-0.012045,,,-0.057207,-0.00445,0.011284,-0.039049,0.002894,-0.037042,-0.030291
std,0.869072,,0.885228,,,1.163008,0.373494,0.950832,0.850294,1.117417,1.185869,0.606845
min,-0.330026,-inf,-0.95271,-inf,-inf,-1.337656,-0.360305,-0.650001,-0.938789,-1.207603,-1.105324,-0.91992
25%,-0.330026,-0.185646,-0.95271,-0.391853,-0.632238,-1.165043,-0.360305,-0.650001,-0.938789,-0.88218,-1.105324,-0.91992
50%,-0.330026,-0.185646,0.401835,0.053529,-0.10899,-0.647987,-0.360305,-0.650001,0.466374,-0.88218,-1.105324,0.382291
75%,-0.330026,0.532332,0.401835,0.497146,0.717688,1.159521,0.328632,1.373931,0.797731,0.918359,1.335124,0.382291
max,2.050494,0.532332,1.927892,inf,inf,1.159521,1.277304,1.373931,0.797731,1.913293,1.335124,0.382291


In [24]:
y = X['target'].copy()
Xw = X[var_woe].copy()

In [25]:
modelo = LogisticRegression()

In [33]:
Xw.drop(['w_x2','w_x4','w_x5'],axis=1,inplace=True)

In [34]:
Xt,Xv,yt,yv = train_test_split(Xw,y,train_size=0.73)



In [35]:
modelo.fit(Xt,yt)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
print(accuracy_score(y_true=yt,y_pred=modelo.predict(Xt)))
print(accuracy_score(y_true=yv,y_pred=modelo.predict(Xv)))

0.8506787330316742
0.8536585365853658


In [37]:
print(roc_auc_score(y_true=yt,y_score=modelo.predict_proba(Xt)[:,1]))
print(roc_auc_score(y_true=yv,y_score=modelo.predict_proba(Xv)[:,1]))

0.927500418830625
0.9428827751196172
