In [1]:
import numpy as np 
import pandas as pd
# Gráficos
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Preprocesado y modelado
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.weightstats import ttest_ind
# Configuración matplotlib
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
style.use('ggplot') or plt.style.use('ggplot')
# Configuración warnings
import warnings
warnings.filterwarnings('ignore')

### Importar los datos del CSV

In [4]:
data = pd.read_csv('../data/diabetes.csv', delimiter=';', header=0)
data.shape
data.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [15]:
X = data[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
X.values,
y.values,
train_size = 0.8,
random_state = 42,
shuffle = True
)
model = LogisticRegression(penalty='none')
model.fit(X = X_train, y = y_train)
# Información del modelo
print("Intercept:", model.intercept_)
print("Coeficiente:", list(zip(X.columns, model.coef_.flatten(), )))
print("Accuracy de entrenamiento:", model.score(X, y))

Intercept: [-9.3646879]
Coeficiente: [('Pregnancies', 0.058036836342826574), ('Glucose', 0.03588136620768462), ('BloodPressure', -0.014115864059986253), ('SkinThickness', 0.003777536363800342), ('Insulin', -0.0021154896837450804), ('BMI', 0.10629780988106857), ('DiabetesPedigreeFunction', 0.8963822287646483), ('Age', 0.03546539960366785)]
Accuracy de entrenamiento: 0.77734375


In [20]:
X_test

array([[  6.   ,  98.   ,  58.   , ...,  34.   ,   0.43 ,  43.   ],
       [  2.   , 112.   ,  75.   , ...,  35.7  ,   0.148,  21.   ],
       [  2.   , 108.   ,  64.   , ...,  30.8  ,   0.158,  21.   ],
       ...,
       [  8.   ,  95.   ,  72.   , ...,  36.8  ,   0.485,  57.   ],
       [  2.   , 146.   ,  70.   , ...,  28.   ,   0.337,  29.   ],
       [  8.   ,  74.   ,  70.   , ...,  35.3  ,   0.705,  39.   ]])

In [16]:

predictions = model.predict_proba(X = X_test)
predictions = pd.DataFrame(predictions, columns = model.classes_)
predictions.head(3)
predictions = model.predict(X = X_test)
predictions

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [21]:
clasificacion = np.where(predictions<0.5, 0, 1)
clasificacion

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [22]:
accuracy = accuracy_score(
y_true = y_test,
y_pred = clasificacion,
normalize = True
)
accuracy

0.7597402597402597

In [29]:
result = model.predict([[6,148,74,25,0,26.5,0.627,40]])
if result[0]==0:
    print('No tiene diabetes 😁')
else:
    print('tiene dabetes 😔')

tiene dabetes 😔
