In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:

X_train=df_train.drop(['smoking'],axis=1)
y_train=df_train['smoking']

X_test = df_test

train_id = X_train['id']
test_id = X_test['id']

X_train = X_train.drop(['id'], axis=1)
X_test = X_test.drop(['id'], axis=1)

In [4]:
def calculo_imc(height_cm, weight_kg):
    altura_m = height_cm / 100
    imc = weight_kg / (altura_m ** 2)
    return imc

In [5]:
#creo columna aplicando la fórmula al df
X_train['IMC'] = X_train.apply(lambda row: calculo_imc(row['height(cm)'], row['weight(kg)']), axis=1)

In [6]:
def peso_no_sano(imc):
    condiciones = (imc < 18.5) | (imc > 30)
    return condiciones.astype(int) #asi el booleano lo paso a 0 y 1

In [7]:
#creo columna aplicando la fórmula al df
X_train['peso_no_sano'] = peso_no_sano(X_train['IMC'])

In [8]:
def grupos_edad(edad):
    if edad<40:
        return 'Joven'
    elif edad <65:
        return 'Adulto'
    else:
        return 'Mayor'

In [9]:
#creo columna aplicando la fórmula al df
X_train['G_Edad'] = X_train.apply(lambda row: grupos_edad(row['age']), axis=1)

In [10]:
#lo paso a dummy para que no me de problemas luego en el escalador
dummy=pd.get_dummies(X_train['G_Edad'], drop_first=True)
X_train=X_train.drop('G_Edad',axis=1)
X_train = pd.concat([X_train, dummy], axis=1)


In [11]:
#lo paso de booleano a 0 y 1 para que no me de problemas luego en el escalador
X_train['Joven']=X_train['Joven'].astype(int)
X_train['Mayor']=X_train['Mayor'].astype(int)

In [12]:
def mala_dieta(df):
    condiciones = [(X_train['LDL'] > 130),(X_train['triglyceride'] > 150),(X_train['fasting blood sugar'] > 100),(X_train['hemoglobin'] < 13.5),(X_train['Urine protein'] > 150),
        (X_train['peso_no_sano']==1)]
    suma_condiciones = sum(condiciones)
    return (suma_condiciones >=3).astype(int) #he querido ser equilibrada pidiendo que sean 3 parámetros los que determinen una mala dieta

In [13]:
#como lo aplicamos al df entero (eran muchas columnas y así se ha definido  en la fórmula) no uso apply ni lambda
X_train['mala_dieta'] = mala_dieta(X_train)

In [14]:
#puede haber sesgo en algunos valores porque los que se encuentran como referencia son los de hombres
#como ants son muchhsa columnas, recibe df
def pos_alcoholismo(df):
    condiciones=[df['AST'] > 50,df['ALT'] > 50, df['Gtp'] > 55,(df['systolic'] > 120) | (df['relaxation'] > 80), df['triglyceride'] > 150]
    suma=sum(condiciones) 
    return (suma>=3).astype(int)

In [15]:
X_train['pos_alcoholismo']=pos_alcoholismo(X_train)

In [16]:
def buenos_habitos(df):
    condiciones=[(1-df['peso_no_sano']),(1-df['mala_dieta']),(1-df['pos_alcoholismo']),(1-df['dental caries'])]
    suma=sum(condiciones)
    return (suma>=3).astype(int)    

In [17]:
X_train['buenos_habitos']= buenos_habitos(X_train)

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Preparar los datos
X = X_train
y = y_train 

# Estandarizar los datos
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.20)

# Definir el modelo
model = KNeighborsClassifier()

In [None]:
# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train2, y_train2)

In [None]:
# Hacer predicciones en los datos de prueba
y_pred = model.predict(X_test2)

In [19]:
# vemos los parametros que puede tener
print(model.get_params())
params = {'n_neighbors': [3,5,7],
         'p': [1,2]}

# scoring: lista de metricas a obtener
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scoring = ['accuracy', 'roc_auc']

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [20]:
from sklearn.model_selection import GridSearchCV
# CV (cross validación)
n_cv=5 # 5 folds
# creamos la GridSeacrh # cross-validation
grid_solver = GridSearchCV(estimator = model, # model to train
                   param_grid = params, # param_grid
                   scoring = scoring,
                   cv = n_cv,
                   refit = 'roc_auc', # Aplico lo que requería el ejercicio de Kaggle en este paso
                   verbose = 2)

model_result = grid_solver.fit(X_train2,y_train2)
# hemos ajustado el modelo

# best score es la media del best estimator
print("Mejor score:", model_result.best_score_)
# tenemos los mejores parametros
print("Mejores parametros:",model_result.best_params_)

# En este caso el score devuelve el roc_auc
print("Train Score:", model_result.score(X_train2,y_train2)) # score devulve la metrica del refit
print("Test Score:", model_result.score(X_test2,y_test2))
# Train
y_pred=model_result.predict(X_train2)
probs=model_result.predict_proba(X_train2)
print("Train AUC:", sklearn.metrics.roc_auc_score(y_train2,probs[:,1]))
print("Train Accuracy:", sklearn.metrics.accuracy_score(y_train2,y_pred))


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .................................n_neighbors=3, p=1; total time=  29.0s
[CV] END .................................n_neighbors=3, p=1; total time=  28.3s
[CV] END .................................n_neighbors=3, p=1; total time=  28.8s
[CV] END .................................n_neighbors=3, p=1; total time=  29.5s
[CV] END .................................n_neighbors=3, p=1; total time=  29.2s
[CV] END .................................n_neighbors=3, p=2; total time=   7.9s
[CV] END .................................n_neighbors=3, p=2; total time=   7.5s
[CV] END .................................n_neighbors=3, p=2; total time=   7.7s
[CV] END .................................n_neighbors=3, p=2; total time=   7.9s
[CV] END .................................n_neighbors=3, p=2; total time=   7.9s
[CV] END .................................n_neighbors=5, p=1; total time=  28.6s
[CV] END .................................n_neigh

NameError: name 'sklearn' is not defined

In [25]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# test
model_result.score(X_test2,y_test2)
y_pred=model_result.predict(X_test2)
probs=model_result.predict_proba(X_test2)
print("Test AUC:", sklearn.metrics.roc_auc_score(y_test2,probs[:,1]))
print("Test Accuracy:", sklearn.metrics.accuracy_score(y_test2,y_pred))

Test AUC: 0.8181266904899412
Test Accuracy: 0.7424337561220645


In [26]:
#Revisión del modelo
# tenemos un atributo del mejor modelo
best_model=model_result.best_estimator_
# nos quedamos con el mejor modelo y reestimamos con toda la muestra

In [27]:
# Ajuste del modelo
final_model=best_model.fit(X,y)# resultados de la cross validation con parameter tunning


In [28]:
#Prueba final del modelo
results=model_result.cv_results_
results=pd.DataFrame(results)

In [30]:
#Ahora uso el test de kaggle para hacer una predicción: 
#aplico la modificaciones del X_train al X_Test
X_test['IMC'] = X_test.apply(lambda row: calculo_imc(row['height(cm)'], row['weight(kg)']), axis=1)
X_test['peso_no_sano'] = peso_no_sano(X_test['IMC'])
X_test['G_Edad'] = X_test.apply(lambda row: grupos_edad(row['age']), axis=1)
dummy=pd.get_dummies(X_test['G_Edad'], drop_first=True)
X_test=X_test.drop('G_Edad',axis=1)
X_test = pd.concat([X_test, dummy], axis=1)
X_test['Joven']=X_test['Joven'].astype(int)
X_test['Mayor']=X_test['Mayor'].astype(int)
X_test['mala_dieta'] = mala_dieta(X_test)
X_test['pos_alcoholismo']=pos_alcoholismo(X_test)
X_test['buenos_habitos']= buenos_habitos(X_test)
X_test = scaler.fit_transform(X_test)


In [None]:
#  'final_model' es mi modelo final ya entrenado, entonces:
y_test_pred = final_model.predict(X_test)

# Creo dataframe con mi predicción
smoking_res = pd.DataFrame({'id': test_id,'smoking': y_test_pred})

smoking_res.to_csv('smoking_res.csv', index=False)