In [292]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix


In [293]:
def limpieza(df):
    df = df.copy()

    df['FALLECIDO'] = df['DATE_DIED'].apply(lambda x: 2 if x == '9999-99-99' else 1)
    
    df.drop(columns=['DATE_DIED'], inplace=True)

    columns = df.columns.drop(['AGE'])

    for col in columns:
        df[col] = df[col].replace({97: np.nan, 98: np.nan, 99: np.nan})

    df['TEST_RESULT'] = df['TEST_RESULT'].apply(lambda x: 1 if x in [1,2,3] else 0)
    non_binary_columns = ['AGE', 'MEDICAL_UNIT']

    df = df[df['AGE'].between(0, 110)]

    binary_cols = df.columns.drop(non_binary_columns)

    df[binary_cols] = df[binary_cols].replace({1:1, 2:0})
    return df

In [294]:


df = pd.read_csv("custom_covid19.csv")
df = limpieza(df)
X, y = df.drop(columns=['FALLECIDO']), df['FALLECIDO']

imputer = SimpleImputer(strategy='most_frequent')

X = imputer.fit_transform(X)
    
col = df.drop(columns=['FALLECIDO']).columns


In [295]:
def train_model(x, y, param_grid, model, scoring):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,scoring=scoring, n_jobs=-1)  
    grid_search.fit(x,y)
    return grid_search

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Entrenar un modelo de regresión logística y ajustar el hiperparámetro C
#Además usamos class_weight = balanced 
scoring = ['accuracy', 'recall', 'precision', 'f1']
models = []
param_grid = {'C': [0.1, 1, 10, 100], 'class_weight': [None, 'balanced']}

for sc in scoring:
    model_lr = LogisticRegression(max_iter=1000, n_jobs=-1)
    grid_lr = train_model(X, y, param_grid, model_lr, sc)
    model_lr = grid_lr.best_estimator_
    models.append(model_lr)

    print("Scoring optimizado: ", sc)
    y_pred = model_lr.predict(X_test)
    print("Mejores hiperparámetros:", grid_lr.best_params_)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=['No Fallecido', 'Fallecido']))
    cm = confusion_matrix(y_test, y_pred)
    print("Matriz de Confusión:")
    print(cm)  


Scoring optimizado:  accuracy
Mejores hiperparámetros: {'C': 10, 'class_weight': None}
Accuracy: 0.9476
              precision    recall  f1-score   support

No Fallecido       0.96      0.98      0.97     27786
   Fallecido       0.71      0.48      0.58      2214

    accuracy                           0.95     30000
   macro avg       0.84      0.73      0.77     30000
weighted avg       0.94      0.95      0.94     30000

Matriz de Confusión:
[[27357   429]
 [ 1143  1071]]
Scoring optimizado:  recall
Mejores hiperparámetros: {'C': 0.1, 'class_weight': 'balanced'}
Accuracy: 0.9004666666666666
              precision    recall  f1-score   support

No Fallecido       0.99      0.90      0.94     27786
   Fallecido       0.42      0.92      0.58      2214

    accuracy                           0.90     30000
   macro avg       0.71      0.91      0.76     30000
weighted avg       0.95      0.90      0.92     30000

Matriz de Confusión:
[[24978  2808]
 [  178  2036]]
Scoring optimizad

In [296]:
model = models[1]
proj_test_data = pd.read_csv("proj-test-data.csv")
proj_test_class = pd.read_csv("proj-test-class.csv")

proj_test_data = limpieza(proj_test_data)

X_proj = proj_test_data.drop(columns=['FALLECIDO'])
y_proj = proj_test_data['FALLECIDO']

X_proj = imputer.transform(X_proj)


y_pred = model.predict(X_proj)
print("Accuracy:", accuracy_score(y_proj, y_pred))
print(classification_report(y_proj, y_pred, target_names=['No Fallecido', 'Fallecido']))
cm = confusion_matrix(y_proj, y_pred)
print("Matriz de Confusión:")
print(cm)

Accuracy: 0.94
              precision    recall  f1-score   support

No Fallecido       1.00      0.94      0.97        93
   Fallecido       0.54      1.00      0.70         7

    accuracy                           0.94       100
   macro avg       0.77      0.97      0.83       100
weighted avg       0.97      0.94      0.95       100

Matriz de Confusión:
[[87  6]
 [ 0  7]]


# 2

In [297]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
df = pd.read_csv("custom_covid19.csv")
df = limpieza(df)

X, y = df.drop(columns=['AGE']), df['AGE']

X = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

models_2= []
param_grid = {
    'n_estimators': [100, 200],    # número de árboles
    'max_depth': [None, 10],       # profundidad máxima
    'min_samples_split': [2, 5]    # tamaño mínimo para dividir
}
scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

for sc in scoring:
    model = RandomForestRegressor(random_state=0)
    grid = train_model(X_train, y_train, param_grid, model, sc)
    model = grid.best_estimator_
    models_2.append(model)

    print("Scoring optimizado: ", sc)
    y_pred = model.predict(X_test)
    print("Mejores hiperparámetros:", grid.best_params_)
    print("Mean squared error:", mean_squared_error(y_test, y_pred))
    print("Mean absolute error:", mean_absolute_error(y_test, y_pred))
    print("R2 score:", r2_score(y_test, y_pred))
    print("\n")


Scoring optimizado:  neg_mean_squared_error
Mejores hiperparámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Mean squared error: 201.79106594300077
Mean absolute error: 11.053125587554247
R2 score: 0.29154212129703283


Scoring optimizado:  neg_mean_absolute_error
Mejores hiperparámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Mean squared error: 201.79106594300077
Mean absolute error: 11.053125587554247
R2 score: 0.29154212129703283


Scoring optimizado:  r2
Mejores hiperparámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Mean squared error: 201.79106594300077
Mean absolute error: 11.053125587554247
R2 score: 0.29154212129703283




In [298]:
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("custom_covid19.csv")
df = limpieza(df)

# Apply age binning
age_bins = [0, 45,  110] #This turns into 1 and 2
age_labels = ['0-44', '45+']
df['AGE_BIN'] = pd.cut(df['AGE'], bins=age_bins, labels=age_labels)
df = df.dropna(subset=['AGE_BIN'])

# Drop rows where AGE_BIN is NaN (can happen if AGE was NaN before filtering)
X = df.drop(columns=['AGE','AGE_BIN'])  # Drop the intubed one for now bc it has lots of NaNs
y = df['AGE_BIN']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}
scoring = ['accuracy', 'recall', 'precision', 'f1']
models = []
for sc in scoring:
    model_rf = RandomForestClassifier(
        random_state=0,
        class_weight='balanced'
    )
    grid = train_model(X_train, y_train, param_grid, model_rf, sc)
    model = grid.best_estimator_
    models.append(model)

    print("Scoring optimizado:", sc)
    print("Mejores hiperparámetros:", grid.best_params_)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=age_labels))
    cm = confusion_matrix(y_test, y_pred)
    print("Matriz de Confusión:")
    print(cm)   

Scoring optimizado: accuracy
Mejores hiperparámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.7440942247206049
              precision    recall  f1-score   support

        0-44       0.75      0.87      0.81     18214
         45+       0.73      0.55      0.63     11672

    accuracy                           0.74     29886
   macro avg       0.74      0.71      0.72     29886
weighted avg       0.74      0.74      0.74     29886

Matriz de Confusión:
[[15797  2417]
 [ 5231  6441]]
Scoring optimizado: recall
Mejores hiperparámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.7440942247206049
              precision    recall  f1-score   support

        0-44       0.75      0.87      0.81     18214
         45+       0.73      0.55      0.63     11672

    accuracy                           0.74     29886
   macro avg       0.74      0.71      0.72     29886
weighted avg       0.74      0.74      0.74     29886

Matr