# COVID-19 Patient Death Prediction

## 1. Загрузка библиотек

In [None]:
import pandas as pd
import numpy as np
from google.colab import files
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

## 2. Загрузка данных (загрузите файл "Covid Data.csv" через интерфейс Colab)

In [None]:
# Prompt for a file through Colab's upload widget and keep whatever
# `files.upload()` returns in `uploaded` (interactive; blocks until a file is chosen).
uploaded = files.upload()



Saving Covid Data.csv to Covid Data (1).csv


KeyError: '/content/Covid Data.csv'

In [None]:
# Parse the CSV directly from the uploaded bytes instead of a hard-coded path:
# Colab renames re-uploaded files (e.g. "Covid Data (1).csv"), so a fixed
# '/content/...' name breaks on a fresh runtime.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select 'Covid Data.csv'.")
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))
df

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2,13,2,1,9999-99-99,97,2,40,97,2,...,2,2,2,2,2,2,2,2,7,97
1048571,1,13,2,2,9999-99-99,2,2,51,97,2,...,2,2,1,2,2,2,2,2,7,2
1048572,2,13,2,1,9999-99-99,97,2,55,97,2,...,2,2,2,2,2,2,2,2,7,97
1048573,2,13,2,1,9999-99-99,97,2,28,97,2,...,2,2,2,2,2,2,2,2,7,97


## 3. Создание таргета: факт смерти
* Если пациент умер (DATE_DIED != '9999-99-99') — метка 1, иначе 0

In [None]:
# Binary target: 1 when a real death date is recorded, 0 for the '9999-99-99' placeholder.
# The raw date column is removed afterwards so it cannot be used as a feature.
df['DIED'] = (df['DATE_DIED'] != '9999-99-99').astype(int)
df = df.drop(columns=['DATE_DIED'])

In [None]:
# Inspect the newly created target column (1 = death date recorded, 0 = placeholder date).
df['DIED']

Unnamed: 0,DIED
0,1
1,1
2,1
3,1
4,1
...,...
1048570,0
1048571,0
1048572,0
1048573,0


## 4. Предобработка данных
* Заменяем неизвестные коды (97,98,99) на NaN и заполняем медианой

In [None]:
unknown = [97, 98, 99]
# Apply the unknown-code replacement to every coded column rather than a
# hand-picked subset (otherwise columns such as ICU keep their 97 values).
# AGE is excluded because 97-99 are legitimate ages there, and DIED is the
# 0/1 target created earlier.
coded_cols = [col for col in df.columns if col not in ('AGE', 'DIED')]
df[coded_cols] = df[coded_cols].replace(unknown, np.nan)
# Drop rows that are missing the key fields
df = df.dropna(subset=['DIED', 'AGE', 'SEX'])
# Fill the remaining gaps with each column's median
# (numeric_only guards against any stray non-numeric column).
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Display the frame after unknown-code replacement and median imputation.
df

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DIED
0,2,1,1,1,2.0,1.0,65,2.0,2.0,2,...,2,1,2,2,2,2,2,3,97,1
1,2,1,2,1,2.0,1.0,72,2.0,2.0,2,...,2,1,2,2,1,1,2,5,97,1
2,2,1,2,2,1.0,2.0,55,2.0,1.0,2,...,2,2,2,2,2,2,2,3,2,1
3,2,1,1,1,2.0,2.0,53,2.0,2.0,2,...,2,2,2,2,2,2,2,7,97,1
4,2,1,2,1,2.0,2.0,68,2.0,1.0,2,...,2,1,2,2,2,2,2,3,97,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2,13,2,1,2.0,2.0,40,2.0,2.0,2,...,2,2,2,2,2,2,2,7,97,0
1048571,1,13,2,2,2.0,2.0,51,2.0,2.0,2,...,2,1,2,2,2,2,2,7,2,0
1048572,2,13,2,1,2.0,2.0,55,2.0,2.0,2,...,2,2,2,2,2,2,2,7,97,0
1048573,2,13,2,1,2.0,2.0,28,2.0,2.0,2,...,2,2,2,2,2,2,2,7,97,0


## 5. Кодирование категорий

In [None]:
cat_cols = ['USMER', 'MEDICAL_UNIT', 'PATIENT_TYPE', 'SEX', 'INTUBED', 'PNEUMONIA', 'PREGNANT', 'DIABETES']
for col in cat_cols:
    # Encode the numeric codes directly. Casting to str first makes the
    # encoder sort lexicographically ('10' < '2'), which scrambles the
    # ordering of multi-digit codes such as MEDICAL_UNIT.
    # Relies on the median imputation above having removed all NaN values.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

## 6. Формирование выборки

In [None]:
# Target vector first, then the feature matrix (every column except the target).
y = df.loc[:, 'DIED']
X = df.drop('DIED', axis=1)

## 7. Масштабирование

In [None]:
# Standardise every feature: learn the per-column statistics, then apply them.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## 8. Разделение на train/test

In [None]:
# Split the *unscaled* features so that test rows never contribute to the
# scaling statistics (fitting the scaler on all rows before splitting leaks
# test-set information into training). Same seed/stratification as before,
# so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=897561
)
# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


## 9. Подготовка для 1D CNN: добавляем канал

In [None]:
# Conv1D expects (samples, steps, channels): append a trailing channel axis of size 1.
X_train_cnn = np.expand_dims(X_train, axis=-1)
X_test_cnn = np.expand_dims(X_test, axis=-1)

## 10. Функция создания модели

In [None]:
def create_cnn_model(filters=32, kernel_size=2, dense_units=64, dropout_rate=0.5, n_features=None):
    """Build and compile a small 1D-CNN binary classifier.

    Architecture: Input -> Conv1D(relu) -> MaxPooling1D(2) -> Flatten
    -> Dense(relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        filters: Number of Conv1D filters.
        kernel_size: Width of the Conv1D kernel.
        dense_units: Units in the hidden Dense layer.
        dropout_rate: Dropout rate applied after the hidden Dense layer.
        n_features: Length of the feature axis of the input. Defaults to the
            feature count of the notebook-level ``X_train_cnn`` array.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    if n_features is None:
        # Fall back to the notebook-level training tensor, as the original code did.
        n_features = X_train_cnn.shape[1]
    model = Sequential([
        # An explicit Input object replaces `input_shape=` on the first layer,
        # which newer Keras versions warn about.
        tf.keras.Input(shape=(n_features, 1)),
        Conv1D(filters, kernel_size, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
import pandas as pd
from datetime import datetime

def run_experiment(filters, kernel_size, dense_units, dropout_rate, batch_size, epochs):
    """Train one CNN configuration and score it on the held-out test set.

    Builds a model with ``create_cnn_model``, fits it on the notebook-level
    ``X_train_cnn`` / ``y_train`` (20% of which is used for validation) with
    early stopping on ``val_loss`` (patience 3, best weights restored), then
    evaluates on ``X_test_cnn`` / ``y_test``.

    Args:
        filters, kernel_size, dense_units, dropout_rate: Forwarded to
            ``create_cnn_model``.
        batch_size: Mini-batch size passed to ``model.fit``.
        epochs: Maximum number of training epochs.

    Returns:
        dict with the hyperparameters, a timestamp and three metrics.
        NOTE: the keys ``'val_accuracy'`` and ``'roc_accuracy'`` hold the
        *test-set* accuracy (threshold 0.5) and ROC AUC respectively; the key
        names are kept unchanged for compatibility. ``'val_loss'`` is the best
        (lowest) validation loss seen during training.
    """
    print(f"Training with params:\n(filters, kernel_size, dense_units, dropout_rate, batch_size, epochs) \n{(filters, kernel_size, dense_units, dropout_rate, batch_size, epochs)}")
    model = create_cnn_model(
        filters=filters,
        kernel_size=kernel_size,
        dense_units=dense_units,
        dropout_rate=dropout_rate
    )
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit(
        X_train_cnn, y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[es],
        verbose=1
    )
    # Evaluation on the held-out test set.
    # predict() returns shape (n, 1); flatten so the sklearn metrics get 1-D inputs.
    y_pred_prob = model.predict(X_test_cnn).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_prob)
    # Report the best validation loss rather than the last epoch's: the model
    # being evaluated carries the best-epoch weights restored by EarlyStopping,
    # and the final epochs are by construction no better than that one.
    loss = min(history.history['val_loss'])

    result = {
        'timestamp': datetime.now(),
        'filters': filters,
        'kernel_size': kernel_size,
        'dense_units': dense_units,
        'dropout': dropout_rate,
        'epochs': epochs,
        'batch_size': batch_size,
        'val_accuracy': round(acc, 6),
        'roc_accuracy': round(auc, 6),
        'val_loss': round(loss, 6)
    }

    print("✅ Завершено:", result)
    return result

## 11. Эксперименты: разные гиперпараметры

In [None]:
# Each tuple is one configuration, in the positional order expected by run_experiment:
# (filters, kernel_size, dense_units, dropout_rate, batch_size, epochs)
param_grid = [
    (16, 2, 32, 0.3, 32, 10),
    (32, 3, 64, 0.5, 64, 10),
    (64, 2, 128, 0.5, 32, 15),
]

# Run the configurations one after another, collecting each result as it finishes
# so that completed runs are kept even if a later one fails.
results = []
for params in param_grid:
    result = run_experiment(*params)
    results += [result]


Training with params:
(filters, kernel_size, dense_units, dropout_rate, batch_size, epochs) 
(16, 2, 32, 0.3, 32, 10)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3ms/step - accuracy: 0.9443 - loss: 0.1372 - val_accuracy: 0.9480 - val_loss: 0.1198
Epoch 2/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3ms/step - accuracy: 0.9475 - loss: 0.1208 - val_accuracy: 0.9498 - val_loss: 0.1160
Epoch 3/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 4ms/step - accuracy: 0.9475 - loss: 0.1195 - val_accuracy: 0.9493 - val_loss: 0.1165
Epoch 4/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 3ms/step - accuracy: 0.9474 - loss: 0.1198 - val_accuracy: 0.9494 - val_loss: 0.1163
Epoch 5/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 4ms/step - accuracy: 0.9479 - loss: 0.1178 - val_accuracy: 0.9494 - val_loss: 0.1156
Epoch 6/10
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 4ms/step - accuracy: 0.9484 - loss: 0.1176 - val_accuracy: 0.9498 - val_loss: 0.115

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - accuracy: 0.9451 - loss: 0.1344 - val_accuracy: 0.9486 - val_loss: 0.1169
Epoch 2/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 5ms/step - accuracy: 0.9475 - loss: 0.1202 - val_accuracy: 0.9498 - val_loss: 0.1161
Epoch 3/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 4ms/step - accuracy: 0.9477 - loss: 0.1196 - val_accuracy: 0.9497 - val_loss: 0.1149
Epoch 4/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 4ms/step - accuracy: 0.9484 - loss: 0.1179 - val_accuracy: 0.9501 - val_loss: 0.1144
Epoch 5/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 5ms/step - accuracy: 0.9483 - loss: 0.1179 - val_accuracy: 0.9498 - val_loss: 0.1142
Epoch 6/10
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 4ms/step - accuracy: 0.9485 - loss: 0.1178 - val_accuracy: 0.9506 - val_loss: 0.114

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 5ms/step - accuracy: 0.9449 - loss: 0.1299 - val_accuracy: 0.9493 - val_loss: 0.1166
Epoch 2/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 6ms/step - accuracy: 0.9481 - loss: 0.1184 - val_accuracy: 0.9497 - val_loss: 0.1153
Epoch 3/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 5ms/step - accuracy: 0.9478 - loss: 0.1184 - val_accuracy: 0.9496 - val_loss: 0.1152
Epoch 4/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 5ms/step - accuracy: 0.9484 - loss: 0.1177 - val_accuracy: 0.9497 - val_loss: 0.1152
Epoch 5/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 5ms/step - accuracy: 0.9479 - loss: 0.1176 - val_accuracy: 0.9497 - val_loss: 0.1145
Epoch 6/15
[1m20972/20972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 5ms/step - accuracy: 0.9486 - loss: 0.1167 - val_accuracy: 0.9501 - val_loss:

## 12. Вывод результатов

In [None]:
# Collect the per-experiment result dicts into one table.
results_df = pd.DataFrame(results)
print("\n=== Results ===")
# Leave the frame as the cell's last expression so the notebook renders it as
# a single table; print() wraps the wide frame across several text blocks.
results_df


=== Results ===
                   timestamp  filters  kernel_size  dense_units  dropout  \
0 2025-05-30 10:10:50.734952       16            2           32      0.3   
1 2025-05-30 10:22:02.266467       32            3           64      0.5   
2 2025-05-30 10:55:22.704027       64            2          128      0.5   

   epochs  batch_size  val_accuracy  roc_accuracy  val_loss  
0      10          32      0.949264      0.965334  0.114787  
1      10          64      0.949808      0.966059  0.113860  
2      15          32      0.949713      0.966029  0.114386  
