In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid-19-prediction/master_dataset.csv


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the master dataset CSV from the Kaggle input directory.
# low_memory=False tells pandas not to parse the file in chunks, which avoids
# mixed-type inference within a column (see the pandas read_csv documentation).
df=pd.read_csv("/kaggle/input/covid-19-prediction/master_dataset.csv",low_memory=False)

In [4]:
# Shift every header one position to the left: the first header name is discarded,
# a placeholder name is appended for the last column, and that last column is then
# dropped.
# NOTE: `df` is reassigned here, so running this cell a second time shifts the
# headers again. Re-run from the data-loading cell instead.
current_columns = list(df.columns)
new_columns = [*current_columns[1:], 'empty_column']

df_fixed = df.copy()
df_fixed.columns = new_columns
df = df_fixed.drop(columns=['empty_column'])

In [5]:
# Keep only the columns used by the rest of the notebook. 'covid19_positive' is
# the column later split off as the prediction target.
selected_columns = [
    'sex', 'age', 'bmi',
    'smoking', 'alcohol', 'cannabis', 'amphetamines', 'cocaine',
    'contacts_count', 'working',
    'rate_reducing_risk_single', 'rate_reducing_mask',
    'covid19_symptoms', 'covid19_contact',
    'asthma', 'kidney_disease', 'liver_disease', 'compromised_immune',
    'heart_disease', 'lung_disease', 'diabetes', 'hiv_positive',
    'hypertension', 'other_chronic',
    'nursing_home', 'health_worker',
    'covid19_positive',
]
df = df[selected_columns]

In [6]:
def process_age_column(df, age_column='age'):
    """
    Convert an age column of string ranges such as '20_30' into numeric midpoints.

    Each 'start_end' string becomes (start + end) / 2. Values that are already
    numeric are kept as they are, so applying the function to its own output
    returns the same values instead of turning the whole column into NaN.
    Missing or unparseable entries are replaced with the mean of the values
    that could be converted.

    Parameters:
    df: pandas DataFrame
    age_column: str, name of the age column (default='age')

    Returns:
    pandas Series with processed age values. If no entry can be converted,
    the mean is NaN and the result stays all-NaN.
    """
    def extract_average(age_str):
        if pd.isna(age_str):
            return np.nan
        # Already-numeric ages pass through unchanged. bool is excluded because
        # it is a subclass of int but is not an age.
        is_number = isinstance(age_str, (int, float, np.integer, np.floating))
        if is_number and not isinstance(age_str, bool):
            return float(age_str)
        try:
            # Split the string on '_' and convert both bounds to integers.
            # A ValueError covers both non-integer text and a wrong number of parts.
            start, end = map(int, str(age_str).split('_'))
        except ValueError:
            return np.nan
        return (start + end) / 2

    # Convert string ranges to averages
    processed_ages = df[age_column].apply(extract_average)

    # Replace null values with the mean of the converted ages
    mean_age = processed_ages.mean()
    processed_ages = processed_ages.fillna(mean_age)

    return processed_ages

# Overwrite the raw age ranges with their numeric midpoints
# (age_column defaults to 'age').
df['age'] = process_age_column(df)

In [7]:
# Fill gaps in each listed column with that column's most frequent value.
# Columns that are not present in df are skipped.
categorical_cols = [
    'sex', 'age', 'smoking', 'alcohol',
    'working', 'cannabis', 'amphetamines', 'cocaine',
]
for col in categorical_cols:
    if col not in df.columns:
        continue
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [8]:
# Fill gaps in each listed numeric column with that column's mean.
# Columns that are not present in df are skipped.
numerical_cols = ['bmi', 'contacts_count', 'rate_reducing_mask']
for col in numerical_cols:
    if col not in df.columns:
        continue
    column_mean = df[col].mean()
    df[col] = df[col].fillna(column_mean)

In [9]:
# One-hot encode the nominal columns; each is replaced by one indicator column
# per distinct value.
nominal_cols = ['sex', 'smoking', 'working']
df_encoded = pd.get_dummies(df, columns=nominal_cols)

# Force the drug-use columns to numeric. errors='coerce' turns any value that
# cannot be parsed as a number into NaN rather than raising.
drug_cols = ['cannabis', 'amphetamines', 'cocaine']
for col in drug_cols:
    if col not in df_encoded.columns:
        continue
    df_encoded[col] = pd.to_numeric(df_encoded[col], errors='coerce')

df = df_encoded

In [10]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Min-max scale every column except the target.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed later in the notebook, so the held-out rows contribute to the
# fitted min/max. Consider fitting on the training rows only.
scaler = MinMaxScaler()
cols_to_normalize = df.columns[df.columns != 'covid19_positive'].tolist()

scaled_values = scaler.fit_transform(df[cols_to_normalize])
df_normalized = df.copy()
df_normalized[cols_to_normalize] = scaled_values
df = df_normalized

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# -------------------------------
# Assume data is preprocessed by the earlier cells.
# The dataframe 'df' holds the feature columns plus a binary target
# 'covid19_positive'.
# NOTE(review): an earlier note here quoted 27 features; the real width is
# X.shape[1] after one-hot encoding. Confirm before quoting a number.
# -------------------------------
# Separate the feature matrix from the target vector
feature_frame = df.drop(columns=['covid19_positive'])
X = feature_frame.values  # shape: (n_samples, n_features)
y = df['covid19_positive'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# -------------------------------
# 1. Build a deeper autoencoder that compresses the input features to 15 dimensions
# -------------------------------
encoding_dim = 15
input_dim = X_train.shape[1]  # actual feature count, read from the data

input_layer = Input(shape=(input_dim,))

# Encoder: two Dense -> LeakyReLU -> BatchNormalization stages (128, then 64 units),
# followed by the bottleneck Dense -> LeakyReLU of width encoding_dim.
encoded = input_layer
for units in (128, 64):
    encoded = Dense(units)(encoded)
    encoded = LeakyReLU(alpha=0.01)(encoded)
    encoded = BatchNormalization()(encoded)
encoded = Dense(encoding_dim)(encoded)
encoded = LeakyReLU(alpha=0.01)(encoded)

# Decoder: mirror of the encoder (64, then 128 units), ending in a sigmoid layer
# as wide as the input.
decoded = encoded
for units in (64, 128):
    decoded = Dense(units)(decoded)
    decoded = LeakyReLU(alpha=0.01)(decoded)
    decoded = BatchNormalization()(decoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Callbacks: stop after 30 epochs without val_loss improvement (restoring the best
# weights) and halve the learning rate after 15 stagnant epochs, down to 1e-6.
early_stop_ae = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
reduce_lr_ae = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=15, min_lr=1e-6)

print("Training deeper autoencoder...")
ae_fit_options = dict(
    validation_split=0.2,
    epochs=1000,
    batch_size=16,
    callbacks=[early_stop_ae, reduce_lr_ae],
    verbose=1,
)
# The autoencoder is trained to reproduce its own input, so X_train is both x and y.
autoencoder.fit(X_train, X_train, **ae_fit_options)

# The encoder shares the trained layers up to the bottleneck; use it to project
# both splits into the encoding_dim-dimensional space.
encoder = Model(inputs=input_layer, outputs=encoded)
X_train_encoded, X_test_encoded = (encoder.predict(split) for split in (X_train, X_test))

# -------------------------------
# 2. Build and train an optimized ANN on the encoded features
# -------------------------------
def create_optimized_ann(input_dim):
    """
    Build and compile the binary classifier trained on the encoded features.

    Architecture: three hidden stages of Dense -> LeakyReLU(0.01) ->
    BatchNormalization -> Dropout with (units, dropout) of (128, 0.3),
    (64, 0.2) and (32, 0.1), followed by a single sigmoid output unit.

    Parameters:
    input_dim: int, number of input features per sample

    Returns:
    A compiled Sequential model (Adam with learning rate 0.0005,
    binary cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    # Declare the input width with an explicit Input layer. Passing `input_dim`
    # to the first Dense layer is deprecated in current Keras and emits a
    # UserWarning when the layer is constructed.
    model.add(Input(shape=(input_dim,)))

    # Hidden stages: widths shrink and dropout eases off towards the output.
    for units, dropout_rate in ((128, 0.3), (64, 0.2), (32, 0.1)):
        model.add(Dense(units))
        # NOTE(review): newer Keras versions name this argument `negative_slope`;
        # `alpha` is kept so the notebook still runs on older releases. Confirm
        # against the installed version.
        model.add(LeakyReLU(alpha=0.01))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Output layer for binary classification
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# The classifier consumes the encoder output, so its input width is encoding_dim.
ann_model = create_optimized_ann(input_dim=encoding_dim)

# Callbacks: stop after 30 epochs without val_loss improvement (restoring the best
# weights) and halve the learning rate after 15 stagnant epochs, down to 1e-6.
early_stop_ann = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
reduce_lr_ann = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=15, min_lr=1e-6)

print("\nTraining ANN on encoded features with optimized architecture...")
ann_fit_options = dict(
    validation_split=0.2,  # last 20% of the training rows are used for validation
    epochs=1000,           # upper bound; early stopping normally ends training sooner
    batch_size=64,
    callbacks=[early_stop_ann, reduce_lr_ann],
    verbose=1,
)
ann_model.fit(X_train_encoded, y_train, **ann_fit_options)

# -------------------------------
# 3. Evaluate the optimized model on test data
# -------------------------------
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
ann_probabilities = ann_model.predict(X_test_encoded)
y_pred_encoded = (ann_probabilities > 0.5).astype(int)

print("\nOptimized ANN on Encoded Features Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_encoded))
# zero_division=0 reports 0 instead of warning when a metric's denominator is empty.
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_encoded, zero_division=0))

full_report = classification_report(y_test, y_pred_encoded, zero_division=0)
print("\nClassification Report:\n", full_report)




Training deeper autoencoder...
Epoch 1/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 3ms/step - loss: 0.0105 - val_loss: 0.0017 - learning_rate: 0.0010
Epoch 2/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 3ms/step - loss: 0.0023 - val_loss: 0.0017 - learning_rate: 0.0010
Epoch 3/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 3ms/step - loss: 0.0020 - val_loss: 0.0019 - learning_rate: 0.0010
Epoch 4/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 3ms/step - loss: 0.0018 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 5/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 3ms/step - loss: 0.0017 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 6/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 3ms/step - loss: 0.0016 - val_loss: 0.0015 - learning_rate: 0.0010
Epoch 7/1000
[1m40937/40937[0m [32m━━━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training ANN on encoded features with optimized architecture...
Epoch 1/1000
[1m10235/10235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4ms/step - accuracy: 0.9540 - loss: 0.1385 - val_accuracy: 0.9887 - val_loss: 0.0456 - learning_rate: 5.0000e-04
Epoch 2/1000
[1m10235/10235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 4ms/step - accuracy: 0.9890 - loss: 0.0461 - val_accuracy: 0.9890 - val_loss: 0.0438 - learning_rate: 5.0000e-04
Epoch 3/1000
[1m10235/10235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 4ms/step - accuracy: 0.9890 - loss: 0.0450 - val_accuracy: 0.9891 - val_loss: 0.0433 - learning_rate: 5.0000e-04
Epoch 4/1000
[1m10235/10235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - accuracy: 0.9891 - loss: 0.0446 - val_accuracy: 0.9893 - val_loss: 0.0428 - learning_rate: 5.0000e-04
Epoch 5/1000
[1m10235/10235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - accuracy: 0.9894 - loss: 0.0438 - val_accuracy: 0.98