<h1>Imports</h1>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import logging

<h1>Logging</h1>

In [None]:
# Set up logging: configure the root logger so INFO-and-above records are emitted.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers (some notebook environments install one) — confirm that INFO
# messages actually appear, or consider force=True on Python 3.8+.
logging.basicConfig(level=logging.INFO)

<h1>Function Declarations</h1>

In [None]:
def eda(data):
    """Display five exploratory figures for the patient dataset.

    The figures are shown one after another:

    1. histogram (with KDE) of 'BLOODPRESSURE_SISTOLIC_MEAN',
    2. count plot of 'AGE_PERCENTIL',
    3. box plot of 'BLOODPRESSURE_SISTOLIC_MEAN' grouped by 'AGE_PERCENTIL',
    4. annotated correlation heatmap of all numeric columns,
    5. missing-value map of the first 30 columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'PATIENT_VISIT_IDENTIFIER', 'AGE_PERCENTIL' and
        'BLOODPRESSURE_SISTOLIC_MEAN'; a missing column raises ``KeyError``.
        The caller's frame is left untouched (``drop`` returns a new frame).

    Returns
    -------
    None
        The function only renders figures.
    """
    systolic_col = 'BLOODPRESSURE_SISTOLIC_MEAN'
    age_col = 'AGE_PERCENTIL'

    # The patient visit identifier is excluded from every plot below.
    frame = data.drop(columns=['PATIENT_VISIT_IDENTIFIER'])

    # 1. Distribution of the systolic mean (missing values removed first).
    _, hist_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(frame[systolic_col].dropna(), kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title('Distribution of Systolic Blood Pressure Mean')
    hist_ax.set_xlabel('Systolic Blood Pressure Mean')
    hist_ax.set_ylabel('Count')
    plt.show()

    # 2. Number of rows per age percentile band.
    _, count_ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=age_col, data=frame, ax=count_ax)
    count_ax.set_title('Counts of Age Percentiles')
    count_ax.set_xlabel('Age Percentile')
    plt.setp(count_ax.get_xticklabels(), rotation=45)
    count_ax.set_ylabel('Count')
    plt.show()

    # 3. Systolic mean broken down by age percentile band.
    _, box_ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=age_col, y=systolic_col, data=frame, ax=box_ax)
    box_ax.set_title('Systolic Blood Pressure Mean by Age Percentile')
    box_ax.set_xlabel('Age Percentile')
    plt.setp(box_ax.get_xticklabels(), rotation=45)
    box_ax.set_ylabel('Systolic Blood Pressure Mean')
    plt.show()

    # 4. Pairwise correlations; only numeric columns can be correlated.
    correlations = frame.select_dtypes(include=[np.number]).corr()
    _, corr_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=corr_ax)
    corr_ax.set_title('Correlation Heatmap (Numeric Columns)')
    plt.show()

    # 5. Where values are missing, limited to the first 30 columns.
    missing_mask = frame.iloc[:, :30].isnull()
    _, missing_ax = plt.subplots(figsize=(15, 6))
    sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=missing_ax)
    missing_ax.set_title('Missing Data Heatmap (First 30 Columns)')
    plt.show()


In [None]:
def compute_correlations(data, target_column, top_n=10):
    """Print and return the numeric features most correlated with a target.

    Correlations are pandas' default (Pearson) pairwise correlations between
    each numeric column and ``target_column``. The target itself is removed
    from the ranking: its self-correlation is always 1.0 and would otherwise
    occupy the first slot. Results are sorted from the largest to the smallest
    signed value, so strongly *negative* correlations appear last; columns
    whose correlation is NaN (e.g. constant columns) are sorted to the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; non-numeric columns are ignored.
    target_column : str
        Name of a numeric column in ``data``.
    top_n : int, default 10
        Number of features to report.

    Returns
    -------
    pandas.Series
        The ``top_n`` correlations, indexed by feature name.

    Raises
    ------
    KeyError
        If ``target_column`` is absent from ``data`` or is not numeric.
    """
    # Select only numeric columns for correlation
    numeric_data = data.select_dtypes(include=[np.number])
    if target_column not in numeric_data.columns:
        raise KeyError(
            f"Target column {target_column!r} is missing or not numeric; "
            "cannot compute correlations."
        )

    correlation_with_target = (
        numeric_data.corr()[target_column]
        .drop(labels=target_column)  # a column trivially correlates with itself
        .sort_values(ascending=False)
    )
    top_correlations = correlation_with_target.head(top_n)

    print(f"Top {top_n} Features Correlated with {target_column}:\n")
    print(top_correlations)
    return top_correlations

In [None]:
def load_and_preprocess_data(filepath, run_eda=True):
    """Load the CSV and build an (unfitted) preprocessing transformer.

    Steps performed:

    1. read the CSV at ``filepath``;
    2. optionally show the exploratory plots and print the 20 numeric
       columns most correlated with 'ICU';
    3. drop every column that is more than 50% missing;
    4. separate the 'ICU' target from the features, leaving out the
       'PATIENT_VISIT_IDENTIFIER' column so a row identifier is never used
       as a predictor;
    5. build a ``ColumnTransformer`` that median-imputes and standardises
       numeric columns and mode-imputes and one-hot encodes categorical ones.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    run_eda : bool, default True
        When False, skip the plots and the correlation report (useful for
        repeated or non-interactive runs).

    Returns
    -------
    preprocessor : sklearn.compose.ColumnTransformer
        Unfitted transformer configured for the columns of ``X``.
    X : pandas.DataFrame
        Feature columns that survived the missing-value filter.
    y : pandas.Series
        The 'ICU' target column.

    Raises
    ------
    KeyError
        If the 'ICU' column is absent or was removed by the missing-value
        filter.
    """
    target_column = 'ICU'
    id_column = 'PATIENT_VISIT_IDENTIFIER'
    logger = logging.getLogger(__name__)

    # Load the data
    data = pd.read_csv(filepath)

    # EDA
    if run_eda:
        eda(data)
        compute_correlations(data, target_column, 20)  # For top 20 features correlated with ICU

    # Drop columns with more than 50% missing values. `thresh` is the minimum
    # number of non-null entries a column needs to be kept; it is documented
    # as an integer, hence the explicit ceil (same cut-off as 0.5 * len(data)).
    min_non_null = int(np.ceil(0.5 * len(data)))
    data_dropped = data.dropna(thresh=min_non_null, axis=1)
    logger.info("Dropped %d column(s) with more than 50%% missing values",
                data.shape[1] - data_dropped.shape[1])

    if target_column not in data_dropped.columns:
        raise KeyError(
            f"Target column {target_column!r} is missing from the data or was "
            "dropped for having more than 50% missing values."
        )

    # Separate features and target; the identifier is ignored when absent.
    y = data_dropped[target_column]
    X = data_dropped.drop(columns=[target_column, id_column], errors='ignore')

    # Identify numerical and categorical columns. np.number covers every
    # integer/float width, matching the selection used by the EDA helpers.
    numerical_cols = X.select_dtypes(include=[np.number]).columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns

    # ColumnTransformer drops unlisted columns by default, so make any such
    # omission (e.g. boolean columns) visible instead of silent.
    handled = set(numerical_cols) | set(categorical_cols)
    ignored_cols = [col for col in X.columns if col not in handled]
    if ignored_cols:
        logger.warning("Columns with unhandled dtypes will be ignored by the "
                       "preprocessor: %s", ignored_cols)

    # Preprocessing pipelines for both numeric and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine transformers into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])
    return preprocessor, X, y

In [None]:
def train_and_evaluate_model(X, y, preprocessor):
    """Tune an XGBoost classifier with grid search and report test metrics.

    The data is split 80/20 (stratified on ``y``). A pipeline made of
    ``preprocessor`` followed by an ``XGBClassifier`` is then tuned with
    5-fold cross-validated grid search, optimising recall. Because the whole
    pipeline is the estimator being searched, the imputers and the scaler are
    re-fitted on the training part of every fold, so no validation-fold
    statistics leak into model selection.

    Prints the classification report for the held-out test set and the best
    hyper-parameters found.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns matching those ``preprocessor`` was configured for.
    y : array-like
        Class labels aligned with ``X``.
    preprocessor : transformer
        Unfitted scikit-learn transformer applied before the classifier.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search; its best pipeline is refitted on the full training
        split and can be used directly for prediction.
    """
    # Split the data into training and test sets. Stratifying keeps the class
    # ratio equal in both parts, which matters when the metric is recall.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Create a full pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', XGBClassifier(random_state=42))])

    # Parameters for grid search; the 'model__' prefix routes each entry to
    # the classifier step of the pipeline.
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        # None leaves the depth at XGBoost's own default.
        'model__max_depth': [10, 20, 30, None],
        'model__learning_rate': [0.01, 0.1, 0.3],  # XGBoost-specific parameter
        'model__min_child_weight': [1, 2, 3],  # XGBoost-specific parameter
        'model__subsample': [0.8, 0.9, 1.0],  # XGBoost-specific parameter
    }

    # Search over the whole pipeline so preprocessing is fitted inside each
    # cross-validation fold rather than once on all training rows.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall', n_jobs=-1)

    # Fit on the training data only
    grid_search.fit(X_train, y_train)

    # Evaluate the refitted best pipeline on the held-out test set
    y_pred = grid_search.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Best parameters, reported without the pipeline step prefix
    best_params = {name.partition('__')[2]: value
                   for name, value in grid_search.best_params_.items()}
    print("Best parameters:", best_params)

    return grid_search

In [None]:
def main(filepath='COVID-Full.csv'):
    """Run the complete workflow on one CSV file.

    Loads and preprocesses the data (which also shows the exploratory plots),
    then tunes and evaluates the classifier.

    Parameters
    ----------
    filepath : str or path-like, default 'COVID-Full.csv'
        CSV file to analyse; the default is resolved relative to the current
        working directory.
    """
    preprocessor, X, y = load_and_preprocess_data(filepath)
    train_and_evaluate_model(X, y, preprocessor)

<h1>Script Start</h1>

In [None]:
# Entry point: run the whole workflow when this code is executed directly.
# The guard is false when the code is imported as a module, so importing it
# does not trigger loading, plotting or training.
if __name__ == "__main__":
    main()