In [1]:
import pandas as pd
import numpy as np


In [2]:

# Load the dataset using the exact filename
df = pd.read_csv("/kaggle/input/covid-19-reinfection-and-health-dataset/covid_related_disease_data.csv")

# Display first 5 rows to verify
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,...,Reinfection,Date_of_Reinfection,Vaccination_Status,Vaccine_Type,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI
0,1,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,...,No,,Yes,,1,2022-09-22,,Healthcare,Never,27.7
1,2,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,...,No,,No,,0,,,Healthcare,Never,21.9
2,3,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,...,No,,Yes,Janssen,3,2024-05-14,,Unemployed,Never,22.7
3,4,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,...,Yes,2024-08-24,Yes,AstraZeneca,1,2024-10-31,,Office Worker,Never,27.7
4,5,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,...,No,,Yes,,2,2023-07-05,,Student,Never,11.9


In [3]:
df.dtypes

Patient_ID                   int64
Age                          int64
Gender                      object
Region                      object
Preexisting_Condition       object
Date_of_Infection           object
COVID_Strain                object
Symptoms                    object
Severity                    object
Hospitalized                object
Hospital_Admission_Date     object
Hospital_Discharge_Date     object
ICU_Admission               object
Ventilator_Support          object
Recovered                   object
Date_of_Recovery            object
Reinfection                 object
Date_of_Reinfection         object
Vaccination_Status          object
Vaccine_Type                object
Doses_Received               int64
Date_of_Last_Dose           object
Long_COVID_Symptoms         object
Occupation                  object
Smoking_Status              object
BMI                        float64
dtype: object

In [4]:
df.isnull().sum()

Patient_ID                    0
Age                           0
Gender                        0
Region                        0
Preexisting_Condition       469
Date_of_Infection             0
COVID_Strain                  0
Symptoms                      0
Severity                      0
Hospitalized                  0
Hospital_Admission_Date    2124
Hospital_Discharge_Date    2124
ICU_Admission                 0
Ventilator_Support            0
Recovered                     0
Date_of_Recovery           1492
Reinfection                   0
Date_of_Reinfection        2715
Vaccination_Status            0
Vaccine_Type               1809
Doses_Received                0
Date_of_Last_Dose          1528
Long_COVID_Symptoms        2780
Occupation                    0
Smoking_Status                0
BMI                           0
dtype: int64

In [5]:
df['Preexisting_Condition'] = df['Preexisting_Condition'].fillna('None')

In [6]:
df.loc[df['Hospitalized'] == 'No',['Hospital_Admission_Date','Hospital_Discharge_Date']] = pd.NA

In [7]:
print(df[df['Date_of_Recovery'].isnull()]['Recovered'].value_counts())

Recovered
No    1492
Name: count, dtype: int64


In [8]:
df['Date_of_Recovery'] = pd.to_datetime(df['Date_of_Recovery'])
print(df['Date_of_Recovery'].isnull().sum())  # Should be 1492 (all `Recovered=="No"`)

1492


In [9]:
#all null values for Date_of_Recovery is not Recovered yet add a placeholder
df['Date_of_Recovery'] = df['Date_of_Recovery'].fillna(pd.to_datetime('2099-12-31'))
df['Date_of_Recovery'].isnull().sum()

0

In [10]:
#all null values for Date_of_Reinfection doesn't have a Reinfection keep null
print(df[df['Date_of_Reinfection'].isnull()]['Reinfection'].value_counts())

Reinfection
No    2715
Name: count, dtype: int64


In [11]:
print(df[df['Vaccine_Type'].isnull()]['Vaccination_Status'].value_counts())

Vaccination_Status
No     1528
Yes     281
Name: count, dtype: int64


In [12]:
#for non vaccined patient keep vaccine type null else make it unknown
df.loc[df['Vaccination_Status'] == 'Yes','Vaccine_Type'] = 'Unknown'

In [13]:
#vaccine type is either unknown or nan drop column
df['Vaccine_Type'].unique()

array(['Unknown', nan], dtype=object)

In [14]:
df.drop('Vaccine_Type',axis=1)

Unnamed: 0,Patient_ID,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,...,Date_of_Recovery,Reinfection,Date_of_Reinfection,Vaccination_Status,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI
0,1,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,...,2023-04-19,No,,Yes,1,2022-09-22,,Healthcare,Never,27.7
1,2,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,...,2099-12-31,No,,No,0,,,Healthcare,Never,21.9
2,3,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,...,2099-12-31,No,,Yes,3,2024-05-14,,Unemployed,Never,22.7
3,4,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,...,2025-02-09,Yes,2024-08-24,Yes,1,2024-10-31,,Office Worker,Never,27.7
4,5,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,...,2099-12-31,No,,Yes,2,2023-07-05,,Student,Never,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,43,Male,Nordjylland,Hypertension,2022-10-19,XBB.1.5,Severe,Critical,No,...,2099-12-31,No,,Yes,1,2024-09-20,,Driver,Never,22.0
2996,2997,36,Female,Syddanmark,Obesity,2022-12-16,Omicron,Moderate,Low,No,...,2099-12-31,No,,Yes,2,2023-10-05,,Healthcare,Never,27.8
2997,2998,75,Female,Sjælland,Cardiovascular,2023-09-30,Beta,Severe,Moderate,No,...,2099-12-31,No,,Yes,3,2023-05-13,,Teacher,Former,20.9
2998,2999,45,Female,Hovedstaden,Asthma,2023-06-06,Delta,Severe,Moderate,No,...,2099-12-31,No,,Yes,1,2024-05-13,,Student,Never,19.3


In [15]:
#all non vaccinated patients have null values at the Date_of_Last_Dose , keep null
print(df[df['Date_of_Last_Dose'].isnull()]['Vaccination_Status'].value_counts())

Vaccination_Status
No    1528
Name: count, dtype: int64


In [16]:
#assign null covid symptoms with none
df['Long_COVID_Symptoms'] = df['Long_COVID_Symptoms'].fillna('None')

In [17]:
df.duplicated().sum()

0

In [18]:
df

Unnamed: 0,Patient_ID,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,...,Reinfection,Date_of_Reinfection,Vaccination_Status,Vaccine_Type,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI
0,1,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,...,No,,Yes,Unknown,1,2022-09-22,,Healthcare,Never,27.7
1,2,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,...,No,,No,,0,,,Healthcare,Never,21.9
2,3,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,...,No,,Yes,Unknown,3,2024-05-14,,Unemployed,Never,22.7
3,4,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,...,Yes,2024-08-24,Yes,Unknown,1,2024-10-31,,Office Worker,Never,27.7
4,5,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,...,No,,Yes,Unknown,2,2023-07-05,,Student,Never,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,43,Male,Nordjylland,Hypertension,2022-10-19,XBB.1.5,Severe,Critical,No,...,No,,Yes,Unknown,1,2024-09-20,,Driver,Never,22.0
2996,2997,36,Female,Syddanmark,Obesity,2022-12-16,Omicron,Moderate,Low,No,...,No,,Yes,Unknown,2,2023-10-05,,Healthcare,Never,27.8
2997,2998,75,Female,Sjælland,Cardiovascular,2023-09-30,Beta,Severe,Moderate,No,...,No,,Yes,Unknown,3,2023-05-13,,Teacher,Former,20.9
2998,2999,45,Female,Hovedstaden,Asthma,2023-06-06,Delta,Severe,Moderate,No,...,No,,Yes,Unknown,1,2024-05-13,,Student,Never,19.3


In [19]:
df = df.drop('Patient_ID',axis=1)
df

Unnamed: 0,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,Hospital_Admission_Date,...,Reinfection,Date_of_Reinfection,Vaccination_Status,Vaccine_Type,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI
0,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,2025-01-13,...,No,,Yes,Unknown,1,2022-09-22,,Healthcare,Never,27.7
1,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,,...,No,,No,,0,,,Healthcare,Never,21.9
2,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,2025-03-07,...,No,,Yes,Unknown,3,2024-05-14,,Unemployed,Never,22.7
3,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,,...,Yes,2024-08-24,Yes,Unknown,1,2024-10-31,,Office Worker,Never,27.7
4,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,,...,No,,Yes,Unknown,2,2023-07-05,,Student,Never,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,43,Male,Nordjylland,Hypertension,2022-10-19,XBB.1.5,Severe,Critical,No,,...,No,,Yes,Unknown,1,2024-09-20,,Driver,Never,22.0
2996,36,Female,Syddanmark,Obesity,2022-12-16,Omicron,Moderate,Low,No,,...,No,,Yes,Unknown,2,2023-10-05,,Healthcare,Never,27.8
2997,75,Female,Sjælland,Cardiovascular,2023-09-30,Beta,Severe,Moderate,No,,...,No,,Yes,Unknown,3,2023-05-13,,Teacher,Former,20.9
2998,45,Female,Hovedstaden,Asthma,2023-06-06,Delta,Severe,Moderate,No,,...,No,,Yes,Unknown,1,2024-05-13,,Student,Never,19.3


In [20]:
X = df.drop('Recovered',axis=1)
y = df['Recovered']

In [21]:
# Remove features that happen AFTER recovery (leak the answer)
X = X.drop(columns=[
    'Date_of_Recovery',          # Directly shows recovery status
    'Hospital_Discharge_Date',   # Only exists if patient recovered
    'Date_of_Reinfection'        # Only exists for reinfected patients
], errors='ignore')  # <- Safe even if some columns don't exist

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder

In [23]:
num_values = ['Age','Doses_Received','BMI']
one_hot_values = ['Gender', 'Region', 'Vaccination_Status',
                   'Occupation', 'Smoking_Status','Hospitalized',
                  'ICU_Admission','Ventilator_Support','Reinfection']
ordinal_values = ['Symptoms','Severity','Preexisting_Condition','COVID_Strain','Long_COVID_Symptoms','Occupation']
date_values = ['Date_of_Infection','Hospital_Admission_Date',
               'Date_of_Last_Dose']

In [24]:
# Convert to datetime, forcing invalid dates to NaT (Not a Time)
X[date_values] = X[date_values].apply(pd.to_datetime, errors='coerce')

# Verify conversion
print(X[date_values].dtypes)

Date_of_Infection          datetime64[ns]
Hospital_Admission_Date    datetime64[ns]
Date_of_Last_Dose          datetime64[ns]
dtype: object


In [25]:
for date in date_values:
  X[f'{date}_days'] = (pd.to_datetime('today') - pd.to_datetime(X[date])).dt.days.fillna(0)
X.drop(date_values, axis=1, inplace=True)

In [26]:
#add number of days to num_values
num_values.extend(f'{col}_days' for col in date_values)

In [27]:
process = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),num_values),
        ('one_hot',OneHotEncoder(),one_hot_values),
        ('Label',OrdinalEncoder(),ordinal_values)
    ]
)

In [28]:
X_train,X_test,y_train,y_test  = train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
encode = LabelEncoder()
y_train_scaled = encode.fit_transform(y_train)
y_test_scaled = encode.transform(y_test)

In [30]:
print("Columns in X_train:", list(X_train.columns))

Columns in X_train: ['Age', 'Gender', 'Region', 'Preexisting_Condition', 'COVID_Strain', 'Symptoms', 'Severity', 'Hospitalized', 'ICU_Admission', 'Ventilator_Support', 'Reinfection', 'Vaccination_Status', 'Vaccine_Type', 'Doses_Received', 'Long_COVID_Symptoms', 'Occupation', 'Smoking_Status', 'BMI', 'Date_of_Infection_days', 'Hospital_Admission_Date_days', 'Date_of_Last_Dose_days']


In [31]:
X_train_scaled = process.fit_transform(X_train)
X_test_scaled = process.transform(X_test)

In [32]:
model1 = LogisticRegression()
model1.fit(X_train_scaled,y_train_scaled)
y1_predict = model1.predict(X_test_scaled)

In [33]:
acc1 = accuracy_score(y_test_scaled,y1_predict)
recall1 = recall_score(y_test_scaled,y1_predict)
precision1 = precision_score(y_test_scaled,y1_predict)
f11 = f1_score(y_test_scaled,y1_predict)

In [34]:
model2 = SVC(kernel='linear')
model2.fit(X_train_scaled,y_train_scaled)
y2_predict = model2.predict(X_test_scaled)

In [35]:
acc2 = accuracy_score(y_test_scaled,y2_predict)
recall2 = recall_score(y_test_scaled,y2_predict)
precision2 = precision_score(y_test_scaled,y2_predict)
f12 = f1_score(y_test_scaled,y2_predict)

In [36]:
print('Logistic Regression Model')
print(f'Accuracy: {acc1*100:.2f}%')
print(f'Recall" {recall1*100:.2f}%')
print(f'Precision: {precision1*100:.2f}5')
print(f'F1-Score: {f11*100:.2f}%')

Logistic Regression Model
Accuracy: 48.33%
Recall" 43.55%
Precision: 45.795
F1-Score: 44.64%


In [37]:
print(classification_report(y_test_scaled, y1_predict))

              precision    recall  f1-score   support

           0       0.50      0.53      0.52       313
           1       0.46      0.44      0.45       287

    accuracy                           0.48       600
   macro avg       0.48      0.48      0.48       600
weighted avg       0.48      0.48      0.48       600



In [38]:
print('SVM Model')
print(f'Accuracy: {acc2*100:.2f}%')
print(f'Recall" {recall2*100:.2f}%')
print(f'Precision: {precision2*100:.2f}%')
print(f'F1-Score: {f12*100:.2f}%')

SVM Model
Accuracy: 50.67%
Recall" 33.80%
Precision: 47.78%
F1-Score: 39.59%


In [39]:
#too low accuracy but i don't know what to do
print(classification_report(y_test_scaled, y2_predict))

              precision    recall  f1-score   support

           0       0.52      0.66      0.58       313
           1       0.48      0.34      0.40       287

    accuracy                           0.51       600
   macro avg       0.50      0.50      0.49       600
weighted avg       0.50      0.51      0.49       600

