Importing the Dependencies

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Data Collection & Processing

In [2]:
covid_dataset = pd.read_csv('Covid_Data.csv')

In [3]:
covid_dataset.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65.0,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
1,2,1,2,1,03/06/2020,97,1,72.0,97.0,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,5.0,97.0
2,2,1,2,2,09/06/2020,1,2,55.0,97.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0
3,2,1,1,1,12/06/2020,97,2,53.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
4,2,1,2,1,21/06/2020,97,2,68.0,97.0,1.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0


In [5]:
covid_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248319 entries, 0 to 248318
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   USMER                 248319 non-null  int64  
 1   MEDICAL_UNIT          248319 non-null  int64  
 2   SEX                   248319 non-null  int64  
 3   PATIENT_TYPE          248319 non-null  int64  
 4   DATE_DIED             248319 non-null  object 
 5   INTUBED               248319 non-null  int64  
 6   PNEUMONIA             248319 non-null  int64  
 7   AGE                   248318 non-null  float64
 8   PREGNANT              248318 non-null  float64
 9   DIABETES              248318 non-null  float64
 10  COPD                  248318 non-null  float64
 11  ASTHMA                248318 non-null  float64
 12  INMSUPR               248318 non-null  float64
 13  HIPERTENSION          248318 non-null  float64
 14  OTHER_DISEASE         248318 non-null  float64
 15  

In [8]:
# Check for missing values
covid_dataset.isnull().sum()

Unnamed: 0,0
USMER,0
MEDICAL_UNIT,0
SEX,0
PATIENT_TYPE,0
DATE_DIED,0
INTUBED,0
PNEUMONIA,0
AGE,1
PREGNANT,1
DIABETES,1


In [43]:
# Strategy for handling missing values:
# - Use mean for numeric columns.
# - For categorical columns, use the most frequent value.

numeric_columns = covid_dataset.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_columns = covid_dataset.select_dtypes(include=['object', 'bool']).columns.tolist()


In [12]:
# Impute missing values for numeric columns
imputer_numeric = SimpleImputer(strategy='mean')
covid_dataset[numeric_columns] = imputer_numeric.fit_transform(covid_dataset[numeric_columns])


In [14]:
# Handle missing values for categorical columns
if categorical_columns:  # Ensure there are categorical columns to process
    imputer_categorical = SimpleImputer(strategy='most_frequent')
    covid_dataset[categorical_columns] = pd.DataFrame(
        imputer_categorical.fit_transform(covid_dataset[categorical_columns]),
        columns=categorical_columns
    )

In [15]:
covid_dataset.isnull().sum()

Unnamed: 0,0
USMER,0
MEDICAL_UNIT,0
SEX,0
PATIENT_TYPE,0
DATE_DIED,0
INTUBED,0
PNEUMONIA,0
AGE,0
PREGNANT,0
DIABETES,0


In [16]:
# Define features (X) and target variable (Y)
if 'CLASIFFICATION_FINAL' not in covid_dataset.columns:
    raise ValueError("The 'CLASIFFICATION_FINAL' column (target variable) is missing from the dataset.")
X = covid_dataset.drop(columns=['CLASIFFICATION_FINAL'], axis=1)  # Drop target variable
Y = covid_dataset['CLASIFFICATION_FINAL']

In [21]:
print(X)

        USMER  MEDICAL_UNIT  SEX  PATIENT_TYPE  DATE_DIED  INTUBED  PNEUMONIA  \
0         2.0           1.0  1.0           1.0        1.0     97.0        1.0   
1         2.0           1.0  2.0           1.0        1.0     97.0        1.0   
2         2.0           1.0  2.0           2.0        1.0      1.0        2.0   
3         2.0           1.0  1.0           1.0        1.0     97.0        2.0   
4         2.0           1.0  2.0           1.0        1.0     97.0        2.0   
...       ...           ...  ...           ...        ...      ...        ...   
248314    2.0           4.0  2.0           1.0        0.0     97.0        2.0   
248315    1.0           4.0  1.0           1.0        0.0     97.0        2.0   
248316    2.0           4.0  1.0           2.0        0.0      2.0        2.0   
248317    2.0           4.0  2.0           2.0        0.0      2.0        2.0   
248318    2.0           4.0  2.0           1.0        0.0     97.0        2.0   

              AGE   PREGNAN

In [22]:
print(Y)

0         3.000000
1         5.000000
2         3.000000
3         7.000000
4         3.000000
            ...   
248314    7.000000
248315    7.000000
248316    7.000000
248317    7.000000
248318    4.499058
Name: CLASIFFICATION_FINAL, Length: 248319, dtype: float64


In [44]:
# Check target variable's unique values
print("\nTarget (Y) unique values:\n", Y.unique())


Target (Y) unique values:
 [3.         5.         7.         6.         1.         2.
 4.         4.49905766]


In [45]:
# Handle the target variable (convert to discrete if necessary)
if Y.dtype in ['float64', 'int64']:
    # Example: Convert continuous target into bins (adjust as per your dataset)
    bins = [0, 1, 2, 3]  # Define bin edges
    labels = ['Low', 'Medium', 'High']  # Define class labels
    Y = pd.cut(Y, bins=bins, labels=labels)


In [46]:
# Encode the target variable if categorical
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

In [47]:
# Encode categorical features in X
for column in categorical_columns:
    if column in X.columns:
        X[column] = LabelEncoder().fit_transform(X[column])


In [48]:
# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [49]:

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)


In [50]:
# Check the distribution of the target in training data
print("\nUnique values in target (Y_train):", np.unique(Y_train, return_counts=True))



Unique values in target (Y_train): (array([0, 1, 2, 3]), array([107104,    285,   1003,  90263]))


In [51]:
# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, Y_train)

In [52]:
# Make predictions
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

In [53]:
# Evaluate the model
print("\nModel Evaluation:")
print(f"Training Accuracy: {accuracy_score(Y_train, train_predictions):.2f}")
print(f"Testing Accuracy: {accuracy_score(Y_test, test_predictions):.2f}")
print("\nClassification Report (Testing Data):")
print(classification_report(Y_test, test_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(Y_test, test_predictions))


Model Evaluation:
Training Accuracy: 0.71
Testing Accuracy: 0.61

Classification Report (Testing Data):
              precision    recall  f1-score   support

           0       0.62      0.68      0.65     26624
           1       0.00      0.00      0.00        70
           2       0.04      0.01      0.02       246
           3       0.58      0.53      0.55     22724

    accuracy                           0.61     49664
   macro avg       0.31      0.31      0.31     49664
weighted avg       0.60      0.61      0.60     49664


Confusion Matrix:
[[18130     9    51  8434]
 [   44     0     1    25]
 [  178     1     3    64]
 [10703     3    27 11991]]


In [54]:
# Predict on a new sample
# Replace these values with real data
sample_input = [1, 3, 1, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]  # Example input
sample_input = np.asarray(sample_input).reshape(1, -1)

In [55]:
# Transform the sample input using the fitted scaler
sample_input_scaled = scaler.transform(sample_input)

# Make prediction
prediction = model.predict(sample_input_scaled)

# Output prediction result
print("\nSample Prediction:", prediction)
if prediction[0] == 1:
    print("Prediction: COVID-19 Positive (Severe).")
else:
    print("Prediction: COVID-19 Negative or Mild.")


Sample Prediction: [0]
Prediction: COVID-19 Negative or Mild.


