In [3]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Load the dataset
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
data_path = r"C:\SUDHA\Personal portfolio project\datasets\covid.csv"  # Raw string so backslashes are not read as escapes
data = pd.read_csv(data_path)

# Inspect the dataset
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
data.info()
print("\nFirst few rows:")
print(data.head())

import numpy as np

# Discard the 'Country' column when present; absent columns are simply ignored
data = data.drop(columns=['Country'], errors='ignore')

# Attach a binary 'Outcome' label (1 = positive, 0 = negative).
# NOTE(review): the labels are sampled at random and are independent of the
# features, so they act only as a placeholder target - confirm whether a real
# outcome column should be used instead.
np.random.seed(42)  # Fixed seed so the sampled labels are repeatable
data['Outcome'] = np.random.choice([0, 1], size=len(data))

# Drop incomplete rows if any cell in the frame is missing
if data.isna().any(axis=None):
    print("\nMissing values detected! Handling missing values...")
    data = data.dropna(axis=0, how='any')  # Simplest strategy: discard rows with gaps

# Separate the target (y) from the feature matrix (X)
target_column = "Outcome"  # Name of the label column created above
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training portion
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows: hard class labels plus the probability of class 1
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Label-based metrics use y_pred; ROC-AUC is computed from the class-1 probability
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Report every metric rounded to two decimals
print("\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# Save the trained model
model_path = r"C:\SUDHA\Personal portfolio project\Models\covid_rf_model.pkl"  # Raw string for model path
# Make sure the output folder exists before writing: joblib.dump and to_csv
# both fail with FileNotFoundError when it is missing. The "or '.'" fallback
# covers a path with no directory component (current directory).
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
joblib.dump(rf_model, model_path)
print(f"\nModel saved as '{model_path}'")

# Optional: rank the features by the importance the fitted forest assigns them
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

# Save feature importances to a CSV file (same folder as the model file)
feature_importances.to_csv(r"C:\SUDHA\Personal portfolio project\Models\feature_importances.csv", index=False)
print("\nFeature importances saved to 'feature_importances.csv'")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316800 entries, 0 to 316799
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   Fever                    316800 non-null  int64
 1   Tiredness                316800 non-null  int64
 2   Dry-Cough                316800 non-null  int64
 3   Difficulty-in-Breathing  316800 non-null  int64
 4   Sore-Throat              316800 non-null  int64
 5   None_Sympton             316800 non-null  int64
 6   Pains                    316800 non-null  int64
 7   Nasal-Congestion         316800 non-null  int64
 8   Runny-Nose               316800 non-null  int64
 9   Diarrhea                 316800 non-null  int64
 10  None_Experiencing        316800 non-null  int64
 11  Age_0-9                  316800 non-null  int64
 12  Age_10-19                316800 non-null  int64
 13  Age_20-24                316800 non-null  int64
 14  Age_25-59             

In [11]:
import numpy as np
import pandas as pd
import joblib

# Load the saved model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
model_path = r"C:\SUDHA\Personal portfolio project\Models\covid_rf_model.pkl"
rf_model = joblib.load(model_path)

# Get the feature names from the trained model; predict_covid() uses them to
# order the columns of the single-row input frame.
# NOTE(review): presumably populated because the model was fitted on a
# DataFrame with string column names - confirm for models trained elsewhere.
expected_features = rf_model.feature_names_in_

# Define a function to take user input and make a prediction
def predict_covid(model=None, feature_names=None, input_fn=input):
    """Interactively collect yes/no answers and print a COVID-19 prediction.

    Asks one question per symptom, age band, gender and contact indicator,
    builds a single-row DataFrame whose columns follow ``feature_names``, and
    prints the predicted class together with the probability of that class.

    Parameters
    ----------
    model : object, optional
        Fitted classifier exposing ``predict`` and ``predict_proba``.
        Defaults to the module-level ``rf_model``.
    feature_names : sequence of str, optional
        Column names in the order the model expects. Defaults to the
        module-level ``expected_features`` when ``model`` is omitted, otherwise
        to ``model.feature_names_in_``.
    input_fn : callable, optional
        Called with each prompt string and must return the answer as text.
        Defaults to the built-in ``input``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    KeyError
        If the model expects a feature this form neither asks for nor
        defaults. Raised before any question is asked.

    Notes
    -----
    A ``ValueError`` (non-numeric answer, or a value other than 0/1) is
    reported on stdout and ends the call without a prediction.
    """
    # Resolve defaults lazily so the module-level globals are only needed
    # when the caller does not supply their own model / feature list.
    if feature_names is None:
        feature_names = expected_features if model is None else model.feature_names_in_
    if model is None:
        model = rf_model

    # (feature name, prompt text) pairs, asked in this order.
    questions = (
        ("Fever", "Fever: "),
        ("Tiredness", "Tiredness: "),
        ("Dry-Cough", "Dry Cough: "),
        ("Difficulty-in-Breathing", "Difficulty in Breathing: "),
        ("Sore-Throat", "Sore Throat: "),
        ("Pains", "Pains: "),
        ("Nasal-Congestion", "Nasal Congestion: "),
        ("Runny-Nose", "Runny Nose: "),
        ("Diarrhea", "Diarrhea: "),
        ("Age_0-9", "Age 0-9 (1 for Yes, 0 for No): "),
        ("Age_10-19", "Age 10-19 (1 for Yes, 0 for No): "),
        ("Age_20-24", "Age 20-24 (1 for Yes, 0 for No): "),
        ("Age_25-59", "Age 25-59 (1 for Yes, 0 for No): "),
        ("Age_60+", "Age 60+ (1 for Yes, 0 for No): "),
        ("Gender_Female", "Gender Female (1 for Yes, 0 for No): "),
        ("Gender_Male", "Gender Male (1 for Yes, 0 for No): "),
        ("Gender_Transgender", "Gender Transgender (1 for Yes, 0 for No): "),
        ("Contact_Yes", "Contact with COVID-19 Positive Person (1 for Yes, 0 for No): "),
        ("Contact_No", "No Contact with COVID-19 Positive Person (1 for Yes, 0 for No): "),
        ("Contact_Dont-Know", "Uncertain Contact with COVID-19 Positive Person (1 for Yes, 0 for No): "),
    )
    # Features that are never asked for and are always sent to the model as 0.
    fixed_values = {
        "None_Sympton": 0,
        "None_Experiencing": 0,
        "Severity_Mild": 0,
        "Severity_Moderate": 0,
        "Severity_Severe": 0,
        "Severity_None": 0,
    }

    # Fail fast, before prompting, if the model needs a column this form
    # cannot supply (previously a bare KeyError after all answers were typed).
    available = {name for name, _ in questions} | set(fixed_values)
    missing = [name for name in feature_names if name not in available]
    if missing:
        raise KeyError(f"Model expects features this form does not collect: {missing}")

    print("Enter the following details (1 for Yes, 0 for No):")
    try:
        # Collect user inputs; every answer must be exactly 0 or 1
        user_inputs = dict(fixed_values)
        for name, prompt in questions:
            answer = int(input_fn(prompt))  # ValueError on non-numeric text
            if answer not in (0, 1):
                raise ValueError(f"{name} must be 0 or 1, got {answer}")
            user_inputs[name] = answer

        # Align input features with the expected order
        features = pd.DataFrame([[user_inputs[feature] for feature in feature_names]], columns=feature_names)

        # Make a prediction; column 1 of predict_proba is the positive class
        prediction = model.predict(features)
        probability = model.predict_proba(features)[:, 1][0]

        # Output the result, quoting the probability of the predicted class
        if prediction[0] == 1:
            print(f"\nThe person is likely to have COVID-19 with a confidence of {probability:.2%}.")
        else:
            print(f"\nThe person is unlikely to have COVID-19 with a confidence of {(1 - probability):.2%}.")

    except ValueError as e:
        print("\nInvalid input! Please enter 1 or 0 for all symptoms and details.")
        print(f"Error: {e}")

# Run the interactive predictor once; it prompts on stdin for each answer
# and prints the result.
predict_covid()


Enter the following details (1 for Yes, 0 for No):

The person is likely to have COVID-19 with a confidence of 50.61%.
