In [1]:
# Import libraries\
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


In [2]:
data = pd.read_csv('/home/tsoien/github/MLResearch/backend/data/Disease_symptom_and_patient_profile_dataset.csv')
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())

# Check dataset information
print(df.info())


       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-nul

In [3]:
# Define features (X) and target variable (y)
target_column = 'Outcome Variable'
X = df.drop(columns=[target_column])  # Drop the target column to keep features
y = df[target_column]  # Target column


preprocessing

In [4]:
print(X.head())

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level  
0            Low            Normal  
1         Normal            Normal  
2         Normal            Normal  
3         Normal            Normal  
4         Normal            Normal  


In [5]:
# Mapping function
# mapping = {'Yes': 1, 'No': 0}
# # Mapping for Gender
# gender_mapping = {'Male': 1, 'Female': 2}
# X['Gender'] = X['Gender'].map(gender_mapping)

# # Apply mapping to the specified columns
# columns_to_map = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']
# for column in columns_to_map:
#     X[column] = X[column].map(mapping)

# Display the updated dataframe
print(X.head())

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level  
0            Low            Normal  
1         Normal            Normal  
2         Normal            Normal  
3         Normal            Normal  
4         Normal            Normal  


In [6]:
# # Mapping for Blood Pressure and Cholesterol Level
# bp_mapping = {'Low': 1, 'Normal': 2, 'High': 3}
# cholesterol_mapping = {'Normal': 1, 'High': 2, 'Very High': 3}

# # Apply mapping to the specified columns
# X['Blood Pressure'] = X['Blood Pressure'].map(bp_mapping)
# X['Cholesterol Level'] = X['Cholesterol Level'].map(cholesterol_mapping)

# # Display the updated dataframe
# print(X.head())

In [7]:
from sklearn.impute import SimpleImputer

# Mapping values for categorical columns
mapping = {'Yes': 1, 'No': 0}
gender_mapping = {'Male': 1, 'Female': 2}
bp_mapping = {'Low': 1, 'Normal': 2, 'High': 3}
cholesterol_mapping = {'Normal': 1, 'High': 2, 'Very High': 3}

# Apply mappings
X['Gender'] = X['Gender'].map(gender_mapping)
columns_to_map = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']
for column in columns_to_map:
    X[column] = X[column].map(mapping)
X['Blood Pressure'] = X['Blood Pressure'].map(bp_mapping)
X['Cholesterol Level'] = X['Cholesterol Level'].map(cholesterol_mapping)

# Ensure correct data types after mapping
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()  # categorical columns are those of type object
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()  # numeric columns are those of type int or float

# Print columns for debugging
print("Categorical Columns:", categorical_columns)
print("Numeric Columns:", numeric_columns)

# Create preprocessing pipelines for categorical and numeric columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical data
    ('encoder', OneHotEncoder(handle_unknown='ignore'))    # OneHotEncode categorical variables
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing numeric data
    ('scaler', StandardScaler())  # Scale numeric columns
])

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),  # Apply transformations for numeric columns
        ('cat', categorical_transformer, categorical_columns)  # Apply transformations for categorical columns
    ]
)

# Apply preprocessing to the dataset
processed_data = preprocessor.fit_transform(X)  # Use X, not df

# Print shape of processed data to verify
print("Processed Data Shape:", processed_data.shape)

# Optionally, convert to DataFrame for easier inspection
processed_data_df = pd.DataFrame(processed_data)
print(processed_data_df.head())

Categorical Columns: ['Disease']
Numeric Columns: ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']
Processed Data Shape: (349, 124)
                                                   0
0  <Compressed Sparse Row sparse matrix of dtype ...
1  <Compressed Sparse Row sparse matrix of dtype ...
2  <Compressed Sparse Row sparse matrix of dtype ...
3  <Compressed Sparse Row sparse matrix of dtype ...
4  <Compressed Sparse Row sparse matrix of dtype ...


In [8]:
print(f"Categorical Columns: {categorical_columns}")
print(f"Numeric Columns: {numeric_columns}")


Categorical Columns: ['Disease']
Numeric Columns: ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing steps to training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)  # Fit and transform training data
X_test_preprocessed = preprocessor.transform(X_test)  # Transform test data with the same preprocessor

# Train the model with the preprocessed training data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

In [10]:
print(X['Blood Pressure'].unique())

[1 2 3]


In [11]:
# Save the trained model
model_path = '/home/tsoien/github/MLResearch/backend/ml_model/model.joblib'
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")


Model saved to /home/tsoien/github/MLResearch/backend/ml_model/model.joblib


In [12]:
if hasattr(rf_model, 'estimators_'):
    # Make predictions
    y_pred = rf_model.predict(X_test_preprocessed)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
else:
    print("The model has not been fitted yet.")

Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.87      0.81        30
    Positive       0.89      0.80      0.84        40

    accuracy                           0.83        70
   macro avg       0.83      0.83      0.83        70
weighted avg       0.84      0.83      0.83        70

