In [1]:
# Import libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Load the raw dataset.
# NOTE(review): this is a machine-specific absolute path; it is kept in one
# variable so it can be changed in a single place.
data_path = '/home/tsoien/github/MLResearch/backend/data/Disease_symptom_and_patient_profile_dataset.csv'
data = pd.read_csv(data_path)
df = pd.DataFrame(data)  # read_csv already returns a DataFrame; wrapper kept so `data` and `df` stay separate objects

# Display the first few rows
print(df.head())

# Check dataset information.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the cell output.
df.info()


       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-nul

In [3]:
# Per-column count of null entries in the raw dataframe
missing_data = df.isna().sum()
print(missing_data)

Disease                 0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty Breathing    0
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Outcome Variable        0
dtype: int64


In [4]:
# Split the dataframe into the label series (y) and the feature frame (X)
target_column = 'Outcome Variable'
y = df[target_column]  # label column
X = df.drop(columns=target_column)  # every other column is a feature


## Preprocessing

In [5]:
# Preview the first five rows of the feature frame
print(X.head(n=5))

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level  
0            Low            Normal  
1         Normal            Normal  
2         Normal            Normal  
3         Normal            Normal  
4         Normal            Normal  


In [6]:
# No mapping is applied in this cell: the Yes/No and gender encodings are
# applied in a later cell, so this preview still shows the raw text values.
print(X.iloc[:5])

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level  
0            Low            Normal  
1         Normal            Normal  
2         Normal            Normal  
3         Normal            Normal  
4         Normal            Normal  


In [7]:
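# NOTE: disabled and superseded. The next code cell defines these same
# bp_mapping / cholesterol_mapping dictionaries and applies them (with fill
# defaults); this block is kept for reference only.
# # Mapping for Blood Pressure and Cholesterol Level
# bp_mapping = {'Low': 1, 'Normal': 2, 'High': 3}
# cholesterol_mapping = {'Normal': 1, 'High': 2, 'Very High': 3}

# # Apply mapping to the specified columns
# X['Blood Pressure'] = X['Blood Pressure'].map(bp_mapping)
# X['Cholesterol Level'] = X['Cholesterol Level'].map(cholesterol_mapping)

# # Display the updated dataframe
# print(X.head())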

In [8]:
# Rebuild the feature frame from the raw data so this cell is safe to re-run:
# applying the text -> code mappings to columns that are already encoded would
# turn every value into NaN and then silently into the fill default.
X = df.drop(columns=[target_column])

# Mapping values for categorical columns
mapping = {'Yes': 1, 'No': 0}
gender_mapping = {'Male': 1, 'Female': 2}
bp_mapping = {'Low': 1, 'Normal': 2, 'High': 3}
cholesterol_mapping = {'Normal': 1, 'High': 2, 'Very High': 3}


def _encode_column(series, value_map, default):
    """Encode ``series`` with ``value_map``, filling unmapped entries with ``default``.

    Parameters
    ----------
    series : pandas.Series
        Raw column to encode.
    value_map : dict
        Raw value -> numeric code.
    default : int
        Code assigned to entries that are not keys of ``value_map``
        (this includes missing values).

    Returns
    -------
    pandas.Series
        The encoded column. Entries that had to be defaulted are reported on
        stdout first, so a typo or an unexpected category is not silently
        merged into the default class.
    """
    encoded = series.map(value_map)
    unmapped = series[encoded.isna()]
    if not unmapped.empty:
        print(
            f"WARNING: {len(unmapped)} value(s) in '{series.name}' are not in the mapping "
            f"and were set to {default}: {unmapped.unique().tolist()}"
        )
    return encoded.fillna(default)


# Map and handle missing values for specified columns
columns_to_map = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']
for column in columns_to_map:
    print(f"Unique values in '{column}' before mapping:", X[column].unique())
    X[column] = _encode_column(X[column], mapping, default=0)  # unmapped -> 0

X['Gender'] = _encode_column(X['Gender'], gender_mapping, default=0)
X['Blood Pressure'] = _encode_column(X['Blood Pressure'], bp_mapping, default=2)  # Default to "Normal"
# NOTE(review): the recorded output of X['Cholesterol Level'].unique() further
# down is float ([1. 2.]), which suggests some raw values are not keys of
# cholesterol_mapping and are being defaulted to "Normal". Check the warning
# printed by this line and extend the mapping accordingly.
X['Cholesterol Level'] = _encode_column(X['Cholesterol Level'], cholesterol_mapping, default=1)  # Default to "Normal"

# Specify column types
categorical_columns = ['Disease']  # Only 'Disease' is one-hot encoded
numeric_columns = [col for col in X.columns if col not in categorical_columns]

# Debugging: Print column information
print("Categorical Columns:", categorical_columns)
print("Numeric Columns:", numeric_columns)

# Numeric columns: mean-impute, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: mode-impute, then one-hot encode; categories unseen at
# fit time are ignored at transform time (handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop=None))  # Adjust drop strategy if needed
])

# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Fitted on the full frame here for inspection; the train/test cell below
# calls fit_transform again on the training split only.
processed_data = preprocessor.fit_transform(X)

# The transformer may return a sparse matrix; densify it for inspection
if hasattr(processed_data, "toarray"):  # For sparse matrix
    processed_data_dense = processed_data.toarray()
else:
    processed_data_dense = processed_data

# Convert to numeric if necessary and check for NaNs
try:
    processed_data_numeric = processed_data_dense.astype(float)  # Ensure numeric type
    print("NaN values in processed_data:", np.isnan(processed_data_numeric).sum())
except ValueError:
    print("processed_data contains non-numeric data and cannot be checked for NaN values directly.")

# Check processed data shape
print("Processed Data Shape:", processed_data.shape)

# Optionally, convert to dense DataFrame for inspection
processed_data_df = pd.DataFrame(processed_data_dense)
print(processed_data_df.head())


feature_names = preprocessor.get_feature_names_out()
print("\nFeature Names:", feature_names)
print("Processed Data Shape:", processed_data_dense.shape)


Unique values in 'Fever' before mapping: ['Yes' 'No']
Unique values in 'Cough' before mapping: ['No' 'Yes']
Unique values in 'Fatigue' before mapping: ['Yes' 'No']
Unique values in 'Difficulty Breathing' before mapping: ['Yes' 'No']
Categorical Columns: ['Disease']
Numeric Columns: ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']
NaN values in processed_data: 0
Processed Data Shape: (349, 124)
        0         1         2         3         4         5         6    \
0  0.997139 -0.957905  0.664943  1.722181 -2.091160  0.991441 -2.419529   
1 -1.002869  1.043945  0.664943 -0.580659 -1.631964  0.991441 -0.723915   
2 -1.002869  1.043945  0.664943 -0.580659 -1.631964  0.991441 -0.723915   
3  0.997139  1.043945 -1.503889  1.722181 -1.631964 -1.008633 -0.723915   
4  0.997139  1.043945 -1.503889  1.722181 -1.631964 -1.008633 -0.723915   

       7    8    9    ...  114  115  116  117  118  119  120  121  122  123  
0 -0.95242  0

In [9]:
# Count NaNs among the explicitly stored entries (`.data`) of the processed matrix
nan_values = np.isnan(processed_data.data).sum()
print(f"Number of NaN values in processed data: {nan_values}")


Number of NaN values in processed data: 0


In [10]:
# Per-column count of null entries in the processed dataframe
missing_data_processed = processed_data_df.isna().sum()
print(missing_data_processed)

0      0
1      0
2      0
3      0
4      0
      ..
119    0
120    0
121    0
122    0
123    0
Length: 124, dtype: int64


In [11]:
# Echo which columns go through the categorical and the numeric pipelines
print("Categorical Columns:", categorical_columns)
print("Numeric Columns:", numeric_columns)


Categorical Columns: ['Disease']
Numeric Columns: ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']


In [12]:
# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training split only, then reuse it for the test split
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train the random forest on the preprocessed training features
rf_model = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Persist the fitted preprocessor, then rebind `preprocessor` to the copy read back from disk
preprocessor_path = "/home/tsoien/github/MLResearch/backend/ml_model/preprocessor.joblib"
joblib.dump(preprocessor, preprocessor_path)
preprocessor = joblib.load(preprocessor_path)

In [13]:
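# NOTE: disabled. NaN checks on the processed data are done in the earlier
# cells (np.isnan(processed_data.data) and processed_data_df.isnull().sum()).
# import pandas as pd
# nan_values = processed_data.isna().sum().sum()
# print(f"Number of NaN values in processed data: {nan_values}")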


In [14]:
# Distinct values currently present in the blood-pressure and cholesterol columns
for level_column in ('Blood Pressure', 'Cholesterol Level'):
    print(X[level_column].unique())


[1 2 3]
[1. 2.]


In [15]:
# Save the processed data to a CSV file.
# The destination is held in one variable so the file that is written and the
# path reported in the message cannot drift apart.
# NOTE(review): machine-specific absolute path.
processed_data_path = '/home/tsoien/github/MLResearch/backend/data/processed_data.csv'
processed_data_df.to_csv(processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")

Processed data saved to /home/tsoien/github/MLResearch/backend/data/processed_data.csv


In [16]:
# Persist the trained random forest to disk and report where it went
model_path = '/home/tsoien/github/MLResearch/backend/ml_model/model.joblib'
joblib.dump(value=rf_model, filename=model_path)
print(f"Model saved to {model_path}")


Model saved to /home/tsoien/github/MLResearch/backend/ml_model/model.joblib


In [17]:
# The presence of `estimators_` is used as the "model has been fitted" signal;
# bail out with a message when it is absent.
if not hasattr(rf_model, 'estimators_'):
    print("The model has not been fitted yet.")
else:
    # Predict labels for the held-out split
    y_pred = rf_model.predict(X_test_preprocessed)

    # Overall accuracy on the held-out split
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Per-class precision / recall / F1
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

    Negative       0.86      0.83      0.85        30
    Positive       0.88      0.90      0.89        40

    accuracy                           0.87        70
   macro avg       0.87      0.87      0.87        70
weighted avg       0.87      0.87      0.87        70



In [18]:
# Report the feature columns of the training split and the name of the label series
print(f"Training Columns: {X_train.columns.tolist()}")
print(f"Prediction Columns: {y_train.name}")

Training Columns: ['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']
Prediction Columns: Outcome Variable
