In [1]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from sklearn.impute import SimpleImputer
import pickle

# Add the parent of the current working directory to the Python path so the local packages imported below can be found
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the database connection helper and the key-loading/decryption helpers
from data.data_config import db_cred
from encryption.homomorphic import load_keys, decrypt_value

In [2]:
# Key file locations. The defaults are the original development paths; set the
# PUBLIC_KEY_PATH / PRIVATE_KEY_PATH environment variables to point the
# notebook at other key files without editing this cell.
public_key_path = os.environ.get(
    "PUBLIC_KEY_PATH", "/home/tsoien/github/newML/public_key.pkl"
)
private_key_path = os.environ.get(
    "PRIVATE_KEY_PATH", "/home/tsoien/github/newML/private_key.pkl"
)

# load_keys hands back a (public, private) pair; the private key is the one
# passed to decrypt_value when the table columns are decrypted.
public_key, private_key = load_keys(public_key_path, private_key_path)


# Fetch data from the database
def fetch_data_from_db(query):
    """Run ``query`` and return the complete result set as a DataFrame.

    A connection is obtained from ``db_cred()``, the query is executed on a
    new cursor, and every fetched row is loaded into a DataFrame whose column
    names are taken from the first field of each ``cursor.description`` entry.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``.

    Returns:
        A ``pandas.DataFrame`` with one row per fetched record, or ``None``
        when any step fails (the error is printed rather than raised).
    """
    db = None
    cursor = None
    try:
        db = db_cred()  # Establish connection using db_cred
        cursor = db.cursor()

        cursor.execute(query)

        rows = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]

        return pd.DataFrame(rows, columns=column_names)
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Release the cursor and the connection on every path, including the
        # failure path, so a bad query does not leave a connection open.
        for resource in (cursor, db):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as close_error:
                # Cleanup is best-effort: report it, keep the fetched result.
                print(f"Error while closing database resource: {close_error}")

# Decrypt encrypted columns
def decrypt_dataframe(df):
    """Return a decrypted copy of ``df``.

    Each value in the encrypted columns is unpickled and passed to
    ``decrypt_value`` together with the module-level ``private_key``.
    ``Age`` is then coerced to a numeric dtype (unparseable values become NaN).

    The input frame is left untouched: decryption runs on a copy, so a failure
    part-way through cannot leave the caller holding a frame in which only
    some columns have been decrypted.

    Args:
        df: DataFrame containing the encrypted columns listed below.

    Returns:
        A new DataFrame with the decrypted values, or ``None`` if any step
        fails (the error is printed rather than raised).
    """
    # Columns whose cells hold pickled ciphertext.
    encrypted_columns = [
        "Gender", "Fever", "Cough", "Fatigue",
        "Difficulty_Breathing", "Age", "Blood_Pressure", "Cholesterol_Level"
    ]
    # Columns converted with pd.to_numeric after decryption.
    numeric_columns = ["Age"]

    try:
        decrypted = df.copy()

        for col in encrypted_columns:
            # NOTE(security): pickle.loads can execute arbitrary code embedded
            # in the payload; this is only acceptable while the table contents
            # are fully trusted.
            decrypted[col] = decrypted[col].apply(
                lambda blob: decrypt_value(private_key, pickle.loads(blob))
            )

        for col in numeric_columns:
            decrypted[col] = pd.to_numeric(decrypted[col], errors='coerce')

        return decrypted
    except Exception as e:
        print(f"Error during decryption: {e}")
        return None

# Pull every row of the 'disease_data' table.
query = "SELECT * FROM disease_data;"
data = fetch_data_from_db(query)

# Failure branches come first so the successful path reads straight down.
if data is None:
    print("Failed to fetch data.")
else:
    print("Data fetched successfully!")

    decrypted_data = decrypt_dataframe(data)

    if decrypted_data is None:
        print("Failed to decrypt data.")
    else:
        print("Decryption completed successfully!")
        display(decrypted_data.head())  # Preview the first rows

        # Target column, and the feature frame with the target and the row id
        # removed.
        y = decrypted_data["Outcome_Variable"]
        X = decrypted_data.drop(columns=["id", "Outcome_Variable"])

        print("Features (X):")
        display(X.head())
        print("Labels (y):")
        display(y.head())


Keys loaded from disk.
Data fetched successfully!
Decryption completed successfully!


Unnamed: 0,id,Disease,Gender,Fever,Cough,Fatigue,Difficulty_Breathing,Age,Blood_Pressure,Cholesterol_Level,Outcome_Variable
0,1,Influenza,2,1,0,1,1,19,1,2,Positive
1,2,Common Cold,2,0,1,1,0,25,2,2,Negative
2,3,Eczema,2,0,1,1,0,25,2,2,Negative
3,4,Asthma,1,1,1,0,1,25,2,2,Positive
4,5,Asthma,1,1,1,0,1,25,2,2,Positive


Features (X):


Unnamed: 0,Disease,Gender,Fever,Cough,Fatigue,Difficulty_Breathing,Age,Blood_Pressure,Cholesterol_Level
0,Influenza,2,1,0,1,1,19,1,2
1,Common Cold,2,0,1,1,0,25,2,2
2,Eczema,2,0,1,1,0,25,2,2
3,Asthma,1,1,1,0,1,25,2,2
4,Asthma,1,1,1,0,1,25,2,2


Labels (y):


0    Positive
1    Negative
2    Negative
3    Positive
4    Positive
Name: Outcome_Variable, dtype: object

In [3]:
# Distinct values present in the Cholesterol_Level feature.
unique_cholesterol_levels = X.loc[:, 'Cholesterol_Level'].unique()
print(unique_cholesterol_levels)

[2 1 3]


In [4]:
# Count the null entries in each feature column.
missing_data = X.isna().sum()
print(missing_data)

Disease                 0
Gender                  0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty_Breathing    0
Age                     0
Blood_Pressure          0
Cholesterol_Level       0
dtype: int64


In [5]:
# Plain-text preview of the first five feature rows.
print(X.head(n=5))

       Disease  Gender  Fever  Cough  Fatigue  Difficulty_Breathing  Age  \
0    Influenza       2      1      0        1                     1   19   
1  Common Cold       2      0      1        1                     0   25   
2       Eczema       2      0      1        1                     0   25   
3       Asthma       1      1      1        0                     1   25   
4       Asthma       1      1      1        0                     1   25   

   Blood_Pressure  Cholesterol_Level  
0               1                  2  
1               2                  2  
2               2                  2  
3               2                  2  
4               2                  2  


In [6]:
# Show the dtype pandas holds for each feature column.
column_types = X.dtypes
print(column_types)

Disease                 object
Gender                   int64
Fever                    int64
Cough                    int64
Fatigue                  int64
Difficulty_Breathing     int64
Age                      int64
Blood_Pressure           int64
Cholesterol_Level        int64
dtype: object


In [7]:
import numpy as np

# 'Disease' is the only column treated as categorical; every other feature
# column goes through the numeric branch.
categorical_columns = ['Disease']
numeric_columns = [name for name in X.columns if name not in categorical_columns]

# Debugging aid: show how the columns were partitioned.
print("Categorical Columns:", categorical_columns)
print("Numeric Columns:", numeric_columns)

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop=None)),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Fit the preprocessor on the full feature frame and transform it.
processed_data = preprocessor.fit_transform(X)

# Densify when the result exposes toarray() (i.e. a sparse matrix).
processed_data_dense = (
    processed_data.toarray() if hasattr(processed_data, "toarray") else processed_data
)

# Cast to float so NaNs can be counted; report instead of failing if the cast
# is not possible.
try:
    processed_data_numeric = processed_data_dense.astype(float)
    print(f"NaN values in processed_data: {np.isnan(processed_data_numeric).sum()}")
except ValueError:
    print("processed_data contains non-numeric data and cannot be checked for NaN values directly.")

print(f"Processed Data Shape: {processed_data.shape}")

# Wrap the dense array in a DataFrame for a quick look at the first rows.
processed_data_df = pd.DataFrame(processed_data_dense)
print(processed_data_df.head())


# Output column names produced by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()
print(f"\nFeature Names: {feature_names}")
print(f"Processed Data Shape: {processed_data_dense.shape}")


Categorical Columns: ['Disease']
Numeric Columns: ['Gender', 'Fever', 'Cough', 'Fatigue', 'Difficulty_Breathing', 'Age', 'Blood_Pressure', 'Cholesterol_Level']
NaN values in processed_data: 0
Processed Data Shape: (349, 124)
        0         1         2         3         4         5         6    \
0  0.991441  0.997139 -0.957905  0.664943  1.722181 -2.091160 -2.419529   
1  0.991441 -1.002869  1.043945  0.664943 -0.580659 -1.631964 -0.723915   
2  0.991441 -1.002869  1.043945  0.664943 -0.580659 -1.631964 -0.723915   
3 -1.008633  0.997139  1.043945 -1.503889  1.722181 -1.631964 -0.723915   
4 -1.008633  0.997139  1.043945 -1.503889  1.722181 -1.631964 -0.723915   

        7    8    9    ...  114  115  116  117  118  119  120  121  122  123  
0 -0.576777  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1 -0.576777  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2 -0.576777  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3 -0.576

In [8]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# The preprocessor is fitted on the training rows only and then reused,
# unchanged, on the held-out rows.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Random forest trained on the preprocessed training features.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)




In [9]:
# The presence of `estimators_` is used as the "model has been fitted" signal.
if not hasattr(rf_model, 'estimators_'):
    print("The model has not been fitted yet.")
else:
    # Predict on the held-out rows.
    y_pred = rf_model.predict(X_test_preprocessed)

    # Overall accuracy, rounded to two decimals for display.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Per-class precision / recall / F1.
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

    Negative       0.70      0.77      0.73        30
    Positive       0.81      0.75      0.78        40

    accuracy                           0.76        70
   macro avg       0.75      0.76      0.75        70
weighted avg       0.76      0.76      0.76        70



In [10]:
# Record which columns the model was trained on and which column it predicts.
print("Training Columns:", list(X_train.columns))
print(f"Prediction Columns: {y_train.name}")

Training Columns: ['Disease', 'Gender', 'Fever', 'Cough', 'Fatigue', 'Difficulty_Breathing', 'Age', 'Blood_Pressure', 'Cholesterol_Level']
Prediction Columns: Outcome_Variable


In [11]:

# Persist the fitted preprocessor. The target directory is created first so
# the dump does not fail when the directory is missing.
preprocessor_path = "/home/tsoien/github/newML/backend/ml_model/preprocessor.joblib"
os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
joblib.dump(preprocessor, preprocessor_path)

# Reloading immediately rebinds `preprocessor` to the deserialized copy.
preprocessor = joblib.load(preprocessor_path)

In [12]:
# Persist the trained random forest. The target directory is created first so
# the dump does not fail when the directory is missing.
model_path = '/home/tsoien/github/newML/backend/ml_model/model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")

Model saved to /home/tsoien/github/newML/backend/ml_model/model.joblib
