In [5]:
##### Install required libraries (optional)

# Import necessary libraries

import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('./data/Disease_symptom_and_patient_profile_dataset.csv')

# df.shape
array = df['Disease'].to_numpy()
category = np.unique(array)


In [6]:
# Check for missing values
df.isna().sum

<bound method DataFrame.sum of      Disease  Fever  Cough  Fatigue  Difficulty Breathing    Age  Gender  \
0      False  False  False    False                 False  False   False   
1      False  False  False    False                 False  False   False   
2      False  False  False    False                 False  False   False   
3      False  False  False    False                 False  False   False   
4      False  False  False    False                 False  False   False   
..       ...    ...    ...      ...                   ...    ...     ...   
344    False  False  False    False                 False  False   False   
345    False  False  False    False                 False  False   False   
346    False  False  False    False                 False  False   False   
347    False  False  False    False                 False  False   False   
348    False  False  False    False                 False  False   False   

     Blood Pressure  Cholesterol Level  Outcome Variable

In [7]:
# Check for duplicated rows
df.duplicated().sum()
print(df)

# Drop duplicate rows
df = df.drop_duplicates()
# print(df)


         Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0      Influenza   Yes    No     Yes                  Yes   19  Female   
1    Common Cold    No   Yes     Yes                   No   25  Female   
2         Eczema    No   Yes     Yes                   No   25  Female   
3         Asthma   Yes   Yes      No                  Yes   25    Male   
4         Asthma   Yes   Yes      No                  Yes   25    Male   
..           ...   ...   ...     ...                  ...  ...     ...   
344       Stroke   Yes    No     Yes                   No   80  Female   
345       Stroke   Yes    No     Yes                   No   85    Male   
346       Stroke   Yes    No     Yes                   No   85    Male   
347       Stroke   Yes    No     Yes                   No   90  Female   
348       Stroke   Yes    No     Yes                   No   90  Female   

    Blood Pressure Cholesterol Level Outcome Variable  
0              Low            Normal         Positive  

In [8]:

from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(df, test_size = 0.26, random_state = 42)

# Save the training and testing sets to CSV files
train_data.to_csv('../artifacts/train.csv', index = False, header = True)
test_data.to_csv('../artifacts/test.csv', index = False, header = True)

# print(train_data)
print("Ingestion of the data is completed")



Ingestion of the data is completed


In [9]:
#Data trasformation

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

numerical_columns = ["Age"]

categorical_columns = [
                "Fever",
                "Cough",
                "Fatigue",
                "Difficulty Breathing",
                "Gender",
                "Blood Pressure",
                "Cholesterol Level",
                "Outcome Variable",
            ]


In [10]:
# Define pipelines for numerical and categorical data preprocessing

num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scalar", StandardScaler(with_mean=False))
                ]
            )

cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))

                ]
            )

# Create a preprocessor that applies transformations to numerical and categorical columns

preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )


In [11]:
# Load the training and testing datasets

train_df = pd.read_csv('../artifacts/train.csv')
test_df = pd.read_csv('../artifacts/test.csv')

# Define the target column and numerical columns
target_column_name = "Disease"
numerical_columns = ["Age"]

# Separate input and target features for training and testing datasets
input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
target_feature_train_df = train_df[[target_column_name]]

input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
target_feature_test_df = test_df[[target_column_name]]

# Apply one-hot encoding to the target feature
one = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

output_feature_train_df = one.fit_transform(target_feature_train_df)
output_feature_test_df = one.transform(target_feature_test_df)

# Decoding the disease column
diseases = np.array([column for column in  output_feature_train_df.columns])
diseases


array(['Disease_Acne', 'Disease_Allergic Rhinitis',
       "Disease_Alzheimer's Disease", 'Disease_Anemia',
       'Disease_Anxiety Disorders', 'Disease_Appendicitis',
       'Disease_Asthma', 'Disease_Atherosclerosis',
       'Disease_Autism Spectrum Disorder (ASD)',
       'Disease_Bipolar Disorder', 'Disease_Bladder Cancer',
       'Disease_Brain Tumor', 'Disease_Breast Cancer',
       'Disease_Bronchitis', 'Disease_Cataracts',
       'Disease_Cerebral Palsy', 'Disease_Chickenpox',
       'Disease_Cholecystitis', 'Disease_Cholera',
       'Disease_Chronic Obstructive Pulmonary Disease (COPD)',
       'Disease_Chronic Obstructive Pulmonary...', 'Disease_Cirrhosis',
       'Disease_Colorectal Cancer', 'Disease_Common Cold',
       'Disease_Conjunctivitis (Pink Eye)',
       'Disease_Coronary Artery Disease', "Disease_Crohn's Disease",
       'Disease_Cystic Fibrosis', 'Disease_Dementia',
       'Disease_Dengue Fever', 'Disease_Depression', 'Disease_Diabetes',
       'Disease_Diverticu

In [12]:
# Apply preprocessing to input features
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

print(input_feature_train_arr)

[[2.76195032 0.         2.00816597 ... 2.00816597 0.         2.00129986]
 [5.52390063 2.00816597 0.         ... 2.00816597 2.00129986 0.        ]
 [3.55107898 0.         2.00816597 ... 0.         0.         2.00129986]
 ...
 [3.15651465 2.00816597 0.         ... 2.00816597 2.00129986 0.        ]
 [5.1293363  0.         2.00816597 ... 2.00816597 2.00129986 0.        ]
 [3.15651465 2.00816597 0.         ... 2.00816597 2.00129986 0.        ]]


In [13]:
# Save the preprocessor object to a file
import dill

with open('../artifacts/preprocessor.pkl', "wb") as file_obj:
        dill.dump(preprocessor, file_obj)


In [15]:
# Prepare training and testing data
X_train,y_train,X_test,y_test = (
    input_feature_train_arr,
    np.array(output_feature_train_df),
    input_feature_test_arr,
    np.array(output_feature_test_df)
)

# print(train_arr[1])
print("Saved preprocesssing object.")
# print(X_train.shape ,y_train.shape ,X_test.shape ,y_test.shape)
# print(type(test_arr))
# ValueError: X has 19 features, but LinearRegression is expecting 37 features as input.

Saved preprocesssing object.


In [16]:
# Evaluate the performance of regression models by iterating through all the models

def evaluate_models(X_train, y_train, X_test, y_test, models):
    report = {}

    for i in range(len(list(models))):
        model = list(models.values())[i]

        model.fit(X_train,y_train)

        y_train_pred = model.predict(X_train)

        y_test_pred = model.predict(X_test)

        train_model_score = r2_score(y_train, y_train_pred)

        test_model_score = r2_score(y_test, y_test_pred)

        report[list(models.keys())[i]] = test_model_score

    return report

In [17]:

from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVC

# Define a dictionary of regression models to evaluate

models = {
        "Random Forest" : RandomForestRegressor(),
        "Decision Tree" : DecisionTreeRegressor(),
        "Linear Regression" : LinearRegression(),
        "K-Neighbors Regressor" : KNeighborsRegressor(),
        "XGBRegressor" : XGBRegressor(),
        # "Gradient Boosting" : GradientBoostingRegressor(),
        # "CatBoosting Regressor" : CatBoostRegressor(verbose = False),
        # "AdaBoost Regressor" : AdaBoostRegressor()
        # "SVM" : SVC(kernel="rbf", gamma=0.5, C=1.0)
}

 
model_report:dict = evaluate_models(
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    models = models
)
# ValueError: y should be a 1d array, got an array of shape (222, 101) instead.
# print(X_train)

In [18]:
# Find the best performing model
best_model_score = max(sorted(model_report.values()))
best_model_name = list(model_report.keys())[
    list(model_report.values()).index(best_model_score)
]

best_model = models[best_model_name]

if best_model_score < 0.6:
    print("No best model found")

print("Best found model on both training and testing dataset")


No best model found
Best found model on both training and testing dataset


In [19]:
# Save the best model to a file
import dill

with open('../artifacts/best_model.pkl', "wb") as file_obj:
        dill.dump(best_model, file_obj)

In [20]:
# Make predictions using the best model
predicted = best_model.predict(X_test)

# Calculate the R-squared score for the best model
r2_sco = r2_score(y_test, predicted)
print(best_model_name)
print(r2_sco)

Decision Tree
0.17501976889418258


In [21]:
# Load the preprocessor and best model for prediction
model = dill.load(open('../artifacts/preprocessor.pkl', 'rb'))
final_model = dill.load(open('../artifacts/best_model.pkl', 'rb'))

# Define input values for prediction

input_values = {
    "Fever" : ['Yes'],
    "Cough" : ['No'],
    "Fatigue" : ['Yes'],
    "Difficulty Breathing" : ['No'],
    "Age" : [30],
    "Gender" : ['Female'],
    "Blood Pressure" : ['Normal'],
    "Cholesterol Level" : ['Normal'],
    "Outcome Variable" : ['Negative'] 
}

input_df = pd.DataFrame(input_values)

# Transform the input data
data_scaled = model.transform(input_df)

# Evaluate the performance of the model
# print(data_scaled)

# Make predictions
pred = final_model.predict(data_scaled)
print(pred[0])

index = np.where(pred[0] == 1)[0][0]
diseases[index]

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]


'Disease_Dengue Fever'