In [None]:
# %pip install --upgrade pip FLAML "flaml[spark]" shap setuptools wheel optuna optuna-integration openml xgboost catboost imbalanced-learn pandas scipy statsmodels
# %pip install --upgrade "scikit-learn==1.1.2"
# %pip install catboost FLAML "flaml[spark]" geopy gspread imbalanced-learn ipykernel matplotlib numpy \
#     openpyxl openai openml optuna optuna-integration \
#         pandas pip plotly_express polars PyCap pygsheets python-dotenv pyspark \
#             seaborn scipy setuptools scikit-learn shap statsmodels tabulate tabula-py wheel xlsx2csv xgboost

In [None]:
# To allow own package to be imported
import sys
import os
# Make the local package importable: register the parent of the working
# directory first, then the working directory itself, skipping entries
# that are already on the import path.
for _search_dir in (os.path.dirname(os.getcwd()), os.getcwd()):
    if _search_dir not in sys.path:
        sys.path.append(_search_dir)

# Silence every warning for the rest of the session
from warnings import filterwarnings
filterwarnings("ignore")

# import necessary packages
import pandas as pd
import numpy as np
import polars as pl
import plotly.express as px
import wh0102 as mphd

# Code -> label lookup for each categorical column; the position of a label
# in its sequence is the integer code it stands for.
data_dictionary = dict(
    Ethnic=dict(enumerate(("Malay", "Chinese", "Indian"))),
    bmi=dict(enumerate(("Normal BMI", "Overweight"))),
    Disease=dict(enumerate(("No liver disease", "Have Liver Disease"))),
    Gender=dict(enumerate(("Female", "Male"))),
)

# Two-element [low, high] list per investigation
# (presumably the normal reference range; units are not stated here)
normal_values = dict(
    TP=[64, 83],
    ALB=[35, 52],
    TB=[0, 22],
    ALP=[40, 130],
    ALT=[0, 42],
    AST=[0, 41],
)

# Rename the raw enzyme columns to their current abbreviations.
# SGOT (serum glutamic-oxaloacetic transaminase) is AST and
# SGPT (serum glutamic-pyruvic transaminase) is ALT, so each raw column
# must map to the matching modern name rather than the other enzyme's.
column_to_be_rename = {"Sgot":"AST",
                       "Sgpt":"ALT",
                       "Alkphos":"ALP"}

# Outcome column and the two families of predictors
dependent_variable = "Disease"
independent_demographic = ("Age", "Gender", "Ethnic", "bmi")
independent_investigations = ("AGR", "ALB", "TP", "TB", "DB", "ALP", "ALT", "AST")

# The first demographic (Age) joins the investigations as continuous data;
# the remaining demographics form the categorical group.
independent_continous = independent_demographic[:1] + independent_investigations
independent_categorical = independent_demographic[1:]
independent_variables = (*independent_demographic, *independent_investigations)

# Read the raw file and apply the shorter column names in one step
df = pd.read_csv(r"assignment4.csv").rename(columns = column_to_be_rename)

# Shift every coded column except Gender down by one
# (presumably the raw file codes them from 1, while data_dictionary starts at 0)
for column in data_dictionary:
    if column != "Gender":
        df.loc[:, column] = df.loc[:, column] - 1

# Swap the 0 and 1 codes of bmi and Disease
for column in ("bmi", "Disease"):
    df.loc[:, column] = df.loc[:, column].replace({0:1, 1:0})

# Summarise the columns, dtypes and non-null counts
df.info()

In [None]:
# Encode Gender through the project helper
# Intended coding -> "Gender":{0:"Female", 1:"Male"}
df = mphd.categorical_data.label_encode(
    df = df,
    columns = "Gender",
    convert_numeric = True,
)

# Look for duplicated records
duplicated_df, to_drop_duplicated_df = mphd.pre_processing.check_duplication(df)

# Collect the rows reported by the missing-value analysis and display them
missing_df = mphd.missing_values.analyse_missing_row(df)
missing_df

## To fix AGR == ' ' issue

In [None]:
# Calculate the Globulin for every patient ID with globulin = tp - alb based on resource below:
# https://www.ncbi.nlm.nih.gov/books/NBK204/#:~:text=The%20total%20globulin%20fraction%20is,of%20further%20fractionating%20serum%20proteins

# Check how well the formula holds on this data:
# take an independent copy of the rows whose AGR is present
temp_df = df.loc[df["AGR"].notnull()].copy()

# Calculate the albumin/globulin ratio, taking globulin as TP - ALB
def calculate_agr(df:pd.DataFrame, column_name:str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``ALB / (TP - ALB)`` stored in ``column_name``.

    The input frame is left untouched, so a trial calculation cannot overwrite
    the caller's data (for example the AGR column of a missing-rows frame).

    Args:
        df: Frame holding numeric ``ALB`` and ``TP`` columns.
        column_name: Column that receives the ratio. It is added when absent
            and replaced (in the returned copy only) when already present.

    Returns:
        A new DataFrame with the ratio column. Rows where ``TP`` equals
        ``ALB`` divide by zero and therefore hold a non-finite value.
    """
    globulin = df["TP"] - df["ALB"]
    return df.assign(**{column_name: df["ALB"] / globulin})

# Calculate the approximate agr
temp_df = calculate_agr(df = temp_df, column_name = "agr_new")

# Flag rows whose recorded AGR is close to the recomputed value
# (relative tolerance 0.1, measured against agr_new).
# The comparison is vectorised and label-based: positional access such as x[0]
# on a label-indexed row is deprecated in pandas 2.1+, and the notebook's
# global warning filter would hide that deprecation.
temp_df.loc[:,"agr_similarity"] = np.isclose(temp_df.loc[:,"AGR"].astype(float),
                                             temp_df.loc[:,"agr_new"],
                                             rtol = 0.1)

# Count the rows per similarity flag, with an "All" total row
pt = temp_df.pivot_table(index = "agr_similarity", values = "Patient_ID", aggfunc = len, margins = True)\
    .rename(columns={"Patient_ID":"count"})
# Express each count as a percentage of the total
pt.loc[:,"percentage"] = (pt.loc[:,"count"] / pt.loc["All", "count"] * 100).round(2)

print(pt.to_markdown(tablefmt = "pretty"))

# Show the rows where the recorded and recomputed AGR disagree
temp_df.query("agr_similarity == False")

In [None]:
# Trial to impute with calculation.
# A copy is passed so the trial values can never leak into missing_df, whose
# index is reused further down to inspect the imputed rows.
missing_df_temp = calculate_agr(df = missing_df.copy(), column_name="AGR")
missing_df_temp

In [None]:
# Impute with MissingForest
# https://betterdatascience.com/python-missforest-algorithm/#google_vignette

# mf_df = mphd.missing_values.miss_forest_imputation(df=df, columns = "AGR")
# # Check on the imputated value
# mf_df.loc[missing_df.index]

In [None]:
# Impute with MICE
# https://medium.com/@brijesh_soni/topic-9-mice-or-multivariate-imputation-with-chain-equation-f8fd435ca91#:~:text=MICE%20stands%20for%20Multivariate%20Imputation,produce%20a%20final%20imputed%20dataset.
# NOTE(review): the imputation runs on the full frame before the train/test split - confirm this is intended.

df = mphd.missing_values.mice_imputation(
    df = df,
    columns = "AGR",
)
# Inspect the imputed values on the rows that were reported as missing
df.loc[missing_df.index, :]

__Interpretation__:

Although AGR can in principle be derived as ALB / (TP − ALB), the recorded AGR values deviate noticeably from that calculation (the data are noisy). The missing AGR values are therefore imputed with MICE rather than with the formula.

## Continuous Data

In [None]:
# Check for normal distribution
# normal_distribution_list, abnormal_distribution_list = mphd.continous_data.descriptive_analysis(df = df, 
#                                                                                                 independent_variables=independent_continous, 
#                                                                                                 dependent_variables = dependent_variable,
#                                                                                                 descriptive_type = "continous",
#                                                                                                 plot_dependent_variables = False,
#                                                                                                 plot_correlation = True, 
#                                                                                                 round = 4)

In [27]:
# Show outliers with 1.5 * iqr
# outliers_df = mphd.continous_data.identify_outliers(df = df, 
#                                                     column_name = ["TP", "TB", "ALP", "AST", "ALT"], 
#                                                     ratio = 1.5,
#                                                     normal_values = normal_values)

# normal_distribution_list, abnormal_distribution_list = mphd.continous_data.descriptive_analysis(df = df.loc[~df.index.isin(outliers_df.index)], 
#                                                                                                 independent_variables=independent_continous, 
#                                                                                                 dependent_variables = dependent_variable,
#                                                                                                 descriptive_type = "continous",
#                                                                                                 plot_dependent_variables = False,
#                                                                                                 plot_correlation = True, 
#                                                                                                 round = 4)

## Categorical Data

In [None]:
# Reverse Encode
# data = mphd.categorical_data.reverse_encode(df = df, json_dict=data_dictionary)

# # Categegorical Data Analysis
# categorical_summary = mphd.categorical_data.categorical_descriptive_analysis(data,
#                                                                              independent_variables = independent_categorical, 
#                                                                              dependent_variables = dependent_variable, 
#                                                                              analyse_dependent = True)

## Model Training

In [None]:
# Seed shared by the model-fitting cells
random_seed = 168

# Train Test Split (test_size = 0.2)
# NOTE(review): random_seed is not passed to this split - confirm whether the helper seeds itself.
X_train, X_test, y_train, y_test = mphd.pre_processing.train_test_split(
    df = df,
    independent_variables = independent_variables,
    dependent_variable = dependent_variable,
    test_size = 0.2,
)

## Logistic Regression

In [None]:
# # Prepare the params
# logistic_regression_params = {'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
#                               'classifier__C': np.logspace(0, 4, 10),
#                               'classifier__solver': ['liblinear'],
#                               'classifier__max_iter': [100, 200, 300]}

# # Build logistic regression
# logistic_regression_grid_search, logistic_regression_time_required = mphd.machine_learning.logisticRegression(X_train = X_train, 
#                                                                                                               y_train = y_train,
#                                                                                                               params = logistic_regression_params, 
#                                                                                                               random_seed = random_seed)
# # Perform analysis on Logistic Regression
# lr_summary_df = mphd.analyse_ml.analyse_ml(logistic_regression_grid_search,
#                                            time_required = logistic_regression_time_required,
#                                            model_type = "Logistic Regression",
#                                            independent_variables=independent_variables,
#                                            X_test=X_test, y_test=y_test)
# lr_summary_df

## Support Vector Machine(SVM)

In [None]:
# # SVM params
# svm_params=param_grid = {
#     'classifier__C': [0.1, 1, 10, 100],
#     'classifier__kernel': ['linear', 'rbf', 'poly'],
#     'classifier__gamma': [0.1, 0.01, 0.001],
#     'classifier__degree': [3, 4, 5]
# }

# svm_grid_search, svm_time_required = mphd.machine_learning.svm(X_train = X_train, 
#                                                                y_train = y_train,
#                                                                params = svm_params, 
#                                                                random_seed = random_seed)
# # Perform analysis on SVM
# svm_summary_df = mphd.analyse_ml.analyse_ml(svm_grid_search,
#                                            time_required = svm_time_required,
#                                            model_type = "SVM",
#                                            independent_variables=independent_variables,
#                                            X_test=X_test, y_test=y_test)
# svm_summary_df

## K-Nearest Neighbour(kNN)

In [None]:
# # Define the parameter grid
# knn_params = {
#     'classifier__n_neighbors': [3, 5, 7],
#     'classifier__weights': ['uniform', 'distance'],
#     'classifier__metric': ['euclidean', 'manhattan', 'chebyshev']
# }

# # Build kNN
# knn_grid_search, knn_time_required = mphd.machine_learning.knn(X_train = X_train, 
#                                                                y_train = y_train,
#                                                                params = knn_params, 
#                                                                random_seed = random_seed)
# # Perform analysis on kNN
# knn_summary_df = mphd.analyse_ml.analyse_ml(knn_grid_search,
#                                            time_required = knn_time_required,
#                                            model_type = "kNN",
#                                            independent_variables=independent_variables,
#                                            X_test=X_test, y_test=y_test)
# knn_summary_df

## Decision Tree

In [None]:
# # Define the parameter grid
# # 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'random_state', 'splitter'
# decision_tree_params = {
#     'classifier__criterion': ['gini', 'entropy'],
#     'classifier__max_depth': [None, 5, 10, 20],
#     'classifier__min_samples_split': [2, 5, 10],
#     'classifier__min_samples_leaf': [1, 2, 4],
#     'classifier__max_features':[0.8],
#     'classifier__class_weight': ['balanced', None]
# }

# # Build decision tree
# decision_tree_grid_search, decision_tree_time_required = mphd.machine_learning.decision_tree(X_train = X_train, 
#                                                                                              y_train = y_train,
#                                                                                              params = decision_tree_params, 
#                                                                                              random_seed = random_seed)
# # Perform analysis on Decision Tree
# decision_tree_summary_df = mphd.analyse_ml.analyse_ml(decision_tree_grid_search,
#                                            time_required = decision_tree_time_required,
#                                            model_type = "Decision Tree",
#                                            independent_variables=independent_variables,
#                                            X_test=X_test, y_test=y_test)
# decision_tree_summary_df

## Random Forest

In [None]:
# # Prepare Random Forest Params
# rf_params = {'classifier__max_depth':[7],
#               'smote__sampling_strategy': [0.8],
#               'classifier__min_samples_split':[10],
#               'classifier__max_features':[0.8],
#               'classifier__criterion':["entropy"],
#               'classifier__bootstrap':[True],
#               'classifier__n_estimators':[200],
#               'classifier__min_samples_leaf':[8]}

# random_forest_grid_search, random_forest_time_required = mphd.machine_learning.random_forest(X_train = X_train, 
#                                                                                              y_train = y_train,
#                                                                                              params = rf_params, 
#                                                                                              random_seed = random_seed)

# # Perform analysis on Random Forest
# rf_summary_df = mphd.analyse_ml.analyse_ml(random_forest_grid_search,
#                                            time_required = random_forest_time_required,
#                                            model_type = "Random Forest",
#                                            independent_variables=independent_variables,
#                                            X_test=X_test, y_test=y_test)
# rf_summary_df

## Extreme Gradient Boosting (XGBoost)

In [None]:
# Candidate values for each XGB hyper-parameter; the "classifier__" prefix is
# part of every key and is kept exactly as the search helper receives it.
xgb_params = dict(
    classifier__learning_rate=[0.01, 0.1, 0.2, 0.3],
    classifier__max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    classifier__n_estimators=[100, 150, 200, 500, 1000],
    classifier__subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    classifier__colsample_bytree=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    classifier__gamma=[0, 0.25, 0.5, 1.0],
    classifier__min_child_weight=[1, 3, 5, 7],
    classifier__scale_pos_weight=[1, 3, 5, 7],
)

# Run the XGB search helper on the training split; its two return values are
# kept as the search object and the time required (going by the names).
xgb_grid_search, xbg_time_required = mphd.machine_learning.xgb(
    X_train = X_train,
    y_train = y_train,
    params = xgb_params,
    random_seed = random_seed,
)

# Perform analysis on XGB using the held-out test split
xgb_summary_df = mphd.analyse_ml.analyse_ml(
    xgb_grid_search,
    time_required = xbg_time_required,
    model_type = "XGB",
    independent_variables = independent_variables,
    X_test = X_test,
    y_test = y_test,
)
xgb_summary_df

## Light Gradient Boosting Machine (LightGBM)

In [None]:
# # LightGBM
# lgbm_params = {'classifier__max_depth': [3,4,5],
#                'classifier__learning_rate' : [0.1, 0.2],
#                'classifier__min_child_weight' : range(1,3,1),
#                'classifier__boosting_type' : ['gbdt'],
#                'smote__sampling_strategy': np.linspace(0.5, 0.9, 2),
#                'classifier__reg_alpha':[1e-5,0.01,0.03],
#                'classifier__num_leaves':[6]}

# LightGBM_grid_search, LightGBM_time_required = mphd.machine_learning.LightGBM(X_train = X_train, 
#                                                                               y_train = y_train,
#                                                                               params = lgbm_params,
#                                                                               independent_variables_continous = independent_continous,
#                                                                               random_seed = random_seed)

# # Perform analysis on LightGBM
# lightgbm_summary_df = mphd.analyse_ml.analyse_ml(LightGBM_grid_search,
#                                                  time_required = LightGBM_time_required,
#                                                  model_type = "LightGBM",
#                                                  independent_variables=independent_variables,
#                                                  X_test=X_test, y_test=y_test)
# lightgbm_summary_df