In [None]:
# Install the third-party dependencies for this notebook into the kernel's environment.
# NOTE(review): no versions are pinned here, so a fresh environment may resolve different releases.
# Earlier install commands, kept for reference:
# %pip install --upgrade pip FLAML "flaml[spark]" shap setuptools wheel optuna optuna-integration openml xgboost catboost imbalanced-learn pandas scipy statsmodels
# %pip install --upgrade "scikit-learn==1.1.2"
%pip install catboost FLAML "flaml[spark]" geopy gspread imbalanced-learn ipykernel matplotlib numpy \
    openpyxl openai openml optuna optuna-integration \
        pandas pip plotly_express polars PyCap pygsheets python-dotenv pyspark \
            seaborn scipy setuptools scikit-learn shap statsmodels tabulate tabula-py wheel xlsx2csv xgboost

In [1]:
# Make the local package importable: put the parent of the working directory,
# then the working directory itself, on sys.path if they are not already there.
import sys
import os
for candidate in (os.path.dirname(os.getcwd()), os.getcwd()):
    if candidate not in sys.path:
        sys.path.append(candidate)

# Suppress all warnings for the rest of the session
from warnings import filterwarnings
filterwarnings(action = "ignore")

# import necessary packages
import pandas as pd
import numpy as np
import polars as pl
import plotly.express as px
import wh0102 as mphd

# Value labels for the coded categorical columns (code -> label)
data_dictionary = {
    "Ethnic":{0:"Malay", 1:"Chinese", 2:"Indian"},
    "bmi":{0:"Normal BMI", 1:"Overweight"},
    "Disease":{0:"No liver disease", 1:"Have Liver Disease"},
    "Gender":{0:"Female", 1:"Male"}
}

# [lower, upper] bounds treated as the normal range for each investigation;
# passed to the outlier check later in the notebook.
normal_values = {"TP":[64, 83],
                 "ALB":[35, 52],
                 "TB":[0, 22],
                 "ALP":[40, 130],
                 "ALT":[0, 42],
                 "AST":[0, 41]}

# Short names for the raw enzyme columns.
# SGOT is the older name for AST and SGPT the older name for ALT; the mapping
# was previously the other way round, which swapped the two enzymes (and the
# reference ranges applied to them).
# NOTE(review): results that depend on these two columns (e.g. a hard-coded
# model index further down) should be re-checked after this correction.
column_to_be_rename = {"Sgot":"AST",
                       "Sgpt":"ALT",
                       "Alkphos":"ALP"}

# Variable groups used throughout the analysis
dependent_variable = "Disease"
independent_demographic = ("Age", "Gender", "Ethnic", "bmi",)
independent_investigations = ("AGR", "ALB", "TP", "TB", "DB", "ALP", "ALT", "AST",)
# Age is the only continuous demographic; the remaining demographics are categorical
independent_continous = (independent_demographic[0],) + independent_investigations
independent_categorical = independent_demographic[1:]
independent_variables = independent_demographic + independent_investigations

# Read the raw file and apply the short column names in one step
df = pd.read_csv(r"assignment4.csv").rename(columns = column_to_be_rename)

# Shift the codes of every data-dictionary column except Gender down by one
# (presumably the raw file codes them from 1 while data_dictionary starts at 0 — confirm)
for column in filter(lambda name: name != "Gender", data_dictionary):
    df.loc[:,column] = df.loc[:,column] - 1

# Show dtypes and non-null counts
df.info()

# To delete after this
# Temporary shortcut so the modelling cells can run straight after this one:
# the same missing-value report, Gender encoding and AGR imputation are repeated step by step further down.
missing_df = mphd.missing_values.analyse_missing_row(df)
df = mphd.categorical_data.label_encode(df = df, columns = "Gender", convert_numeric=True)
df = mphd.missing_values.mice_imputation(df = df, columns = "AGR")
# df.drop(columns = ["Patient_ID"]).to_csv(r"imputed_assignment_4.csv", index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Patient_ID  553 non-null    int64  
 1   Age         553 non-null    int64  
 2   TB          553 non-null    float64
 3   DB          553 non-null    float64
 4   ALP         553 non-null    int64  
 5   ALT         553 non-null    int64  
 6   AST         553 non-null    int64  
 7   TP          553 non-null    float64
 8   ALB         553 non-null    float64
 9   AGR         553 non-null    object 
 10  Disease     553 non-null    int64  
 11  Ethnic      553 non-null    int64  
 12  Gender      553 non-null    object 
 13  bmi         553 non-null    int64  
dtypes: float64(4), int64(8), object(2)
memory usage: 60.6+ KB
Missing data detected for columns AGR.
Summary of the missing values from the dataframe =
+------------------------------+-------+--------------------+
|                       

In [2]:
# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = mphd.pre_processing.train_test_split(
    df = df,
    independent_variables = independent_variables,
    dependent_variable = dependent_variable,
    test_size = 0.2,
)

# Search space. Keys follow the "<step>__<parameter>" naming
# (presumably consumed by a pipeline inside mphd.machine_learning.LightGBM — confirm).
lgbm_params = dict(
    classifier__max_depth = [3,4,5],
    classifier__learning_rate = [0.1, 0.2],
    classifier__min_child_weight = range(1,3,1),
    classifier__boosting_type = ['gbdt'],
    smote__sampling_strategy = np.linspace(0.3, 0.9, 2),
    classifier__reg_alpha = [1e-5,0.01,0.03],
    classifier__num_leaves = [6],
)

# Run the search; the helper returns the search object and the time it took
grid_search, time_required = mphd.machine_learning.LightGBM(
    X_train,
    y_train,
    params = lgbm_params,
    independent_variables_continous = independent_continous,
)

[LightGBM] [Info] Number of positive: 220, number of negative: 245
[LightGBM] [Info] Number of positive: 220, number of negative: 245
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 635
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473118 -> initscore=-0.107631
[LightGBM] [Info] Start training from score -0.107631
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 643
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 12
[LightGBM] [Info] Number of positive: 220, number of negative: 245
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473118 -> initscore=-0.107631
[LightGBM] [I

In [3]:
# Display the search object returned by mphd.machine_learning.LightGBM in the previous cell
grid_search

In [None]:
# Encode Gender as numbers (labels expected per data_dictionary: 0 = Female, 1 = Male)
df = mphd.categorical_data.label_encode(
    df = df,
    columns = "Gender",
    convert_numeric = True,
)

# Look for duplicated rows
duplicated_df, to_drop_duplicated_df = mphd.pre_processing.check_duplication(df)

# Report the rows with missing values and display them
missing_df = mphd.missing_values.analyse_missing_row(df)
missing_df

## Fixing the AGR == ' ' issue

In [None]:
# Globulin is derived for every patient as globulin = TP - ALB, based on the resource below:
# https://www.ncbi.nlm.nih.gov/books/NBK204/#:~:text=The%20total%20globulin%20fraction%20is,of%20further%20fractionating%20serum%20proteins

# Check how well that formula reproduces the AGR values already recorded in the data.
# Blank AGR entries (' ') are strings, not nulls, so a plain not-null filter keeps them.
# Coerce AGR to numbers first (unparsable entries become NaN), then keep only rows with
# a usable AGR, as an independent deep copy of df.
temp_df = (
    df.assign(AGR = pd.to_numeric(df.loc[:,"AGR"], errors = "coerce"))
      .dropna(subset = ["AGR"])
      .copy(deep = True)
)

# Derive the albumin/globulin ratio, taking globulin as TP - ALB
def calculate_agr(df:pd.DataFrame, column_name:str):
    """Write ALB / (TP - ALB) into ``column_name`` of ``df`` and return that same frame.

    The frame is modified in place; ``column_name`` may be new or already present.
    """
    albumin = df.loc[:,"ALB"]
    globulin = df.loc[:,"TP"] - albumin
    # .loc assignment is kept deliberately: it writes into an existing column rather than replacing it
    df.loc[:,column_name] = albumin / globulin
    return df

# Calculate the approximate agr
temp_df = calculate_agr(df = temp_df, column_name = "agr_new")

# Flag rows where the recorded AGR matches the derived value within 10% (relative to the derived value).
# The comparison is done column-wise by name: positional access such as x[0] on a
# label-indexed row is deprecated in pandas, and the vectorised form avoids a Python-level loop.
temp_df.loc[:,"agr_similarity"] = np.isclose(
    temp_df.loc[:,"AGR"].astype(float),
    temp_df.loc[:,"agr_new"],
    rtol = 0.1,
)

# Count the rows per similarity flag, with an "All" total row
pt = temp_df.pivot_table(index = "agr_similarity", values = "Patient_ID", aggfunc = len, margins = True).rename(columns={"Patient_ID":"count"})
# Share of each flag as a percentage of all rows
pt.loc[:,"percentage"] = (pt.loc[:,"count"] / pt.loc["All", "count"] * 100).round(2)

print(pt.to_markdown(tablefmt = "pretty"))

# Show the rows where the recorded and derived AGR disagree
temp_df.query("agr_similarity == False")

In [None]:
# Trial: fill AGR for the rows reported as missing using the ALB / (TP - ALB) formula, then display them.
# Note that calculate_agr writes into missing_df in place.
missing_df = calculate_agr(df = missing_df, column_name="AGR")
missing_df

In [None]:
# Impute the AGR column with MICE (Multivariate Imputation by Chained Equations); background reading:
# https://medium.com/@brijesh_soni/topic-9-mice-or-multivariate-imputation-with-chain-equation-f8fd435ca91#:~:text=MICE%20stands%20for%20Multivariate%20Imputation,produce%20a%20final%20imputed%20dataset.

df = mphd.missing_values.mice_imputation(df = df, columns = "AGR")
# Display the imputed values for the rows that were reported as missing
df.loc[missing_df.index]

__Interpretation__:

Although AGR can in principle be calculated from ALB and TP, the recorded AGR values contain a lot of noise relative to that calculation, so we use MICE to impute the missing values instead.

In [None]:
# Reverse-encode df using data_dictionary (code -> label mappings) into a separate frame, then display it
data = mphd.categorical_data.reverse_encode(df = df, json_dict=data_dictionary)
data

## Continuous Data

In [None]:
# Check the continuous variables for normal distribution (with the correlation plot enabled);
# the helper returns two lists, stored as the normal and the abnormal distribution variables.
(
    normal_distribution_list,
    abnormal_distribution_list,
) = mphd.continous_data.descriptive_analysis(
    df = df,
    independent_variables = independent_continous,
    dependent_variables = dependent_variable,
    descriptive_type = "continous",
    plot_dependent_variables = False,
    plot_correlation = True,
    round = 4,
)

In [None]:
# Identify outliers in the selected investigations using a 1.5 * IQR ratio and the normal_values bounds
outliers_df = mphd.continous_data.identify_outliers(
    df = df,
    column_name = ["TP", "TB", "ALP", "AST", "ALT"],
    ratio = 1.5,
    normal_values = normal_values,
)

# Repeat the descriptive analysis on the rows that were not flagged as outliers
df_without_outliers = df.loc[~df.index.isin(outliers_df.index)]
(
    normal_distribution_list,
    abnormal_distribution_list,
) = mphd.continous_data.descriptive_analysis(
    df = df_without_outliers,
    independent_variables = independent_continous,
    dependent_variables = dependent_variable,
    descriptive_type = "continous",
    plot_dependent_variables = False,
    plot_correlation = True,
    round = 4,
)

## Categorical Data

In [None]:
# Descriptive analysis of the categorical variables on the reverse-encoded (labelled) frame,
# including the dependent variable
categorical_summary = mphd.categorical_data.categorical_descriptive_analysis(
    data,
    independent_variables = independent_categorical,
    dependent_variables = dependent_variable,
    analyse_dependent = True,
)

## Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Full feature matrix and target
X = df.loc[:,independent_variables]
y = df.loc[:,dependent_variable]

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = mphd.pre_processing.train_test_split(df = df,
                                                                        independent_variables=independent_variables,
                                                                        dependent_variable = dependent_variable,
                                                                        test_size = 0.2)

# Oversample with SMOTE on the TRAINING split only.
# Previously SMOTE was fitted on the whole dataset (test rows included) and its output
# was never used by the model; resampling after the split keeps synthetic samples out
# of the evaluation data, and the seed makes the resampling reproducible.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling: fit on the resampled training data, apply unchanged to the test data
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Model building: train on the resampled, scaled training data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_resampled)

# Model evaluation on the untouched test split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{classification_rep}')
print(f'Confusion Matrix: \n{confusion_mat}')

In [None]:
# Descriptive analysis of every column of the SMOTE-resampled features.
# (A bare `X_resampled` line used to precede this call; it displayed nothing because
# only the last expression of a cell is rendered, so it was removed.)
mphd.continous_data.descriptive_analysis(X_resampled, independent_variables = list(X_resampled.columns))

In [None]:
# Descriptive analysis of the scaled training features; the scaled array is wrapped back into a
# DataFrame with the X_train column names first.
mphd.continous_data.descriptive_analysis(pd.DataFrame(X_train_scaled, columns = X_train.columns), independent_variables = list(X_train.columns))


In [None]:
# AutoML settings.
# Other learners that could be listed in estimator_list: 'lgbm_spark', 'xgboost', 'catboost', 'rf'.
settings = dict(
    time_budget = 120,            # total running time in seconds
    metric = 'roc_auc',           # metric optimised during the search
    estimator_list = ['lgbm'],    # list of ML learners; only lightgbm is tuned here
    task = 'classification',      # task type
    log_file_name = None,         # flaml log file
    seed = 7654321,               # random seed
    use_spark = True,             # whether to use Spark for distributed training
    n_concurrent_trials = 2,      # the maximum number of concurrent trials
)

# Fresh 80/20 split for this experiment
X_train, X_test, y_train, y_test = mphd.pre_processing.train_test_split(
    df = df,
    independent_variables = independent_variables,
    dependent_variable = dependent_variable,
    test_size = 0.2,
)

# Run the search on the training split, then analyse the result against the test split
automl = mphd.machine_learning.automl(X_train=X_train, y_train=y_train, **settings)

mphd.analyse_ml.analyse_automl(automl=automl, X_test = X_test, y_test = y_test)
# Display the underlying estimator of the selected model
automl.model.estimator

In [None]:
# Report scores for the AutoML result under the same names the grid-search cells use
grid_search = automl
# The original line (`cv_score = grid_search.`) was an incomplete attribute access and a syntax error.
# FLAML minimises a loss, and for the 'roc_auc' metric that loss is 1 - AUC, so convert it back to a score.
# NOTE(review): assumes mphd.machine_learning.automl returns a fitted flaml.AutoML — confirm.
cv_score = 1 - grid_search.best_loss
# NOTE(review): score() presumably delegates to the estimator's default scorer (accuracy for
# classifiers), so it may not be directly comparable with the ROC AUC above — confirm.
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

In [None]:
# Logistic Regression
# Significance threshold for p-values
acceptable_p_value = 0.05

# Binomial logistic regression (statsmodels Logit mode) of the dependent variable on the
# demographic and investigation variables; returns the fitted models and a summary frame.
logistic_models, summary_logistic_models = mphd.regression.regression_list(
    df = df,
    mode = "sm.Logit",
    independent_variables = independent_demographic + independent_investigations,
    dependent_variables = dependent_variable,
    p_value_cut_off = acceptable_p_value,
)

# Columns of the summary to keep for the short report
columns_to_display = ("pseudo_r_2", "log_likelihood", "llr_p_value", "aic_akaike_information_criterion", "bic_bayesin_information_criterion", "coeff_all_significant")
summary_columns = ("variables", "num_variables") + columns_to_display + (
    "roc", "shapiro_residual",
    "Lagrange_Multiplier", "Lagrange_Multiplier_p-value",
    "F-statistic", "F-statistic_p-value",
)

# Keep the top 3 models per criterion (the boolean per criterion is passed through to the helper as-is)
summary_logistic_short = mphd.regression.analyse_model_summary(
    summary_logistic_models.loc[:,summary_columns],
    top_count = 3,
    parameters = {"aic_akaike_information_criterion": True,
                  "bic_bayesin_information_criterion": True,
                  "pseudo_r_2": False,
                  "roc":False},
)
print(summary_logistic_short.round(4).to_markdown(tablefmt = "pretty"))
# NOTE(review): 2893 is a hard-coded model key, presumably picked from the table above — re-check after any data change
print(logistic_models[2893].summary())

In [None]:
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

# Candidate hyperparameter values. Only some of them are wired into lgbm_params below;
# the others are defined but not searched.
n_estimators = [100, 300]
max_depth = [3,4,5]
min_child_weight = range(1,3,1)
booster = ['gbdt']
base_score = [0.5,0.6]
learning_rate = [0.1,0.2]
objective = ['binary']
seed = [27]
gamma= [0.7,0.8,0.9]
colsample_bytree=[0.7,0.8,0.9]
subsample=[0.6,0.7,0.8]
reg_alpha = [1e-5,0.01,0.03]
weights = np.linspace(0.3, 0.9, 2)
num_leaves = [6]

# Search grid, keyed as "<pipeline step>__<parameter>"
lgbm_params = dict(
    classifier__max_depth = max_depth,
    classifier__learning_rate = learning_rate,
    classifier__min_child_weight = min_child_weight,
    classifier__boosting_type = booster,
    classifier__seed = seed,
    smote__sampling_strategy = weights,
    classifier__num_leaves = num_leaves,
)

# X_train / y_train are reused from the split made in an earlier cell.

# Robust-scale the continuous columns; every other column is passed through
preprocessor = ColumnTransformer(
    transformers = [('num', RobustScaler(), independent_continous)],
    remainder = 'passthrough',
)

# Oversample -> scale -> classify; the step names are the prefixes used in lgbm_params
pipeline_steps = [
    ('smote', SMOTE(random_state=11)),
    ('scaler', preprocessor),
    ('classifier', LGBMClassifier()),
]
pipeline = imbpipeline(pipeline_steps)

# 5-fold stratified, shuffled cross-validation with a fixed seed
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

# Exhaustive search over the grid, scored by ROC AUC, using all available cores
param_grid = lgbm_params
grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    scoring = 'roc_auc',
    cv = stratified_kfold,
    n_jobs = -1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Compare the best cross-validated score from the search with the score on the held-out test split
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

In [None]:
# This cell uses plt, roc_curve and roc_auc_score; import them here so the cell
# runs on a fresh kernel instead of relying on leftover kernel state.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Get the best estimator from the GridSearchCV object
best_estimator = grid_search.best_estimator_

# Predicted probability of the positive class (column 1) for the test set
y_test_proba = best_estimator.predict_proba(X_test)[:, 1]

# Compute the fpr, tpr, and thresholds for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# Plot the ROC curve together with the diagonal of a random classifier
plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')

# Compute the AUC
auc = roc_auc_score(y_test, y_test_proba)

# Add the AUC score to the graph
plt.annotate(f'AUC = {auc:.4f}', xy=(0.8, 0.2), xycoords='axes fraction')

plt.legend(loc='best')
plt.show()

In [None]:
# Display the hyperparameter combination selected by the grid search
grid_search.best_params_