In [None]:
%pip install --upgrade pip "flaml[spark]" setuptools wheel "scikit-learn==1.1.2"
# %pip install --upgrade FLAML optuna

In [4]:
# To allow own package to be imported
import sys
import os
if os.path.dirname(os.getcwd()) not in sys.path:
    sys.path.append(os.path.dirname(os.getcwd()))
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
from warnings import filterwarnings
filterwarnings(action = "ignore")

# import necessary packages
import pandas as pd
import numpy as np
import polars as pl
import plotly.express as px
import wh0102 as mphd

# Prepare the data dictionary
data_dictionary = {
    "Ethnic":{0:"Malay", 1:"Chinese", 2:"Indian"},
    "bmi":{0:"Normal BMI", 1:"Overweight"},
    "Disease":{0:"No liver disease", 1:"Have Liver Disease"},
    "Gender":{0:"Female", 1:"Male"}
}

# Prepare the variables
dependent_variable = "Disease"
independent_demographic = ("Age", "Gender", "Ethnic", "bmi",)
independent_investigations = ("AGR", "ALB", "TP", "TB", "DB", "Alkphos", "Sgot", "Sgpt",)
independent_continous = (independent_demographic[0],) + independent_investigations
independent_categorical = independent_demographic[1:]

# Load the data
df = pd.read_csv(r"assignment4.csv")

# To reassign the categorical value
for column in [key for key in data_dictionary.keys() if key != "Gender"]:
    df.loc[:,column] = df.loc[:,column] - 1

# Print the information
# df.info()

# To delete after this
missing_df = mphd.missing_values.analyse_missing_row(df)
df = mphd.categorical_data.label_encode(df = df, columns = "Gender", convert_numeric=True)
df = mphd.missing_values.mice_imputation(df = df, columns = "AGR")

Missing data detected for columns AGR.
Summary of the missing values from the dataframe =
+------------------------------+-------+--------------------+
|                              | count | missing_percentage |
+------------------------------+-------+--------------------+
|             AGR              |  4.0  |        0.72        |
| All_rows_with_missing_values |  4.0  |        0.72        |
+------------------------------+-------+--------------------+


In [3]:
from flaml import AutoML
from sklearn.model_selection import train_test_split
automl = AutoML()
settings = {
    "time_budget": 30,  # total running time in seconds
    "metric": 'r2',  # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']
    "estimator_list": ['lgbm'],  # list of ML learners; we tune lightgbm in this example
    "task": 'regression',  # task type    
    "log_file_name": 'houses_experiment.log',  # flaml log file
    "seed": 7654321,    # random seed
    "use_spark": False,  # whether to use Spark for distributed training
    "n_concurrent_trials": 2,  # the maximum number of concurrent trials
}
X_train, X_test, y_train, y_test = train_test_split(df.loc[:,independent_demographic+independent_investigations],
                                                    df.loc[:,dependent_variable],
                                                    test_size=0.2,
                                                    stratify=df.loc[:,dependent_variable],
                                                    random_state=11)
automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl.logger: 05-25 15:59:11] {1680} INFO - task = regression


[flaml.automl.logger: 05-25 15:59:11] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 05-25 15:59:11] {1789} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 05-25 15:59:11] {1901} INFO - List of ML learners in AutoML Run: ['lgbm']


You passed a `space` parameter to OptunaSearch that contained unresolved search space definitions. OptunaSearch should however be instantiated with fully configured search spaces only. To use Ray Tune's automatic search space conversion, pass the space definition as part of the `config` argument to `tune.run()` instead.
[I 2024-05-25 15:59:11,997] A new study created in memory with name: optuna
[I 2024-05-25 15:59:12,057] A new study created in memory with name: optuna


ImportError: No module named 'joblibspark'. Try pip install flaml[spark] or set use_spark=False.

In [None]:
print('Best hyperparmeter config:', automl.best_config)
print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
automl.model.estimator

In [None]:
import matplotlib.pyplot as plt
plt.barh(automl.feature_names_in_, automl.feature_importances_)

In [None]:
y_pred = automl.predict(X_test)
from flaml.ml import sklearn_metric_loss_score
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))

In [None]:
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.metrics import accuracy_score

dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_test, label=y_test)

params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    callbacks=[early_stopping(100), log_evaluation(100)],
)

prediction = np.rint(model.predict(X_test, num_iteration=model.best_iteration))
accuracy = accuracy_score(y_test, prediction)

best_params = model.params
print("Best params:", best_params)
print("  Accuracy = {}".format(accuracy))
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

In [None]:
import numpy as np
import optuna.integration.lightgbm as lgb

from lightgbm import early_stopping
from lightgbm import log_evaluation
import sklearn.datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


if __name__ == "__main__":
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, val_x, train_y, val_y = train_test_split(data, target, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(val_x, label=val_y)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        callbacks=[early_stopping(100), log_evaluation(100)],
    )

    prediction = np.rint(model.predict(val_x, num_iteration=model.best_iteration))
    accuracy = accuracy_score(val_y, prediction)

    best_params = model.params
    print("Best params:", best_params)
    print("  Accuracy = {}".format(accuracy))
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

In [None]:
# Convert the gendfer
df = mphd.categorical_data.label_encode(df = df, columns = "Gender", convert_numeric=True)
# "Gender":{0:"Female", 1:"Male"}

# Check for duplication
duplicated_df, to_drop_duplicated_df = mphd.pre_processing.check_duplication(df)

# Check for missing value
missing_df = mphd.missing_values.analyse_missing_row(df)
missing_df

## To fix AGR == ' ' issue

In [None]:
# Calculate the Globulin for every patient ID with globulin = tp - alb based on resource below:
# https://www.ncbi.nlm.nih.gov/books/NBK204/#:~:text=The%20total%20globulin%20fraction%20is,of%20further%20fractionating%20serum%20proteins

# To check the truthness of this on the data
# Create a deep copy of the df with AGR not null first
temp_df = df.query("AGR.notnull()").copy(deep = True)

# Calculatet the globulin and agr_ratio
def calculate_agr(df:pd.DataFrame, column_name:str):
    df.loc[:,column_name] = df.loc[:,"ALB"] / (df.loc[:,"TP"] - df.loc[:,"ALB"])
    return df

# Calculate the approximate agr
temp_df = calculate_agr(df = temp_df, column_name = "agr_new")
# Check for float similarity
temp_df.loc[:,"agr_similarity"] = temp_df.loc[:,("AGR", "agr_new",)].apply(lambda x: np.isclose(float(x[0]), x[1], rtol = 0.1), axis = 1)

# Pivot the information
pt = temp_df.pivot_table(index = "agr_similarity", values = "Patient_ID", aggfunc = len, margins = True).rename(columns={"Patient_ID":"count"})
# Calculate percentage
pt.loc[:,"percentage"] = round(pt.loc[:,"count"] / pt.loc["All", "count"] * 100, 2)

print(pt.to_markdown(tablefmt = "pretty"))

# Check on the not similarity result
temp_df.query("agr_similarity == False")

In [None]:
# Trial to impute with calculation
missing_df = calculate_agr(df = missing_df, column_name="AGR")
missing_df

In [None]:
# Impute with MICE
# https://medium.com/@brijesh_soni/topic-9-mice-or-multivariate-imputation-with-chain-equation-f8fd435ca91#:~:text=MICE%20stands%20for%20Multivariate%20Imputation,produce%20a%20final%20imputed%20dataset.

df = mphd.missing_values.mice_imputation(df = df, columns = "AGR")
# Check on the imputated value
df.loc[missing_df.index]

__intepretation__:

For imputation, despite the logic of how AGR being calculated, there is a lot of noise in the data for AGR value, therefore we would use MICE for imputation.

In [None]:
# Reverse Encode
data = mphd.categorical_data.reverse_encode(df = df, json_dict=data_dictionary)
data

## Continous Data

In [None]:
# Check for normal distribution
normal_distribution_list, abnormal_distribution_list = mphd.continous_data.descriptive_analysis(df = df, 
                                                                                                independent_variables=independent_continous, 
                                                                                                dependent_variables = dependent_variable,
                                                                                                descriptive_type = "continous",
                                                                                                plot_dependent_variables = False,
                                                                                                plot_correlation = False, 
                                                                                                round = 4)

## Categorical Data

In [None]:
categorical_summary = mphd.categorical_data.categorical_descriptive_analysis(data,
                                                                             independent_variables = independent_categorical, 
                                                                             dependent_variables = dependent_variable)

## Model Training

In [None]:
# Logistic Regression
# set acceptable p value
acceptable_p_value = 0.05

# For binominal logistic regression with 2 different depression score outcome along with all independent variable are categorized
logistic_models, summary_logistic_models = mphd.regression.regression_list(df = df, mode = "sm.Logit",
                                                                                   independent_variables = independent_demographic + independent_investigations,
                                                                                   dependent_variables = dependent_variable,
                                                                                   p_value_cut_off = acceptable_p_value)

# To display some information
columns_to_display = ("pseudo_r_2", "log_likelihood", "llr_p_value", "aic_akaike_information_criterion", "bic_bayesin_information_criterion", "coeff_all_significant")
summary_logistic_short = mphd.regression.analyse_model_summary(summary_logistic_models.loc[:,("variables", "num_variables") + columns_to_display + 
                                                                                      ("roc", "shapiro_residual", 
                                                                                       "Lagrange_Multiplier", "Lagrange_Multiplier_p-value",
                                                                                       "F-statistic", "F-statistic_p-value")], 
                                                      top_count = 3,
                                                      parameters= {"aic_akaike_information_criterion": True,
                                                                   "bic_bayesin_information_criterion": True,
                                                                   "pseudo_r_2": False, 
                                                                   "roc":False})
print(summary_logistic_short.round(4).to_markdown(tablefmt = "pretty"))
print(logistic_models[2893].summary())

In [None]:
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

num_cols = ['age', 'bmi', 'sys_bp', 'dias_bp', 'hba1c', 'ldl']
cat_cols = ['sex', 'ethnic', 'retinopathy', 'ihd', 'cevd', 'nephropathy']

n_estimators = [100, 300]
max_depth = [3,4,5]
min_child_weight = range(1,3,1)
booster = ['gbdt']
base_score = [0.5,0.6]
learning_rate = [0.1,0.2]
objective = ['binary']
seed = [27]
gamma= [0.7,0.8,0.9]
colsample_bytree=[0.7,0.8,0.9]
subsample=[0.6,0.7,0.8]
reg_alpha = [1e-5,0.01,0.03]
weights = np.linspace(0.3, 0.9, 2)
num_leaves = [6]

lgbm_params = {'classifier__n_estimators': n_estimators, 'classifier__max_depth': max_depth,
               'classifier__learning_rate' : learning_rate, 'classifier__min_child_weight' : min_child_weight, 
               'classifier__boosting_type' : booster, 'classifier__seed':seed,'smote__sampling_strategy': weights,
               'classifier__reg_alpha':reg_alpha, 'classifier__num_leaves':num_leaves}

X_train, X_test, y_train, y_test = train_test_split(df.loc[:,independent_demographic+independent_investigations],
                                                    df.loc[:,dependent_variable],
                                                    test_size=0.2,
                                                    stratify=df.loc[:,dependent_variable],
                                                    random_state=11)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), independent_continous),
    ],
    remainder='passthrough'
)

pipeline = Pipeline([('smote', SMOTE(random_state=11)),
                     ('scaler', preprocessor),
                     ('classifier', LGBMClassifier())])

stratified_kfold = StratifiedKFold(n_splits=5,
                                   shuffle=True,
                                   random_state=11)
    
param_grid = lgbm_params
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

model = grid_search.fit(X_train, y_train, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100)
model.predict(X_test)