In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version

OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.12 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    mod = None
    try:
        mod = importlib.import_module(pkg)
        if pkg in {'PIL'}:
            ver = mod.VERSION
        else:
            ver = mod.__version__
        if Version(ver) == Version(min_ver):
            print(OK, "%s version %s is installed."
                  % (lib, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (lib, min_ver, ver))    
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
pyversion = Version(python_version())

if pyversion >= Version("3.12.5"):
    print(OK, "Python version is %s" % pyversion)
elif pyversion < Version("3.12.5"):
    print(FAIL, "Python version 3.12.5 is required,"
                " but %s is installed." % pyversion)
else:
    print(FAIL, "Unknown Python version: %s" % pyversion)

    
print()
requirements = {'numpy': "1.26.4", 'matplotlib': "3.9.2",'sklearn': "1.5.1", 
                'pandas': "2.2.2",'xgboost': "2.1.1", 'shap': "0.45.1", 
                'plotly': "5.23.0"}

# now the dependencies
for lib, required_version in list(requirements.items()):
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.12.5

[42m[ OK ][0m numpy version 1.26.4 is installed.
[42m[ OK ][0m matplotlib version 3.9.2 is installed.
[42m[ OK ][0m sklearn version 1.5.1 is installed.
[42m[ OK ][0m pandas version 2.2.2 is installed.
[42m[ OK ][0m xgboost version 2.1.1 is installed.
[42m[ OK ][0m shap version 0.45.1 is installed.
[42m[ OK ][0m plotly version 5.23.0 is installed.


In [2]:
#Reading data

import pandas as pd

df = pd.read_csv('data/ad_click_dataset.csv')
print(df)

        id full_name   age      gender device_type ad_position  \
0      670   User670  22.0         NaN     Desktop         Top   
1     3044  User3044   NaN        Male     Desktop         Top   
2     5912  User5912  41.0  Non-Binary         NaN        Side   
3     5418  User5418  34.0        Male         NaN         NaN   
4     9452  User9452  39.0  Non-Binary         NaN         NaN   
...    ...       ...   ...         ...         ...         ...   
9995  8510  User8510   NaN         NaN      Mobile         Top   
9996  7843  User7843   NaN      Female     Desktop      Bottom   
9997  3914  User3914   NaN        Male      Mobile        Side   
9998  7924  User7924   NaN         NaN     Desktop         NaN   
9999  3056  User3056  44.0        Male      Tablet         Top   

     browsing_history time_of_day  click  
0            Shopping   Afternoon      1  
1                 NaN         NaN      1  
2           Education       Night      1  
3       Entertainment     Evening  

In [3]:
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold

X = df.drop(columns=['click', 'id', 'full_name']) #Dropping full name, as it adds no value
y = df['click']  #Target variable
groups = df['id'] 

def group_split_with_stratifiedGroupkfold(X, y, groups, train_size, test_size, n_splits, random_state):
    if train_size + test_size != 1:
        raise ValueError('The sum of the train_size and test_size should be 1.')
    
    #Using GroupShuffleSplit to split into train+validation and test set
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    for i_train_val, i_test in splitter.split(X, y, groups):
        X_train_val, y_train_val, groups_train_val = X.iloc[i_train_val], y.iloc[i_train_val], groups.iloc[i_train_val]
        X_test, y_test, groups_test = X.iloc[i_test], y.iloc[i_test], groups.iloc[i_test]
    
    print('Test set size:', X_test.shape, y_test.shape)
    print()
    
    #Using Stratified GroupKFold to split the train+validation set into train and validation sets
    skf = StratifiedGroupKFold(n_splits=n_splits)
    for i_train, i_val in skf.split(X_train_val, y_train_val, groups_train_val):
        X_train, y_train, groups_train = X_train_val.iloc[i_train], y_train_val.iloc[i_train], groups_train_val.iloc[i_train]
        X_val, y_val, groups_val = X_train_val.iloc[i_val], y_train_val.iloc[i_val], groups_train_val.iloc[i_val]
    
        print('Training set size:', X_train.shape, y_train.shape) 
        print('Validation set size:', X_val.shape, y_val.shape)
        

    return X_train, y_train, X_val, y_val, X_test, y_test

random_state = 42
n_splits = 5  #Number of folds for GroupKFold
X_train, y_train, X_val, y_val, X_test, y_test = group_split_with_stratifiedGroupkfold(X, y, groups, train_size=0.8, test_size=0.2, n_splits=n_splits, random_state=random_state)

#print(X_train.head())
#print(X_val.head())
#print(X_test.head())

Test set size: (2019, 6) (2019,)

Training set size: (6385, 6) (6385,)
Validation set size: (1596, 6) (1596,)
Training set size: (6383, 6) (6383,)
Validation set size: (1598, 6) (1598,)
Training set size: (6384, 6) (6384,)
Validation set size: (1597, 6) (1597,)
Training set size: (6386, 6) (6386,)
Validation set size: (1595, 6) (1595,)
Training set size: (6386, 6) (6386,)
Validation set size: (1595, 6) (1595,)


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

cat_ftrs = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
num_ftrs = ['age']

# one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs)])

clf = Pipeline(steps=[('preprocessor', preprocessor)]) 

X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

print(X_train.shape)
print(X_train_prep.shape)
#print(X_train_prep)

print(X_val.shape)
print(X_val_prep.shape)
#print(X_val_prep)

print(X_test.shape)
print(X_test.shape)
#print(X_test_prep)

(6386, 6)
(6386, 24)
(1595, 6)
(1595, 24)
(2019, 6)
(2019, 6)


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, mean_squared_error, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid, KFold
import pickle

param_grid = {
    "learning_rate": [0.03],
    "n_estimators": [10000],
    "missing": [np.nan],
    "colsample_bytree": [0.9],
    "subsample": [0.66],
    "reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    "reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    "max_depth": [1, 3, 10, 30, 100]
}
pg = ParameterGrid(param_grid)

results = {
    'test_rmse': [],
    'test_accuracy': [],
    'test_precision': [],
    'test_recall': [],
    'test_f1': []
}

random_states = [2, 7, 4, 9, 3]

for random_state in random_states:
    print(f"\nRandom State {random_state}...")

    X_train_transformed = preprocessor.fit_transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)
    X_test_transformed = preprocessor.transform(X_test)

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    scores = []
    for params in pg:
        fold_rmse = []

        for train_idx, val_idx in kf.split(X_train_transformed):
            X_cv_train, X_cv_val = X_train_transformed[train_idx], X_train_transformed[val_idx]
            y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = xgb.XGBClassifier(
                **params, random_state=random_state, eval_metric='logloss',
                n_jobs=-1, early_stopping_rounds=50
            )
            model.fit(
                X_cv_train, y_cv_train,
                eval_set=[(X_cv_val, y_cv_val)],
                verbose=False
            )

            y_cv_pred = model.predict(X_cv_val)
            rmse = np.sqrt(mean_squared_error(y_cv_val, y_cv_pred))
            fold_rmse.append(rmse)

        cv_mean_rmse = np.mean(fold_rmse)
        scores.append(cv_mean_rmse)

    best_idx = np.argmin(scores)
    best_params = list(pg)[best_idx]
    print(f"Best parameters after CV: {best_params}")

    best_model = xgb.XGBClassifier(
        **best_params, random_state=random_state, eval_metric='logloss',
        n_jobs=-1, early_stopping_rounds=50
    )
    best_model.fit(
        X_train_transformed, y_train,
        eval_set=[(X_val_transformed, y_val)],
        verbose=False
    )

    y_test_pred = best_model.predict(X_test_transformed)

    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_acc = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)

    results['test_rmse'].append(test_rmse)
    results['test_accuracy'].append(test_acc)
    results['test_precision'].append(test_precision)
    results['test_recall'].append(test_recall)
    results['test_f1'].append(test_f1)

    # Print the metrics for this random state
    print(f"Best Params for Random State {random_state}: {best_params}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")
    print(f"Test F1-Score: {test_f1:.4f}")

# Summary metrics
mean_rmse = np.mean(results['test_rmse'])
std_rmse = np.std(results['test_rmse'])
mean_accuracy = np.mean(results['test_accuracy'])
std_accuracy = np.std(results['test_accuracy'])
mean_precision = np.mean(results['test_precision'])
std_precision = np.std(results['test_precision'])
mean_recall = np.mean(results['test_recall'])
std_recall = np.std(results['test_recall'])
mean_f1 = np.mean(results['test_f1'])
std_f1 = np.std(results['test_f1'])

print("\nSummary:")
print(f"Test RMSE: {mean_rmse:.4f} +/- {std_rmse:.4f}")
print(f"Test Accuracy: {mean_accuracy:.4f} +/- {std_accuracy:.4f}")
print(f"Test Precision: {mean_precision:.4f} +/- {std_precision:.4f}")
print(f"Test Recall: {mean_recall:.4f} +/- {std_recall:.4f}")
print(f"Test F1-Score: {mean_f1:.4f} +/- {std_f1:.4f}")


with open("results.pkl", "wb") as file:
    pickle.dump(results, file)

print("Results have been saved to results.pkl")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test, y_test_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1]) 
disp.plot(cmap='Blues', values_format='d')
plt.title(f"Confusion Matrix for XGB")
plt.savefig('model1.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
#Global
import shap

df_test = pd.DataFrame(X_test_transformed, columns=preprocessor.get_feature_names_out())

explainer = shap.TreeExplainer(XGB)  
shap_values = explainer.shap_values(df_test)  

mean_shap_values = np.abs(shap_values).mean(axis=0)

top_10_indices = np.argsort(mean_shap_values)[-10:]
top_10_features = df_test.columns[top_10_indices]
top_10_importances = mean_shap_values[top_10_indices]

plt.figure(figsize=(10, 6))
plt.barh(top_10_features[::-1], top_10_importances[::-1]) 
plt.xlabel('Mean |SHAP Value|')
plt.title('Top 10 Features by SHAP Importance')
plt.savefig('model2.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import numpy as np

feature_names = np.array(feature_names)

perm_importance = permutation_importance(
    XGB,
    df_test, 
    y_test,         
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)

importances = perm_importance.importances_mean
sorted_indices = np.argsort(importances)[::-1]

top_10_indices = sorted_indices[:10]
top_10_features = feature_names[top_10_indices]
top_10_importances = importances[top_10_indices]

plt.figure(figsize=(10, 6))
plt.barh(range(10), top_10_importances[::-1], align="center", color="skyblue")
plt.yticks(range(10), top_10_features[::-1])
plt.xlabel("Mean Permutation Importance")
plt.title("Top 10 Feature Importances")
plt.tight_layout()
plt.savefig('model3.png', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
#xgb 5 metrics

feature_names = preprocessor.get_feature_names_out()

feature_mapping = {f"f{i}": name for i, name in enumerate(feature_names)}

importance_types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
feature_importances = {}

for i in importance_types:
    raw_importances = XGB.get_booster().get_score(importance_type=i)
    feature_importances[i] = {feature_mapping.get(k, k): v for k, v in raw_importances.items()}

def get_sorted_importance(importance_dict):
    sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
    features, scores = zip(*sorted_importance)
    return features, scores

for i in importance_types:
    features, scores = get_sorted_importance(feature_importances[i])
    top_10_features = features[:10]
    top_10_scores = scores[:10]

    plt.figure(figsize=(10, 6))
    plt.barh(top_10_features[::-1], top_10_scores[::-1])  
    plt.xlabel('Importance Score')
    plt.title(f'Top 10 Features by {i.capitalize()}')
    plt.tight_layout()
    #plt.savefig('model4.png', bbox_inches='tight', dpi=300)
    plt.show()


In [None]:
#local
shap.initjs()

feature_names = preprocessor.get_feature_names_out()

explainer = shap.TreeExplainer(XGB)

index = 0  
shap_values = explainer.shap_values(df_test) 

shap.force_plot(
    explainer.expected_value,
    shap_values[index],
    features=df_test.iloc[index],
    feature_names=feature_names 
)

In [None]:
explainer = shap.TreeExplainer(XGB)

index = 100  
shap_values = explainer.shap_values(df_test) 

shap.force_plot(
    explainer.expected_value,
    shap_values[index],
    features=df_test.iloc[index],
    feature_names=feature_names 
)

In [None]:
explainer = shap.TreeExplainer(XGB)

index = 200  
shap_values = explainer.shap_values(df_test) 

shap.force_plot(
    explainer.expected_value,
    shap_values[index],
    features=df_test.iloc[index],
    feature_names=feature_names 
)

In [17]:
df = pd.read_csv('data/ad_click_dataset.csv')

In [18]:


X = df.drop(columns=['click', 'id', 'full_name']) #Dropping full name, as it adds no value
y = df['click']  #Target variable
groups = df['id'] 

def group_split_with_stratifiedGroupkfold(X, y, groups, train_size, test_size, n_splits, random_state):
    if train_size + test_size != 1:
        raise ValueError('The sum of the train_size and test_size should be 1.')
    
    #Using GroupShuffleSplit to split into train+validation and test set
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    for i_train_val, i_test in splitter.split(X, y, groups):
        X_train_val, y_train_val, groups_train_val = X.iloc[i_train_val], y.iloc[i_train_val], groups.iloc[i_train_val]
        X_test, y_test, groups_test = X.iloc[i_test], y.iloc[i_test], groups.iloc[i_test]
    
    print('Test set size:', X_test.shape, y_test.shape)
    
    #Using Stratified GroupKFold to split the train+validation set into train and validation sets
    skf = StratifiedGroupKFold(n_splits=n_splits)
    for i_train, i_val in skf.split(X_train_val, y_train_val, groups_train_val):
        X_train, y_train, groups_train = X_train_val.iloc[i_train], y_train_val.iloc[i_train], groups_train_val.iloc[i_train]
        X_val, y_val, groups_val = X_train_val.iloc[i_val], y_train_val.iloc[i_val], groups_train_val.iloc[i_val]
    
        print('Training set size:', X_train.shape, y_train.shape) 
        print('Validation set size:', X_val.shape, y_val.shape)
        

    return X_train, y_train, X_val, y_val, X_test, y_test

random_state = 42
n_splits = 5  #Number of folds for GroupKFold
X_train, y_train, X_val, y_val, X_test, y_test = group_split_with_stratifiedGroupkfold(X, y, groups, train_size=0.8, test_size=0.2, n_splits=n_splits, random_state=random_state)

Test set size: (2019, 6) (2019,)
Training set size: (6385, 6) (6385,)
Validation set size: (1596, 6) (1596,)
Training set size: (6383, 6) (6383,)
Validation set size: (1598, 6) (1598,)
Training set size: (6384, 6) (6384,)
Validation set size: (1597, 6) (1597,)
Training set size: (6386, 6) (6386,)
Validation set size: (1595, 6) (1595,)
Training set size: (6386, 6) (6386,)
Validation set size: (1595, 6) (1595,)


In [19]:
import numpy as np
def preprocess_reduced_features(X_train, y_train, X_val, y_val, X_test, y_test):
    mask = X_test.isnull()
    unique_rows = np.array(np.unique(mask, axis=0))
    all_subsets = []

    print(f'There are {len(unique_rows)} unique missing value patterns.')

    for i, pattern in enumerate(unique_rows):
        print(f'Processing pattern {i + 1}')

        sub_X_test = X_test.loc[(mask == pattern).all(axis=1), :]
        sub_y_test = y_test.loc[sub_X_test.index]

        sub_X_train = X_train.loc[:, ~pattern]
        sub_X_val = X_val.loc[:, ~pattern]
        sub_X_train = sub_X_train.dropna()
        sub_X_val = sub_X_val.dropna()

        sub_y_train = y_train.loc[sub_X_train.index]
        sub_y_val = y_val.loc[sub_X_val.index]

        # Dynamically updating preprocessor for reduced features
        cat_ftrs_dynamic = [col for col in cat_ftrs if col in sub_X_train.columns]
        num_ftrs_dynamic = [col for col in num_ftrs if col in sub_X_train.columns]

        # Skip if no features are left
        if not cat_ftrs_dynamic and not num_ftrs_dynamic:
            print(f"   Pattern {i + 1} skipped (0 features remaining).")
            continue

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, num_ftrs_dynamic),
                ('cat', categorical_transformer, cat_ftrs_dynamic)
            ]
        )

        sub_X_train_prep = preprocessor.fit_transform(sub_X_train)
        sub_X_val_prep = preprocessor.transform(sub_X_val)
        sub_X_test_prep = preprocessor.transform(sub_X_test.loc[:, ~pattern])

        # Ensuring preprocessed data has at least 1 feature
        if sub_X_train_prep.shape[1] == 0:
            print(f"   Pattern {i + 1} skipped (0 features after preprocessing).")
            continue

        all_subsets.append({
            'pattern': i + 1,
            'X_train_prep': sub_X_train_prep,
            'y_train': sub_y_train,
            'X_val_prep': sub_X_val_prep,
            'y_val': sub_y_val,
            'X_test_prep': sub_X_test_prep,
            'y_test': sub_y_test
        })

        print(f'   Pattern {i + 1}: Train size {sub_X_train_prep.shape}, Test size {sub_X_test_prep.shape}')

    return all_subsets

reduced_datasets = preprocess_reduced_features(X_train, y_train, X_val, y_val, X_test, y_test)

print(f'Total subsets created: {len(reduced_datasets)}')

There are 64 unique missing value patterns.
Processing pattern 1
   Pattern 1: Train size (517, 19), Test size (142, 19)
Processing pattern 2
   Pattern 2: Train size (628, 15), Test size (34, 15)
Processing pattern 3
   Pattern 3: Train size (909, 14), Test size (142, 14)
Processing pattern 4
   Pattern 4: Train size (1118, 10), Test size (44, 10)
Processing pattern 5
   Pattern 5: Train size (636, 16), Test size (37, 16)
Processing pattern 6
   Pattern 6: Train size (783, 12), Test size (3, 12)
Processing pattern 7
   Pattern 7: Train size (1128, 11), Test size (38, 11)
Processing pattern 8
   Pattern 8: Train size (1404, 7), Test size (11, 7)
Processing pattern 9
   Pattern 9: Train size (623, 16), Test size (32, 16)
Processing pattern 10
   Pattern 10: Train size (762, 12), Test size (7, 12)
Processing pattern 11
   Pattern 11: Train size (1110, 11), Test size (36, 11)
Processing pattern 12
   Pattern 12: Train size (1366, 7), Test size (6, 7)
Processing pattern 13
   Pattern 13: T

In [20]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def MLpipe_multiple_random_states(reduced_datasets, models, param_grids, random_states):
    overall_results = {}

    for model_name, model in models.items():
        print(f"Evaluating {model_name} across all missing patterns and random states...")

        model_results = {
            'Validation RMSE': [],
            'Test RMSE': [],
            'Accuracy': [],
            'Precision': [],
            'Recall': [],
            'F1-Score': [],
            'Best Models': []
        }

        for random_state in random_states:
            print(f"\nRandom State: {random_state}...") 
            for subset in reduced_datasets:
                X_train = subset['X_train_prep']
                y_train = subset['y_train']
                X_val = subset['X_val_prep']
                y_val = subset['y_val']
                X_test = subset['X_test_prep']
                y_test = subset['y_test']

                kfold = KFold(n_splits=5, shuffle=True, random_state=random_state)
                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid=param_grids[model_name],
                    scoring='neg_root_mean_squared_error',
                    cv=kfold,
                )
                
                grid_search.fit(X_train, y_train)
                best_model = grid_search.best_estimator_
                best_params = grid_search.best_params_  

                
                print(f"Best Params for {model_name} (Random State {random_state}): {best_params}")

                y_test_pred = best_model.predict(X_test)

            
                if np.issubdtype(y_test_pred.dtype, np.floating):
                    y_test_pred = (y_test_pred >= 0.5).astype(int)

                test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
                model_results['Test RMSE'].append(test_rmse)

                accuracy = accuracy_score(y_test, y_test_pred)
                precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
                recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
                f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

                model_results['Accuracy'].append(accuracy)
                model_results['Precision'].append(precision)
                model_results['Recall'].append(recall)
                model_results['F1-Score'].append(f1)

                model_results['Best Models'].append(best_model)

        overall_results[model_name] = {
            'Mean Test RMSE': np.mean(model_results['Test RMSE']),
            'Std Test RMSE': np.std(model_results['Test RMSE']),
            'Mean Accuracy': np.mean(model_results['Accuracy']),
            'Mean Precision': np.mean(model_results['Precision']),
            'Mean Recall': np.mean(model_results['Recall']),
            'Mean F1-Score': np.mean(model_results['F1-Score']),
            'Std F1-Score': np.std(model_results['F1-Score']),  
            'Best Models': model_results['Best Models']
        }

        # Print overall metrics for the current model
        print(f"{model_name} - Mean Test RMSE: {np.mean(model_results['Test RMSE']):.4f}, "
              f"Std Test RMSE: {np.std(model_results['Test RMSE']):.4f}")
        print(f"{model_name} - Mean Accuracy: {np.mean(model_results['Accuracy']):.4f}, "
              f"Mean Precision: {np.mean(model_results['Precision']):.4f}, "
              f"Mean Recall: {np.mean(model_results['Recall']):.4f}, "
              f"Mean F1-Score: {np.mean(model_results['F1-Score']):.4f}, "
              f"Std F1-Score: {np.std(model_results['F1-Score']):.4f}\n") 

    return overall_results


models = {
    'Lasso': Lasso(max_iter=10000),
    'Ridge': Ridge(max_iter=10000),
    'ElasticNet': ElasticNet(max_iter=10000),
    'RandomForest': RandomForestClassifier(),
    'KNeighbors': KNeighborsRegressor()
}

param_grids = {
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1, 10]},
    'Ridge': {'alpha': [0.001, 0.01, 0.1, 1, 10]},
    'ElasticNet': {'alpha': [0.001, 0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]},
    'RandomForest': {'n_estimators': [1, 3, 10, 30], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
    'KNeighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
}

random_states = [2, 7, 4, 9, 3]

# Evaluating models on reduced datasets
results = MLpipe_multiple_random_states(reduced_datasets, models, param_grids, random_state)

print("\nReduced Features Model Evaluation Summary:")
print(f"{'Model':<15}{'Mean Test RMSE':>18}{'Std Test RMSE':>18}")
print("-" * 51)

for model_name, result in results.items():
    mean_test_rmse = result.get('Mean Test RMSE', 'N/A')
    std_test_rmse = result.get('Std Test RMSE', 'N/A')
    print(f"{model_name:<15}{mean_test_rmse:>18.4f}{std_test_rmse:>18.4f}")

Evaluating Lasso across all missing patterns and random states...


TypeError: 'int' object is not iterable