##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
# Core dependencies: pandas loads the parquet dataset, matplotlib draws the plots.
%pip install pandas 
%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

In [None]:
%pip install imbalanced-learn
%pip install seaborn
%pip install tensorflow
%pip install keras

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [None]:
# Can have as many cells as you want for code
import pandas as pd
import sklearn
# Relative location of the training data; it is read with pd.read_parquet further below.
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [None]:
# Load the raw training table from the parquet file referenced by `filepath`.
data = pd.read_parquet(filepath)

In [None]:
# Split the target off the feature table; rows with a missing label are treated as class 0.
y = data['f_purchase_lh'].fillna(0)

# Remove the target together with the columns that are not used as model features.
data = data.drop(columns=['f_purchase_lh', 'clntnum', 'min_occ_date', 'cltdob_fix'])

In [None]:
# Drop every column whose percentage of missing values is above `thresh`.
thresh = 50
missing_pct = data.isna().sum() / data.shape[0] * 100
percentage = missing_pct[missing_pct > thresh].index.tolist()
data = data.drop(columns=percentage)

In [None]:
# Every column whose dtype is not exactly float64/int64 is treated as categorical here.
# This also picks up non-float64/int64 columns that hold numbers; those are converted to float further below.
cate_data = [col for col in data.columns if data[col].dtype not in ['float64', 'int64']]

In [None]:
# Fill the gaps of each categorical column with that column's most frequent value.
for col in cate_data:
    mode_value = data[col].mode().iloc[0]  # mode() can return several tied values; keep the first one
    data[col] = data[col].fillna(mode_value)

In [None]:
# Manually curated list of columns that hold numeric values but were picked up as
# categorical by the dtype filter above. Each of them is converted to float in the next step.
# Every column name appears exactly once.
decimal_columns = [
    'hh_20', 'pop_20', 'hh_size', 'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 'tot_inforce_pols',
    'ape_gi_42e115', 'ape_ltc_1280bf', 'ape_grp_6fc3e6', 'ape_grp_de05ae', 'ape_inv_dcd836', 'ape_grp_945b5a',
    'ape_grp_6a5788', 'ape_ltc_43b9d5', 'ape_grp_9cdedf', 'ape_lh_d0adeb', 'ape_grp_1581d7', 'ape_grp_22decf',
    'ape_lh_507c37', 'ape_lh_839f8a', 'ape_inv_e9f316', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_grp_caa6ff',
    'ape_grp_fd3bfb', 'ape_lh_e22a6a', 'ape_grp_70e1dd', 'ape_grp_e04c3a', 'ape_grp_fe5fb8', 'ape_gi_856320',
    'ape_grp_94baec', 'ape_gi_058815', 'ape_grp_e91421', 'ape_lh_f852af', 'ape_lh_947b15', 'sumins_gi_42e115',
    'sumins_ltc_1280bf', 'sumins_grp_6fc3e6', 'sumins_grp_de05ae', 'sumins_inv_dcd836', 'sumins_grp_945b5a', 'sumins_grp_6a5788',
    'sumins_ltc_43b9d5', 'sumins_grp_9cdedf', 'sumins_lh_d0adeb', 'sumins_grp_1581d7', 'sumins_grp_22decf', 'sumins_lh_507c37',
    'sumins_inv_e9f316', 'sumins_gi_a10d1b', 'sumins_gi_29d435', 'sumins_grp_caa6ff', 'sumins_grp_fd3bfb', 'sumins_lh_e22a6a',
    'sumins_grp_70e1dd', 'sumins_grp_e04c3a', 'sumins_grp_fe5fb8', 'sumins_gi_856320', 'sumins_grp_94baec', 'sumins_gi_058815',
    'sumins_grp_e91421', 'sumins_lh_f852af', 'sumins_lh_947b15', 'sumins_32c74c', 'prempaid_gi_42e115', 'prempaid_ltc_1280bf',
    'prempaid_grp_6fc3e6', 'prempaid_grp_de05ae', 'prempaid_inv_dcd836', 'prempaid_grp_945b5a', 'prempaid_grp_6a5788',
    'prempaid_ltc_43b9d5', 'prempaid_grp_9cdedf', 'prempaid_lh_d0adeb', 'prempaid_grp_1581d7', 'prempaid_lh_507c37', 'prempaid_lh_839f8a',
    'prempaid_inv_e9f316', 'prempaid_gi_a10d1b', 'prempaid_gi_29d435', 'prempaid_grp_caa6ff', 'prempaid_grp_fd3bfb', 'prempaid_lh_e22a6a',
    'prempaid_grp_70e1dd', 'prempaid_grp_e04c3a', 'prempaid_grp_fe5fb8', 'prempaid_gi_856320', 'prempaid_grp_94baec',
    'prempaid_gi_058815', 'prempaid_grp_e91421', 'prempaid_lh_f852af', 'prempaid_lh_947b15', 'prempaid_32c74c',
    'ape_839f8a', 'ape_e22a6a', 'ape_d0adeb', 'ape_c4bda5', 'ape_ltc', 'ape_507c37', 'ape_gi', 'f_hold_839f8a',
    'f_hold_e22a6a', 'f_hold_d0adeb', 'f_hold_c4bda5', 'f_hold_ltc', 'f_hold_507c37', 'f_hold_gi', 'sumins_839f8a',
    'sumins_e22a6a', 'sumins_d0adeb', 'sumins_c4bda5', 'sumins_ltc', 'sumins_507c37', 'sumins_gi', 'prempaid_839f8a', 'prempaid_e22a6a',
    'prempaid_d0adeb', 'prempaid_c4bda5', 'prempaid_ltc', 'prempaid_507c37', 'prempaid_gi', 'n_months_last_bought_839f8a',
    'n_months_last_bought_e22a6a', 'n_months_last_bought_d0adeb', 'n_months_last_bought_c4bda5', 'n_months_last_bought_ltc',
    'n_months_last_bought_507c37', 'n_months_last_bought_gi', 'f_ever_bought_ltc_1280bf', 'f_ever_bought_grp_6fc3e6', 'f_ever_bought_grp_de05ae',
    'f_ever_bought_inv_dcd836', 'f_ever_bought_grp_945b5a', 'f_ever_bought_grp_6a5788', 'f_ever_bought_ltc_43b9d5', 'f_ever_bought_grp_9cdedf',
    'f_ever_bought_lh_d0adeb', 'f_ever_bought_grp_1581d7', 'f_ever_bought_grp_22decf', 'f_ever_bought_lh_507c37', 'f_ever_bought_lh_839f8a',
    'f_ever_bought_inv_e9f316', 'f_ever_bought_grp_caa6ff', 'f_ever_bought_grp_fd3bfb', 'f_ever_bought_lh_e22a6a', 'f_ever_bought_grp_70e1dd',
    'f_ever_bought_grp_e04c3a', 'f_ever_bought_grp_fe5fb8', 'f_ever_bought_grp_94baec', 'f_ever_bought_grp_e91421', 'f_ever_bought_lh_f852af',
    'f_ever_bought_lh_947b15', 'f_ever_bought_32c74c', 'n_months_last_bought_ltc_1280bf', 'n_months_last_bought_grp_6fc3e6', 'n_months_last_bought_grp_de05ae',
    'n_months_last_bought_inv_dcd836', 'n_months_last_bought_grp_945b5a', 'n_months_last_bought_grp_6a5788', 'n_months_last_bought_ltc_43b9d5',
    'n_months_last_bought_grp_9cdedf', 'n_months_last_bought_lh_d0adeb', 'n_months_last_bought_grp_1581d7', 'n_months_last_bought_grp_22decf',
    'n_months_last_bought_lh_507c37', 'n_months_last_bought_lh_839f8a', 'n_months_last_bought_inv_e9f316',
    'n_months_last_bought_grp_caa6ff', 'n_months_last_bought_grp_fd3bfb', 'n_months_last_bought_lh_e22a6a', 'n_months_last_bought_grp_70e1dd',
    'n_months_last_bought_grp_e04c3a', 'n_months_last_bought_grp_fe5fb8', 'n_months_last_bought_grp_94baec', 'n_months_last_bought_grp_e91421',
    'n_months_last_bought_lh_f852af', 'n_months_last_bought_lh_947b15', 'n_months_last_bought_32c74c', 'f_elx', 'f_mindef_mha', 'f_retail',
    'ape_32c74c', 'prempaid_grp_22decf', 'f_ever_bought_839f8a', 'f_ever_bought_e22a6a', 'f_ever_bought_d0adeb', 'f_ever_bought_c4bda5',
    'f_ever_bought_ltc', 'f_ever_bought_507c37', 'f_ever_bought_gi',
]

In [None]:
# Convert Decimal objects and numeric strings to plain Python floats, one column at a time.
for col in decimal_columns:
    data[col] = [float(value) for value in data[col]]

In [None]:
#One Hot Encoding for nominal data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Nominal columns that are expanded into indicator columns.
processed_columns = [
    'race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'cltsex_fix',
    'flg_substandard', 'flg_is_borderline_standard', 'flg_is_revised_term', 'flg_is_rental_flat',
    'flg_has_health_claim', 'flg_has_life_claim', 'flg_gi_claim', 'flg_is_proposal',
    'flg_with_preauthorisation', 'flg_is_returned_mail', 'is_consent_to_mail', 'is_consent_to_email',
    'is_consent_to_call', 'is_consent_to_sms', 'is_valid_dm', 'is_valid_email', 'is_housewife_retiree',
    'is_sg_pr', 'is_class_1_2', 'is_dependent_in_at_least_1_policy', 'hh_size_est', 'annual_income_est',
]

# drop='first' omits one indicator per encoded column; all other columns pass through unchanged.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, processed_columns)],
    remainder='passthrough',
)
data_encoded = preprocessor.fit_transform(data)

# Rebuild a DataFrame labelled with the output names generated by the transformer.
data = pd.DataFrame(data_encoded, columns=preprocessor.get_feature_names_out())

In [None]:
# Columns of the encoded frame whose dtype is exactly float64 or int64; these are median-imputed below.
num_data = [col for col in data.columns if data[col].dtype in ['float64', 'int64']]

In [None]:
# Fill the remaining gaps of each numeric column with that column's median.
for col in num_data:
    median_value = data[col].median()  # median of the column's non-missing values
    data[col] = data[col].fillna(median_value)

In [None]:
#Feature selection to reduce number of features
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

X = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Fit a baseline forest on the training split and read off its per-feature importances.
rf_classifier_0 = RandomForestClassifier(n_estimators=100, random_state=10)
rf_classifier_0.fit(X_train, y_train)
feature_importances = rf_classifier_0.feature_importances_

# One row per feature, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importances')
plt.show()

# Keep only the features whose importance is above the cut-off.
threshold = 0.001
is_selected = feature_importance_df['Importance'] > threshold
selected_features = feature_importance_df.loc[is_selected, 'Feature']

X_selected = X[selected_features]

In [None]:
#Showing imbalance of the classes
import matplotlib.pyplot as plt

# Share of rows labelled 1 and labelled 0, relative to the number of rows in `data`.
n_rows = len(data)
positive_ratio = (y == 1).sum() / n_rows
negative_ratio = (y == 0).sum() / n_rows
labels = ['Positive', 'Negative']
ratios = [positive_ratio, negative_ratio]

plt.bar(labels, ratios, color=['green', 'red'])
plt.title('Positive/Negative Ratio')
plt.xlabel('Category')
plt.ylabel('Ratio')
plt.show()

# Print the positive share first, then the negative share.
for ratio in ratios:
    print(ratio)

In [None]:
#Dropping columns with no variance
X = X_selected
# A standard deviation of exactly 0 marks a constant column.
column_std = X.std()
zero_std_columns = column_std[column_std == 0].index
X = X.drop(columns=zero_std_columns)

In [None]:
#Standardising the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# The scaled values are wrapped in a new DataFrame without passing column names.
X = pd.DataFrame(scaler.fit_transform(X))

In [None]:
#Splitting the dataset into train, test and validation sets
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)
# 25% of the remaining 80% (20% of all rows) becomes the validation set, leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=10
)

In [None]:
#Undersampling the majority class to 90:10
from imblearn.under_sampling import RandomUnderSampler

# A float sampling_strategy is the minority/majority ratio after resampling (1/9 -> 10% positives).
under_sample = RandomUnderSampler(sampling_strategy = 1/9, random_state = 10)
X_train_sampled, y_train_sampled = under_sample.fit_resample(X_train, y_train)

# Class shares in the undersampled training set.
n_sampled = len(y_train_sampled)
count_1 = (y_train_sampled == 1).sum()/n_sampled
count_0 = (y_train_sampled == 0).sum()/n_sampled
labels = ['Positive', 'Negative']
ratios = [count_1, count_0]

plt.bar(labels, ratios, color=['green', 'red'])
plt.title('Positive/Negative Ratio after undersampling')
plt.xlabel('Category')
plt.ylabel('Ratio')
plt.show()

# Print the positive share first, then the negative share.
for ratio in ratios:
    print(ratio)

In [None]:
#SMOTE the minority class
from imblearn.over_sampling import SMOTE

# Oversample positives up to a minority/majority ratio of 1.1/8.9 (about 11% positives).
smote = SMOTE(sampling_strategy = 1.1/8.9, random_state = 10)
X_train_smote, y_train_smote = smote.fit_resample(X_train_sampled, y_train_sampled)

# Class shares in the oversampled training set.
n_smote = len(y_train_smote)
count_1_smote = (y_train_smote == 1).sum()/n_smote
count_0_smote = (y_train_smote == 0).sum()/n_smote
labels = ['Positive', 'Negative']
ratios = [count_1_smote, count_0_smote]

plt.bar(labels, ratios, color=['green', 'red'])
plt.title('Positive/Negative Ratio after SMOTE')
plt.xlabel('Category')
plt.ylabel('Ratio')
plt.show()

# Print the positive share first, then the negative share.
for ratio in ratios:
    print(ratio)

In [None]:
#Random Forest
# Import every sklearn name this cell uses so that it runs on a fresh kernel.
# GridSearchCV and the report/matrix/F1 helpers were not imported by any earlier cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored exhaustively with 5-fold cross-validation, scored by F1.
param_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}
rf_classifier = RandomForestClassifier(class_weight='balanced',random_state=10)
grid_search_rf = GridSearchCV(estimator=rf_classifier, param_grid=param_rf, cv=5, scoring='f1')
grid_search_rf.fit(X_train_smote, y_train_smote)
best_params_rf = grid_search_rf.best_params_
best_estimator_rf = grid_search_rf.best_estimator_
print("Best Parameters:", best_params_rf)

# Evaluate the tuned model on the held-out test split.
y_pred_rf = best_estimator_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
f1_rf = f1_score(y_test,y_pred_rf)
print("f1_score:", f1_rf)

In [None]:
def train_model():
    """Rebuild the whole training pipeline from the parquet file and return the tuned model.

    Steps, in order: load ``./data/catB_train.parquet``; split off the target
    ``f_purchase_lh`` (missing labels become 0); drop the unused columns and every
    column with more than 50% missing values; mode-impute the non-float64/int64
    columns and convert the listed numeric ones to float; one-hot encode the
    nominal columns; median-impute the numeric columns; keep the features whose
    random-forest importance exceeds 0.001; standardise; carve out the training
    split; rebalance it (random undersampling followed by SMOTE); run a 5-fold
    grid search over random-forest hyper-parameters scored by F1.

    Returns:
        The ``best_estimator_`` found by the grid search.

    NOTE(review): the fitted encoder, the selected feature names and the scaler
    are local to this function and are not returned, so a caller has to
    reproduce the same preprocessing before calling ``predict``.
    """
    import pandas as pd
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    filepath = "./data/catB_train.parquet"
    data = pd.read_parquet(filepath)

    # Taking in the data and removing unimportant columns and target column, target column assigned to y
    y = data['f_purchase_lh'].fillna(0)
    data = data.drop(columns=['f_purchase_lh', 'clntnum', 'min_occ_date', 'cltdob_fix'])

    # Deleting data columns with more than 50% missing data
    thresh = 50
    missing_pct = data.isna().sum() / data.shape[0] * 100
    data = data.drop(columns=missing_pct[missing_pct > thresh].index)

    # Imputing for categorical data (every column that is not float64/int64) with the first mode
    cate_data = [col for col in data.columns if data[col].dtype not in ['float64', 'int64']]
    for col in cate_data:
        data[col] = data[col].fillna(data[col].mode().iloc[0])

    # manually filtered list of numerical values classified as categorical data
    decimal_columns = [
        'hh_20', 'pop_20', 'hh_size', 'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 'tot_inforce_pols',
        'ape_gi_42e115', 'ape_ltc_1280bf', 'ape_grp_6fc3e6', 'ape_grp_de05ae', 'ape_inv_dcd836', 'ape_grp_945b5a',
        'ape_grp_6a5788', 'ape_ltc_43b9d5', 'ape_grp_9cdedf', 'ape_lh_d0adeb', 'ape_grp_1581d7', 'ape_grp_22decf',
        'ape_lh_507c37', 'ape_lh_839f8a', 'ape_inv_e9f316', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_grp_caa6ff',
        'ape_grp_fd3bfb', 'ape_lh_e22a6a', 'ape_grp_70e1dd', 'ape_grp_e04c3a', 'ape_grp_fe5fb8', 'ape_gi_856320',
        'ape_grp_94baec', 'ape_gi_058815', 'ape_grp_e91421', 'ape_lh_f852af', 'ape_lh_947b15', 'sumins_gi_42e115',
        'sumins_ltc_1280bf', 'sumins_grp_6fc3e6', 'sumins_grp_de05ae', 'sumins_inv_dcd836', 'sumins_grp_945b5a', 'sumins_grp_6a5788',
        'sumins_ltc_43b9d5', 'sumins_grp_9cdedf', 'sumins_lh_d0adeb', 'sumins_grp_1581d7', 'sumins_grp_22decf', 'sumins_lh_507c37',
        'sumins_inv_e9f316', 'sumins_gi_a10d1b', 'sumins_gi_29d435', 'sumins_grp_caa6ff', 'sumins_grp_fd3bfb', 'sumins_lh_e22a6a',
        'sumins_grp_70e1dd', 'sumins_grp_e04c3a', 'sumins_grp_fe5fb8', 'sumins_gi_856320', 'sumins_grp_94baec', 'sumins_gi_058815',
        'sumins_grp_e91421', 'sumins_lh_f852af', 'sumins_lh_947b15', 'sumins_32c74c', 'prempaid_gi_42e115', 'prempaid_ltc_1280bf',
        'prempaid_grp_6fc3e6', 'prempaid_grp_de05ae', 'prempaid_inv_dcd836', 'prempaid_grp_945b5a', 'prempaid_grp_6a5788',
        'prempaid_ltc_43b9d5', 'prempaid_grp_9cdedf', 'prempaid_lh_d0adeb', 'prempaid_grp_1581d7', 'prempaid_lh_507c37', 'prempaid_lh_839f8a',
        'prempaid_inv_e9f316', 'prempaid_gi_a10d1b', 'prempaid_gi_29d435', 'prempaid_grp_caa6ff', 'prempaid_grp_fd3bfb', 'prempaid_lh_e22a6a',
        'prempaid_grp_70e1dd', 'prempaid_grp_e04c3a', 'prempaid_grp_fe5fb8', 'prempaid_gi_856320', 'prempaid_grp_94baec',
        'prempaid_gi_058815', 'prempaid_grp_e91421', 'prempaid_lh_f852af', 'prempaid_lh_947b15', 'prempaid_32c74c',
        'ape_839f8a', 'ape_e22a6a', 'ape_d0adeb', 'ape_c4bda5', 'ape_ltc', 'ape_507c37', 'ape_gi', 'f_hold_839f8a',
        'f_hold_e22a6a', 'f_hold_d0adeb', 'f_hold_c4bda5', 'f_hold_ltc', 'f_hold_507c37', 'f_hold_gi', 'sumins_839f8a',
        'sumins_e22a6a', 'sumins_d0adeb', 'sumins_c4bda5', 'sumins_ltc', 'sumins_507c37', 'sumins_gi', 'prempaid_839f8a', 'prempaid_e22a6a',
        'prempaid_d0adeb', 'prempaid_c4bda5', 'prempaid_ltc', 'prempaid_507c37', 'prempaid_gi', 'n_months_last_bought_839f8a',
        'n_months_last_bought_e22a6a', 'n_months_last_bought_d0adeb', 'n_months_last_bought_c4bda5', 'n_months_last_bought_ltc',
        'n_months_last_bought_507c37', 'n_months_last_bought_gi', 'f_ever_bought_ltc_1280bf', 'f_ever_bought_grp_6fc3e6', 'f_ever_bought_grp_de05ae',
        'f_ever_bought_inv_dcd836', 'f_ever_bought_grp_945b5a', 'f_ever_bought_grp_6a5788', 'f_ever_bought_ltc_43b9d5', 'f_ever_bought_grp_9cdedf',
        'f_ever_bought_lh_d0adeb', 'f_ever_bought_grp_1581d7', 'f_ever_bought_grp_22decf', 'f_ever_bought_lh_507c37', 'f_ever_bought_lh_839f8a',
        'f_ever_bought_inv_e9f316', 'f_ever_bought_grp_caa6ff', 'f_ever_bought_grp_fd3bfb', 'f_ever_bought_lh_e22a6a', 'f_ever_bought_grp_70e1dd',
        'f_ever_bought_grp_e04c3a', 'f_ever_bought_grp_fe5fb8', 'f_ever_bought_grp_94baec', 'f_ever_bought_grp_e91421', 'f_ever_bought_lh_f852af',
        'f_ever_bought_lh_947b15', 'f_ever_bought_32c74c', 'n_months_last_bought_ltc_1280bf', 'n_months_last_bought_grp_6fc3e6', 'n_months_last_bought_grp_de05ae',
        'n_months_last_bought_inv_dcd836', 'n_months_last_bought_grp_945b5a', 'n_months_last_bought_grp_6a5788', 'n_months_last_bought_ltc_43b9d5',
        'n_months_last_bought_grp_9cdedf', 'n_months_last_bought_lh_d0adeb', 'n_months_last_bought_grp_1581d7', 'n_months_last_bought_grp_22decf',
        'n_months_last_bought_lh_507c37', 'n_months_last_bought_lh_839f8a', 'n_months_last_bought_inv_e9f316',
        'n_months_last_bought_grp_caa6ff', 'n_months_last_bought_grp_fd3bfb', 'n_months_last_bought_lh_e22a6a', 'n_months_last_bought_grp_70e1dd',
        'n_months_last_bought_grp_e04c3a', 'n_months_last_bought_grp_fe5fb8', 'n_months_last_bought_grp_94baec', 'n_months_last_bought_grp_e91421',
        'n_months_last_bought_lh_f852af', 'n_months_last_bought_lh_947b15', 'n_months_last_bought_32c74c', 'f_elx', 'f_mindef_mha', 'f_retail',
        'ape_32c74c', 'prempaid_grp_22decf', 'f_ever_bought_839f8a', 'f_ever_bought_e22a6a', 'f_ever_bought_d0adeb', 'f_ever_bought_c4bda5',
        'f_ever_bought_ltc', 'f_ever_bought_507c37', 'f_ever_bought_gi',
    ]

    # Formatting data with Decimal('') to float and other string representations of numbers into floats
    for col in decimal_columns:
        data[col] = [float(value) for value in data[col]]

    # One Hot Encoding for nominal data; all other columns pass through unchanged
    processed_columns = [
        'race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'cltsex_fix',
        'flg_substandard', 'flg_is_borderline_standard', 'flg_is_revised_term', 'flg_is_rental_flat',
        'flg_has_health_claim', 'flg_has_life_claim', 'flg_gi_claim', 'flg_is_proposal',
        'flg_with_preauthorisation', 'flg_is_returned_mail', 'is_consent_to_mail', 'is_consent_to_email',
        'is_consent_to_call', 'is_consent_to_sms', 'is_valid_dm', 'is_valid_email', 'is_housewife_retiree',
        'is_sg_pr', 'is_class_1_2', 'is_dependent_in_at_least_1_policy', 'hh_size_est', 'annual_income_est',
    ]
    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(sparse_output=False, drop='first'), processed_columns)],
        remainder='passthrough',
    )
    data_encoded = preprocessor.fit_transform(data)
    data = pd.DataFrame(data_encoded, columns=preprocessor.get_feature_names_out())

    # Imputing for numerical data
    num_data = [col for col in data.columns if data[col].dtype in ['float64', 'int64']]
    for col in num_data:
        data[col] = data[col].fillna(data[col].median())

    # Feature selection to reduce number of features: rank the features with a baseline
    # forest fitted on an 80% split, then keep those above the importance cut-off.
    X = data
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=10)
    rf_classifier_0 = RandomForestClassifier(n_estimators=100, random_state=10)
    rf_classifier_0.fit(X_train, y_train)
    feature_importance_df = pd.DataFrame(
        {'Feature': X.columns, 'Importance': rf_classifier_0.feature_importances_}
    ).sort_values(by='Importance', ascending=False)
    threshold = 0.001
    selected_features = feature_importance_df.loc[feature_importance_df['Importance'] > threshold, 'Feature']
    X = X[selected_features]

    # Dropping columns with no variance
    X = X.drop(columns=X.columns[X.std() == 0])

    # Standardising the data (the rebuilt frame is given no column names)
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X))

    # Splitting the dataset: only the training part is used here. Both splits are kept so
    # the training rows are the same as with the train/validation/test split.
    X_train_temp, _, y_train_temp, _ = train_test_split(X, y, test_size=0.2, random_state=10)
    X_train, _, y_train, _ = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=10)

    # Undersampling the majority class
    under_sample = RandomUnderSampler(sampling_strategy = 1/9, random_state = 10)
    X_train_sampled, y_train_sampled = under_sample.fit_resample(X_train, y_train)

    # SMOTE the minority class
    smote = SMOTE(sampling_strategy = 1.1/8.9, random_state = 10)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_sampled, y_train_sampled)

    # Random Forest Classifier tuned by exhaustive grid search (5-fold CV, F1 score)
    param_rf = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]}
    rf_classifier = RandomForestClassifier(class_weight='balanced',random_state=10)
    grid_search_rf = GridSearchCV(estimator=rf_classifier, param_grid=param_rf, cv=5, scoring='f1')
    grid_search_rf.fit(X_train_smote, y_train_smote)

    return grid_search_rf.best_estimator_

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data passed into the function below will have the same column layout as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

    Preprocess ``hidden_data`` and return the classifier's predictions for it.

    The function should be coded to test on hidden data and includes every
    preprocessing step the model needs. All relevant code MUST be included
    in this function.

    Pipeline, in order:
      1. drop the identifier / date columns;
      2. drop columns with more than 50% missing values;
      3. mode-impute every column whose dtype is not float64/int64;
      4. cast the columns listed in ``decimal_columns`` to float;
      5. one-hot encode ``processed_columns`` (first level dropped);
      6. median-impute float64/int64 columns;
      7. drop the hard-coded ``not_selected`` features and zero-variance columns;
      8. standardise, then predict with the estimator returned by ``train_model()``.

    NOTE(review): every statistic above (missingness, modes, one-hot
    categories, medians, zero-variance columns, scaler mean/std) is computed
    from ``hidden_data`` itself, not from the training set. If the hidden
    frame lacks a category or missing-value pattern present in training,
    the ``not_selected`` drop may raise ``KeyError`` or the feature count may
    not match the model -- confirm against the training pipeline.

    Parameters
    ----------
    hidden_data : pd.DataFrame
        Frame with the same column layout as the training parquet file,
        without the target column. The caller's frame is not modified.

    Returns
    -------
    list
        One predicted class label per input row.
    '''
    # Imports are kept local because the whole pipeline has to live in this function.
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler

    # --- 1. Drop identifier / date columns (drop() returns a new frame) ---
    hidden_data = hidden_data.drop(columns=['clntnum', 'min_occ_date', 'cltdob_fix'])

    # --- 2. Drop columns with more than `thresh` percent missing values ---
    thresh = 50
    n_rows = hidden_data.shape[0]
    percentage = [col for col in hidden_data
                  if hidden_data[col].isna().sum() / n_rows * 100 > thresh]
    hidden_data = hidden_data.drop(percentage, axis=1)

    # --- 3. Mode-impute every column that is not float64 / int64 ---
    cate_data = [col for col in hidden_data.columns
                 if hidden_data[col].dtype not in ['float64', 'int64']]
    for col in cate_data:
        # mode() can return several values; iloc[0] takes the first one.
        mode_value = hidden_data[col].mode().iloc[0]
        hidden_data[col] = hidden_data[col].fillna(mode_value)

    # --- 4. Cast the listed columns to float ---
    decimal_columns = [
        'hh_20', 'pop_20', 'hh_size', 'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 'tot_inforce_pols',
        'ape_gi_42e115', 'ape_ltc_1280bf', 'ape_grp_6fc3e6', 'ape_grp_de05ae', 'ape_inv_dcd836', 'ape_grp_945b5a',
        'ape_grp_6a5788', 'ape_ltc_43b9d5', 'ape_grp_9cdedf', 'ape_lh_d0adeb', 'ape_grp_1581d7', 'ape_grp_22decf',
        'ape_lh_507c37', 'ape_lh_839f8a', 'ape_inv_e9f316', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_grp_caa6ff',
        'ape_grp_fd3bfb', 'ape_lh_e22a6a', 'ape_grp_70e1dd', 'ape_grp_e04c3a', 'ape_grp_fe5fb8', 'ape_gi_856320',
        'ape_grp_94baec', 'ape_gi_058815', 'ape_grp_e91421', 'ape_lh_f852af', 'ape_lh_947b15', 'sumins_gi_42e115',
        'sumins_ltc_1280bf', 'sumins_grp_6fc3e6', 'sumins_grp_de05ae', 'sumins_inv_dcd836', 'sumins_grp_945b5a', 'sumins_grp_6a5788',
        'sumins_ltc_43b9d5', 'sumins_grp_9cdedf', 'sumins_lh_d0adeb', 'sumins_grp_1581d7', 'sumins_grp_22decf', 'sumins_lh_507c37',
        'sumins_inv_e9f316', 'sumins_gi_a10d1b', 'sumins_gi_29d435', 'sumins_grp_caa6ff', 'sumins_grp_fd3bfb', 'sumins_lh_e22a6a',
        'sumins_grp_70e1dd', 'sumins_grp_e04c3a', 'sumins_grp_fe5fb8', 'sumins_gi_856320', 'sumins_grp_94baec', 'sumins_gi_058815',
        'sumins_grp_e91421', 'sumins_lh_f852af', 'sumins_lh_947b15', 'sumins_32c74c', 'prempaid_gi_42e115', 'prempaid_ltc_1280bf',
        'prempaid_grp_6fc3e6', 'prempaid_grp_de05ae', 'prempaid_inv_dcd836', 'prempaid_grp_945b5a', 'prempaid_grp_6a5788',
        'prempaid_ltc_43b9d5', 'prempaid_grp_9cdedf', 'prempaid_lh_d0adeb', 'prempaid_grp_1581d7', 'prempaid_lh_507c37', 'prempaid_lh_839f8a',
        'prempaid_inv_e9f316', 'prempaid_gi_a10d1b', 'prempaid_gi_29d435', 'prempaid_grp_caa6ff', 'prempaid_grp_fd3bfb', 'prempaid_lh_e22a6a',
        'prempaid_grp_70e1dd', 'prempaid_grp_e04c3a', 'prempaid_grp_fe5fb8', 'prempaid_gi_856320', 'prempaid_grp_94baec',
        'prempaid_gi_058815', 'prempaid_grp_e91421', 'prempaid_lh_f852af', 'prempaid_lh_947b15', 'prempaid_32c74c',
        'ape_839f8a', 'ape_e22a6a', 'ape_d0adeb', 'ape_c4bda5', 'ape_ltc', 'ape_507c37', 'ape_gi', 'f_hold_839f8a',
        'f_hold_e22a6a', 'f_hold_d0adeb', 'f_hold_c4bda5', 'f_hold_ltc', 'f_hold_507c37', 'f_hold_gi', 'sumins_839f8a',
        'sumins_e22a6a', 'sumins_d0adeb', 'sumins_c4bda5', 'sumins_ltc', 'sumins_507c37', 'sumins_gi', 'prempaid_839f8a', 'prempaid_e22a6a',
        'prempaid_d0adeb', 'prempaid_c4bda5', 'prempaid_ltc', 'prempaid_507c37', 'prempaid_gi', 'n_months_last_bought_839f8a',
        'n_months_last_bought_e22a6a', 'n_months_last_bought_d0adeb', 'n_months_last_bought_c4bda5', 'n_months_last_bought_ltc',
        'n_months_last_bought_507c37', 'n_months_last_bought_gi', 'f_ever_bought_ltc_1280bf', 'f_ever_bought_grp_6fc3e6', 'f_ever_bought_grp_de05ae',
        'f_ever_bought_inv_dcd836', 'f_ever_bought_grp_945b5a', 'f_ever_bought_grp_6a5788', 'f_ever_bought_ltc_43b9d5', 'f_ever_bought_grp_9cdedf',
        'f_ever_bought_lh_d0adeb', 'f_ever_bought_grp_1581d7', 'f_ever_bought_grp_22decf', 'f_ever_bought_lh_507c37', 'f_ever_bought_lh_839f8a',
        'f_ever_bought_inv_e9f316', 'f_ever_bought_grp_caa6ff', 'f_ever_bought_grp_fd3bfb', 'f_ever_bought_lh_e22a6a', 'f_ever_bought_grp_70e1dd',
        'f_ever_bought_grp_e04c3a', 'f_ever_bought_grp_fe5fb8', 'f_ever_bought_grp_94baec', 'f_ever_bought_grp_e91421', 'f_ever_bought_lh_f852af',
        'f_ever_bought_lh_947b15', 'f_ever_bought_32c74c', 'n_months_last_bought_ltc_1280bf', 'n_months_last_bought_grp_6fc3e6', 'n_months_last_bought_grp_de05ae',
        'n_months_last_bought_inv_dcd836', 'n_months_last_bought_grp_945b5a', 'n_months_last_bought_grp_6a5788', 'n_months_last_bought_ltc_43b9d5',
        'n_months_last_bought_grp_9cdedf', 'n_months_last_bought_lh_d0adeb', 'n_months_last_bought_grp_1581d7', 'n_months_last_bought_grp_22decf',
        'n_months_last_bought_lh_507c37', 'n_months_last_bought_lh_839f8a', 'n_months_last_bought_inv_e9f316',
        'n_months_last_bought_grp_caa6ff', 'n_months_last_bought_grp_fd3bfb', 'n_months_last_bought_lh_e22a6a', 'n_months_last_bought_grp_70e1dd',
        'n_months_last_bought_grp_e04c3a', 'n_months_last_bought_grp_fe5fb8', 'n_months_last_bought_grp_94baec', 'n_months_last_bought_grp_e91421',
        'n_months_last_bought_lh_f852af', 'n_months_last_bought_lh_947b15', 'n_months_last_bought_32c74c', 'f_elx', 'f_mindef_mha', 'f_retail',
        'ape_32c74c', 'prempaid_grp_22decf', 'f_ever_bought_839f8a', 'f_ever_bought_e22a6a', 'f_ever_bought_d0adeb', 'f_ever_bought_c4bda5',
        'f_ever_bought_ltc', 'f_ever_bought_507c37', 'f_ever_bought_gi',
    ]
    for col in decimal_columns:
        hidden_data[col] = list(map(float, hidden_data[col]))

    # --- 5. One-hot encode the categorical / flag columns ---
    processed_columns = [
        'race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'cltsex_fix',
        'flg_substandard', 'flg_is_borderline_standard', 'flg_is_revised_term', 'flg_is_rental_flat',
        'flg_has_health_claim', 'flg_has_life_claim', 'flg_gi_claim', 'flg_is_proposal',
        'flg_with_preauthorisation', 'flg_is_returned_mail', 'is_consent_to_mail', 'is_consent_to_email',
        'is_consent_to_call', 'is_consent_to_sms', 'is_valid_dm', 'is_valid_email', 'is_housewife_retiree',
        'is_sg_pr', 'is_class_1_2', 'is_dependent_in_at_least_1_policy', 'hh_size_est', 'annual_income_est',
    ]
    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(sparse_output=False, drop='first'), processed_columns)],
        remainder='passthrough',
    )
    data_encoded = preprocessor.fit_transform(hidden_data)
    # Column names become 'cat__<col>_<level>' / 'remainder__<col>'.
    hidden_data = pd.DataFrame(data_encoded, columns=preprocessor.get_feature_names_out())

    # --- 6. Median-impute the float64 / int64 columns ---
    num_data = [col for col in hidden_data.columns
                if hidden_data[col].dtype in ['float64', 'int64']]
    for col in num_data:
        median_value = hidden_data[col].median()
        hidden_data[col] = hidden_data[col].fillna(median_value)

    # --- 7. Drop the hard-coded excluded features, then constant columns ---
    not_selected = [
        'cat__ctrycode_desc_Bosnia-Herzegovina', 'cat__ctrycode_desc_Brunei Darussalam',
        'cat__ctrycode_desc_Canada', 'cat__ctrycode_desc_China', 'cat__ctrycode_desc_Denmark',
        'cat__ctrycode_desc_Hong Kong', 'cat__ctrycode_desc_Indonesia', 'cat__ctrycode_desc_Ireland',
        'cat__ctrycode_desc_Italy', 'cat__ctrycode_desc_Japan', 'cat__ctrycode_desc_Malaysia',
        'cat__ctrycode_desc_Netherlands', 'cat__ctrycode_desc_New Zealand',
        'cat__ctrycode_desc_Not Applicable', 'cat__ctrycode_desc_Philippines',
        'cat__ctrycode_desc_Singapore', 'cat__ctrycode_desc_South Africa', 'cat__ctrycode_desc_Spain',
        'cat__ctrycode_desc_Sweden', 'cat__ctrycode_desc_Taiwan (R.O.C)', 'cat__ctrycode_desc_Thailand',
        'cat__ctrycode_desc_United Arab Emirates', 'cat__ctrycode_desc_United Kingdom',
        'cat__ctrycode_desc_United States', 'cat__ctrycode_desc_Unknown Country Code',
        'cat__stat_flag_MATURED', 'cat__flg_substandard_nan', 'cat__flg_is_borderline_standard_nan',
        'cat__flg_is_revised_term_1.0', 'cat__flg_is_revised_term_nan', 'cat__flg_is_rental_flat_1.0',
        'cat__flg_is_rental_flat_nan', 'cat__flg_has_health_claim_nan', 'cat__flg_has_life_claim_1.0',
        'cat__flg_has_life_claim_nan', 'cat__flg_gi_claim_nan', 'cat__flg_is_proposal_nan',
        'cat__flg_with_preauthorisation_1.0', 'cat__flg_with_preauthorisation_nan',
        'cat__flg_is_returned_mail_1.0', 'cat__flg_is_returned_mail_nan', 'cat__is_consent_to_mail_nan',
        'cat__is_consent_to_email_nan', 'cat__is_consent_to_call_nan', 'cat__is_consent_to_sms_nan',
        'cat__is_valid_dm_nan', 'cat__is_valid_email_nan', 'cat__is_housewife_retiree_nan',
        'cat__is_sg_pr_nan', 'cat__is_class_1_2_nan', 'cat__is_dependent_in_at_least_1_policy_nan',
        'remainder__ape_gi_42e115', 'remainder__ape_ltc_1280bf', 'remainder__ape_grp_de05ae',
        'remainder__ape_inv_dcd836', 'remainder__ape_grp_6a5788', 'remainder__ape_lh_d0adeb',
        'remainder__ape_inv_e9f316', 'remainder__ape_gi_a10d1b', 'remainder__ape_gi_29d435',
        'remainder__ape_grp_fd3bfb', 'remainder__ape_grp_e04c3a',
        'remainder__ape_gi_856320', 'remainder__ape_gi_058815', 'remainder__ape_32c74c',
        'remainder__sumins_gi_42e115', 'remainder__sumins_ltc_1280bf', 'remainder__sumins_grp_de05ae',
        'remainder__sumins_inv_dcd836', 'remainder__sumins_grp_6a5788', 'remainder__sumins_lh_d0adeb',
        'remainder__sumins_grp_22decf', 'remainder__sumins_inv_e9f316', 'remainder__sumins_gi_a10d1b',
        'remainder__sumins_gi_29d435', 'remainder__sumins_grp_fd3bfb', 'remainder__sumins_lh_e22a6a',
        'remainder__sumins_grp_e04c3a', 'remainder__sumins_grp_fe5fb8', 'remainder__sumins_gi_856320',
        'remainder__sumins_grp_94baec', 'remainder__sumins_gi_058815', 'remainder__sumins_32c74c',
        'remainder__prempaid_gi_42e115', 'remainder__prempaid_ltc_1280bf', 'remainder__prempaid_grp_de05ae',
        'remainder__prempaid_inv_dcd836', 'remainder__prempaid_grp_6a5788', 'remainder__prempaid_lh_d0adeb',
        'remainder__prempaid_grp_22decf', 'remainder__prempaid_inv_e9f316', 'remainder__prempaid_gi_a10d1b',
        'remainder__prempaid_gi_29d435', 'remainder__prempaid_grp_fd3bfb', 'remainder__prempaid_grp_e04c3a',
        'remainder__prempaid_gi_856320', 'remainder__prempaid_gi_058815', 'remainder__prempaid_32c74c',
        'remainder__ape_d0adeb', 'remainder__ape_c4bda5', 'remainder__ape_gi',
        'remainder__f_hold_d0adeb', 'remainder__f_hold_c4bda5', 'remainder__f_hold_gi',
        'remainder__sumins_e22a6a', 'remainder__sumins_d0adeb', 'remainder__sumins_c4bda5',
        'remainder__sumins_gi', 'remainder__prempaid_d0adeb', 'remainder__prempaid_c4bda5',
        'remainder__prempaid_gi', 'remainder__f_ever_bought_d0adeb', 'remainder__f_ever_bought_c4bda5',
        'remainder__n_months_last_bought_d0adeb', 'remainder__f_ever_bought_ltc_1280bf',
        'remainder__f_ever_bought_grp_6fc3e6', 'remainder__f_ever_bought_grp_de05ae',
        'remainder__f_ever_bought_inv_dcd836', 'remainder__f_ever_bought_grp_945b5a',
        'remainder__f_ever_bought_grp_6a5788', 'remainder__f_ever_bought_grp_9cdedf',
        'remainder__f_ever_bought_lh_d0adeb', 'remainder__f_ever_bought_grp_22decf',
        'remainder__f_ever_bought_inv_e9f316',
        'remainder__f_ever_bought_grp_fd3bfb', 'remainder__f_ever_bought_grp_e04c3a',
        'remainder__f_ever_bought_grp_e91421', 'remainder__f_ever_bought_32c74c',
        'remainder__n_months_last_bought_ltc_1280bf', 'remainder__n_months_last_bought_grp_de05ae',
        'remainder__n_months_last_bought_inv_dcd836', 'remainder__n_months_last_bought_grp_6a5788',
        'remainder__n_months_last_bought_lh_d0adeb', 'remainder__n_months_last_bought_inv_e9f316',
    ]
    hidden_data = hidden_data.drop(not_selected, axis=1)
    zero_std_columns = hidden_data.columns[hidden_data.std() == 0]
    hidden_data = hidden_data.drop(columns=zero_std_columns)

    # --- 8. Standardise and predict ---
    scaler = StandardScaler()
    hidden_data = scaler.fit_transform(hidden_data)
    hidden_data = pd.DataFrame(hidden_data)
    # train_model() is defined elsewhere in the notebook and is called on every
    # invocation; presumably it fits and returns the classifier -- TODO confirm.
    y_pred_rf = train_model().predict(hidden_data)
    # predict() yields a NumPy array; the contract of this function is a list.
    result = y_pred_rf.tolist()

    return result

##### Cell to check testing_hidden_data function

In [None]:
# Sanity check for testing_hidden_data: load the parquet file, remove the
# target column, and print the predictions the function returns.
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

### Please rename the file and ensure that it runs with all of the requirements above met. All the best!