##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
%pip install pandas 
%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [None]:
# Can have as many cells as you want for code
import pandas as pd
import sklearn
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [None]:
# Load the training set from the relative parquet path and preview its first rows
data = pd.read_parquet(filepath)
print(data.head())

In [None]:
# Replace missing values in the target column with 0
# NOTE(review): presumably a missing target means "no purchase" — confirm against the data dictionary
data['f_purchase_lh'] = data['f_purchase_lh'].fillna(0)

In [None]:
# Print the frame truncated to 5 rows, with every column shown and 3 digits of
# precision; the display options only apply inside the `with` block
display_options = ('display.max_rows', 5,
                   'display.max_columns', None,
                   'display.precision', 3)
with pd.option_context(*display_options):
    print(data)

In [None]:
# Show the dtype of every column
print(data.dtypes)

In [None]:
# Drop every column whose share of missing values exceeds `thresh` percent
thresh = 50
missing_pct = data.isna().mean() * 100
percentage = missing_pct[missing_pct > thresh].index.tolist()
# 'f_purchase_lh' does not need to be excluded from the list by hand:
# its missing values were already filled in an earlier cell
data.drop(columns=percentage, inplace=True)

In [None]:
# List the distinct dtypes that remain after dropping the sparse columns
print(data.dtypes.unique())

In [None]:
# Share of missing values (in percent) for stat_flag — the original author noted there are none
print(data['stat_flag'].isna().mean() * 100)
# Distinct values, shown as the cell output
data['stat_flag'].unique()

In [None]:
# Share of missing values (in percent) for clttype
print(data['clttype'].isna().mean() * 100)
# Distinct values, shown as the cell output
data['clttype'].unique()

In [None]:
# Share of missing values (in percent) for race_desc
print(data['race_desc'].isna().sum()/data.shape[0]*100)
# Printed explicitly: only the last expression of a cell is displayed, so a bare
# unique() in the middle of the cell would be discarded. unique() also lists NaN,
# which value_counts() leaves out by default.
print(data['race_desc'].unique())
counts = data['race_desc'].value_counts()
print(counts)

In [None]:
# Columns whose dtype is neither float64 nor int64 are handled as categorical
cate_data = [name for name, dtype in data.dtypes.items()
             if dtype not in ('float64', 'int64')]

In [None]:
# Fill the gaps of each categorical column with that column's most frequent value
# (when several values tie, the first mode returned by pandas is used)
for col in cate_data:
    column = data[col]
    data[col] = column.fillna(column.mode().iloc[0])
data

In [None]:
# Columns stored as float64 or int64 are handled as numerical
num_data = [name for name, dtype in data.dtypes.items()
            if dtype in ('float64', 'int64')]

In [None]:
'''
for col in num_data:
    median_value = data[col].median()  # Use iloc[0] to get the first mode in case of multiple modes
    data[col] = data[col].fillna(median_value)
data
'''


In [None]:
import copy

# Three independent deep copies of the cleaned frame
# (presumably reserved for the up-sampling, down-sampling and SMOTE experiments)
data_upsample, data_downsample, data_SMOTE = (data.copy(deep=True) for _ in range(3))

In [None]:
# Label encoding is used for the nominal categorical data

from sklearn import preprocessing

# Manually selected categorical columns that undergo label encoding
processed_columns = ['race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'cltsex_fix',
                     'flg_substandard', 'flg_is_borderline_standard', 'flg_is_revised_term', 'flg_is_rental_flat',
                     'flg_has_health_claim', 'flg_has_life_claim', 'flg_gi_claim', 'flg_is_proposal',
                     'flg_with_preauthorisation', 'flg_is_returned_mail', 'is_consent_to_mail', 'is_consent_to_email',
                     'is_consent_to_call', 'is_consent_to_sms', 'is_valid_dm', 'is_valid_email', 'is_housewife_retiree',
                     'is_sg_pr', 'is_class_1_2', 'is_dependent_in_at_least_1_policy', 'hh_size_est', 'annual_income_est']

# One fitted encoder is kept per column. Refitting a single shared encoder would
# throw away every mapping except the last one, making it impossible to apply the
# same category -> integer mapping to unseen data later on.
label_encoders = {}
for col in processed_columns:
    label_encoder = preprocessing.LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [None]:
# Manually curated list of columns that hold numbers but are not stored as floats
# (Decimal objects or string representations); they are converted to float later.
# Every column is listed exactly once.

decimal_columns = [
    'hh_20', 'pop_20', 'hh_size', 'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 'tot_inforce_pols',
    'ape_gi_42e115', 'ape_ltc_1280bf', 'ape_grp_6fc3e6', 'ape_grp_de05ae', 'ape_inv_dcd836', 'ape_grp_945b5a',
    'ape_grp_6a5788', 'ape_ltc_43b9d5', 'ape_grp_9cdedf', 'ape_lh_d0adeb', 'ape_grp_1581d7', 'ape_grp_22decf',
    'ape_lh_507c37', 'ape_lh_839f8a', 'ape_inv_e9f316', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_grp_caa6ff',
    'ape_grp_fd3bfb', 'ape_lh_e22a6a', 'ape_grp_70e1dd', 'ape_grp_e04c3a', 'ape_grp_fe5fb8', 'ape_gi_856320',
    'ape_grp_94baec', 'ape_gi_058815', 'ape_grp_e91421', 'ape_lh_f852af', 'ape_lh_947b15', 'sumins_gi_42e115',
    'sumins_ltc_1280bf', 'sumins_grp_6fc3e6', 'sumins_grp_de05ae', 'sumins_inv_dcd836', 'sumins_grp_945b5a', 'sumins_grp_6a5788',
    'sumins_ltc_43b9d5', 'sumins_grp_9cdedf', 'sumins_lh_d0adeb', 'sumins_grp_1581d7', 'sumins_grp_22decf', 'sumins_lh_507c37',
    'sumins_inv_e9f316', 'sumins_gi_a10d1b', 'sumins_gi_29d435', 'sumins_grp_caa6ff', 'sumins_grp_fd3bfb', 'sumins_lh_e22a6a',
    'sumins_grp_70e1dd', 'sumins_grp_e04c3a', 'sumins_grp_fe5fb8', 'sumins_gi_856320', 'sumins_grp_94baec', 'sumins_gi_058815',
    'sumins_grp_e91421', 'sumins_lh_f852af', 'sumins_lh_947b15', 'sumins_32c74c', 'prempaid_gi_42e115', 'prempaid_ltc_1280bf',
    'prempaid_grp_6fc3e6', 'prempaid_grp_de05ae', 'prempaid_inv_dcd836', 'prempaid_grp_945b5a', 'prempaid_grp_6a5788',
    'prempaid_ltc_43b9d5', 'prempaid_grp_9cdedf', 'prempaid_lh_d0adeb', 'prempaid_grp_1581d7', 'prempaid_lh_507c37', 'prempaid_lh_839f8a',
    'prempaid_inv_e9f316', 'prempaid_gi_a10d1b', 'prempaid_gi_29d435', 'prempaid_grp_caa6ff', 'prempaid_grp_fd3bfb', 'prempaid_lh_e22a6a',
    'prempaid_grp_70e1dd', 'prempaid_grp_e04c3a', 'prempaid_grp_fe5fb8', 'prempaid_gi_856320', 'prempaid_grp_94baec',
    'prempaid_gi_058815', 'prempaid_grp_e91421', 'prempaid_lh_f852af', 'prempaid_lh_947b15', 'prempaid_32c74c',
    'ape_839f8a', 'ape_e22a6a', 'ape_d0adeb', 'ape_c4bda5', 'ape_ltc', 'ape_507c37', 'ape_gi', 'f_hold_839f8a',
    'f_hold_e22a6a', 'f_hold_d0adeb', 'f_hold_c4bda5', 'f_hold_ltc', 'f_hold_507c37', 'f_hold_gi', 'sumins_839f8a',
    'sumins_e22a6a', 'sumins_d0adeb', 'sumins_c4bda5', 'sumins_ltc', 'sumins_507c37', 'sumins_gi', 'prempaid_839f8a', 'prempaid_e22a6a',
    'prempaid_d0adeb', 'prempaid_c4bda5', 'prempaid_ltc', 'prempaid_507c37', 'prempaid_gi', 'n_months_last_bought_839f8a',
    'n_months_last_bought_e22a6a', 'n_months_last_bought_d0adeb', 'n_months_last_bought_c4bda5', 'n_months_last_bought_ltc',
    'n_months_last_bought_507c37', 'n_months_last_bought_gi', 'f_ever_bought_ltc_1280bf', 'f_ever_bought_grp_6fc3e6', 'f_ever_bought_grp_de05ae',
    'f_ever_bought_inv_dcd836', 'f_ever_bought_grp_945b5a', 'f_ever_bought_grp_6a5788', 'f_ever_bought_ltc_43b9d5', 'f_ever_bought_grp_9cdedf',
    'f_ever_bought_lh_d0adeb', 'f_ever_bought_grp_1581d7', 'f_ever_bought_grp_22decf', 'f_ever_bought_lh_507c37', 'f_ever_bought_lh_839f8a',
    'f_ever_bought_inv_e9f316', 'f_ever_bought_grp_caa6ff', 'f_ever_bought_grp_fd3bfb', 'f_ever_bought_lh_e22a6a', 'f_ever_bought_grp_70e1dd',
    'f_ever_bought_grp_e04c3a', 'f_ever_bought_grp_fe5fb8', 'f_ever_bought_grp_94baec', 'f_ever_bought_grp_e91421', 'f_ever_bought_lh_f852af',
    'f_ever_bought_lh_947b15', 'f_ever_bought_32c74c', 'n_months_last_bought_ltc_1280bf', 'n_months_last_bought_grp_6fc3e6', 'n_months_last_bought_grp_de05ae',
    'n_months_last_bought_inv_dcd836', 'n_months_last_bought_grp_945b5a', 'n_months_last_bought_grp_6a5788', 'n_months_last_bought_ltc_43b9d5',
    'n_months_last_bought_grp_9cdedf', 'n_months_last_bought_lh_d0adeb', 'n_months_last_bought_grp_1581d7', 'n_months_last_bought_grp_22decf',
    'n_months_last_bought_lh_507c37', 'n_months_last_bought_lh_839f8a', 'n_months_last_bought_inv_e9f316',
    'n_months_last_bought_grp_caa6ff', 'n_months_last_bought_grp_fd3bfb', 'n_months_last_bought_lh_e22a6a', 'n_months_last_bought_grp_70e1dd',
    'n_months_last_bought_grp_e04c3a', 'n_months_last_bought_grp_fe5fb8', 'n_months_last_bought_grp_94baec', 'n_months_last_bought_grp_e91421',
    'n_months_last_bought_lh_f852af', 'n_months_last_bought_lh_947b15', 'n_months_last_bought_32c74c', 'f_elx', 'f_mindef_mha', 'f_retail',
    'ape_32c74c', 'prempaid_grp_22decf', 'f_ever_bought_839f8a', 'f_ever_bought_e22a6a', 'f_ever_bought_d0adeb', 'f_ever_bought_c4bda5',
    'f_ever_bought_ltc', 'f_ever_bought_507c37', 'f_ever_bought_gi',
]

In [None]:
# Convert Decimal objects and string representations of numbers into floats.
# astype(float) converts a whole column at once instead of building an
# intermediate Python list per column, and it turns None into NaN where
# float(None) would raise a TypeError.

for col in decimal_columns:
    data[col] = data[col].astype(float)

In [None]:
# Recompute the numerical columns (the conversions above changed some dtypes)
num_data = [name for name, dtype in data.dtypes.items()
            if dtype in ('float64', 'int64')]

# Fill the remaining gaps of each numerical column with that column's median
column_medians = data[num_data].median()
for col, median_value in column_medians.items():
    data[col] = data[col].fillna(median_value)

In [None]:
# Feature selection to reduce the number of features
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Split off the target and remove the columns that are not used as predictors
# (presumably a client identifier and two date columns — confirm against the data dictionary)
y = data['f_purchase_lh']
X = data.drop(columns=['clntnum', 'min_occ_date', 'cltdob_fix', 'f_purchase_lh'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Rank the features with the importances of a random forest fitted on the training split
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=10)
rf_classifier.fit(X_train, y_train)
feature_importances = rf_classifier.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot only the strongest features: one bar per column of the full frame
# does not fit legibly into a 10x6 inch figure
top_n_plotted = 30
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n_plotted))
plt.title('Feature Importances')
plt.show()

threshold = 0.01 # Adjust the threshold as needed
selected_features = feature_importance_df.loc[feature_importance_df['Importance'] > threshold, 'Feature']

X_selected = X[selected_features]

# Standardising the data. The column names and the row index are kept so that the
# scaled frame stays aligned with y and the features remain identifiable.
# NOTE(review): the scaler is fitted on all rows before the later train/test split,
# so test-set statistics leak into the scaling — consider fitting on the training rows only.
scaler = StandardScaler()
X_selected = pd.DataFrame(scaler.fit_transform(X_selected),
                          columns=X_selected.columns,
                          index=X_selected.index)

# Upsampling + Downsampling

Sources:



Approach currently being tried (combined over- and under-sampling):
https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/


Simpler fallback approach if the one above takes too long:
https://wellsr.com/python/upsampling-and-downsampling-imbalanced-data-in-python/

In [None]:
# Install the parquet engines (pandas needs pyarrow or fastparquet to read .parquet files)
%pip install pyarrow
%pip install fastparquet
# Print the versions of pandas and its dependencies for troubleshooting
pd.show_versions()

In [None]:
%pip install imbalanced-learn

In [None]:
import imblearn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Independent deep copy of the preprocessed frame
# NOTE(review): only referenced by commented-out code further down — looks unused
data_sample_1 = copy.deepcopy(data)

In [None]:
'''
# X should contain the features, y should contain the labels
# Splitting data into features and labels
X = data_sample_1.drop('f_purchase_lh', axis=1)
y = data_sample_1['f_purchase_lh']

print(data_sample_1.shape)
print(X.shape)
print(y.shape)
'''

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Define the resampling pipeline (the ratios probably still need tuning).
# For a float sampling_strategy, imbalanced-learn reads the value as the desired
# minority/majority ratio after resampling:
#   - over-sampling to 0.5 grows the minority class to 50% of the majority class
#   - under-sampling to 0.8 then shrinks the majority class until the minority
#     class is 80% of it (majority = 1.25 x minority)
# random_state is fixed so the resampled training set is reproducible, like the
# other seeded steps in this notebook.
over_sampler = RandomOverSampler(sampling_strategy=0.5, random_state=42)
under_sampler = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

pipeline = Pipeline([('over_sampler', over_sampler), ('under_sampler', under_sampler)])

# Apply the pipeline to the training data only; the test split is left untouched
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

print(X_train_resampled.shape)
print(y_train_resampled.shape)

print(y_train_resampled.value_counts())

## Confusion Matrix

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Fit an RBF-kernel SVM on the resampled training data (fit() returns the fitted model)
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_resampled, y_train_resampled)

# Predict the held-out test split
y_pred = svm_model.predict(X_test)

# Compare the predictions with the true labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report each metric under its heading
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', classification_rep, sep='\n')

In [None]:
# from numpy import mean
# from sklearn.datasets import make_classification
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.tree import DecisionTreeClassifier

In [None]:

# #Easy method from second link if laze, incomplete
# #Separating the data into success and fail
# success_df = data_sample_1[data_sample_1["f_purchase_lh"] == 1]
# fail_df = data_sample_1[data_sample_1["f_purchase_lh"] == 0]
# #Checking if value counts match
# print(success_df.shape)
# print(fail_df.shape)
# print(data["f_purchase_lh"].value_counts()) 

# #Downsampling with sklearn
# from sklearn.utils import resample


# fail_downsample = resample(fail_df, replace=True, n_samples=len(success_df), random_state=42) ##

# print(len(success_df))
# print(fail_downsample.shape)


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
# Reload the training file without its target column and run it through the function
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!