##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
%pip install pandas 
%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [None]:
# Can have as many cells as you want for code
import pandas as pd
import sklearn
# Relative location of the training parquet file; read below with pd.read_parquet.
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [None]:
# Load the training set from the parquet file and preview its first five rows.
data = pd.read_parquet(path=filepath)
print(data.head(n=5))

In [None]:
# Replace missing labels with 0 so the target column contains no NaN values.
data['f_purchase_lh'] = data['f_purchase_lh'].fillna(0)

In [None]:
# Temporarily widen the display (all columns, at most 5 rows, 3 decimals) and print the frame.
display_opts = (
    'display.max_rows', 5,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*display_opts):
    print(data)

In [None]:
# Show the dtype of every column.
print(data.dtypes)

In [None]:
# Remove every column in which more than `thresh` percent of the values are missing.
thresh = 50
missing_pct = data.isna().mean() * 100  # per-column share of NaN, in percent
percentage = missing_pct[missing_pct > thresh].index.tolist()
data.drop(columns=percentage, inplace=True)

In [None]:
# List the distinct dtypes that remain after dropping the sparse columns.
print(data.dtypes.unique())

In [None]:
# Percentage of missing stat_flag entries, then the distinct values the column takes.
stat_flag = data['stat_flag']
print(stat_flag.isna().mean() * 100) # No missing values for stat_flag
stat_flag.unique()

In [None]:
# Percentage of missing clttype entries, then the distinct values the column takes.
clttype = data['clttype']
print(clttype.isna().mean() * 100)
clttype.unique()

In [None]:
# Percentage of missing race_desc entries, followed by the frequency of each category.
print(data['race_desc'].isna().sum()/data.shape[0]*100)
# value_counts() already lists every distinct (non-NaN) value, so the separate
# unique() call, whose result was discarded mid-cell, has been removed.
counts = data['race_desc'].value_counts()
print(counts)

In [None]:
# Names of all columns whose dtype is neither float64 nor int64.
numeric_dtypes = ['float64', 'int64']
cate_data = [name for name, dtype in data.dtypes.items() if dtype not in numeric_dtypes]

In [None]:
# Work out the most frequent value of each non-numeric column
# (mode() may return several values; the first one is used) ...
fill_values = {name: data[name].mode().iloc[0] for name in cate_data}
# ... and use it to fill that column's missing entries.
for name, most_frequent in fill_values.items():
    data[name] = data[name].fillna(most_frequent)
data

In [None]:
# Names of all columns stored as float64 or int64.
num_data = [name for name, dtype in data.dtypes.items() if dtype in ('float64', 'int64')]

In [None]:
# Median imputation of the numeric columns, currently disabled by wrapping it in a
# string literal. While it stays disabled, NaN values in `num_data` columns are not filled.
# NOTE(review): the inline comment inside the block mentions iloc[0]/mode but the code uses median().
'''
for col in num_data:
    median_value = data[col].median()  # Use iloc[0] to get the first mode in case of multiple modes
    data[col] = data[col].fillna(median_value)
data
'''


In [None]:
import copy

# Three independent deep copies of the cleaned frame, named after the resampling
# approach each one is set aside for (upsampling, downsampling, SMOTE).
data_upsample, data_downsample, data_SMOTE = (copy.deepcopy(data) for _ in range(3))

# Upsampling + Downsampling

References:

Approach being tried here (combining oversampling and undersampling):
https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/

Simpler fallback (manual upsampling/downsampling with `sklearn.utils.resample`):
https://wellsr.com/python/upsampling-and-downsampling-imbalanced-data-in-python/

In [None]:
# pyarrow and fastparquet are the parquet engines pandas can use for read_parquet.
# NOTE(review): the parquet file is already read in an earlier cell, so on a fresh
# environment these installs would have to run before that cell — consider moving them up.
%pip install pyarrow
%pip install fastparquet
# Print the versions of pandas and its dependencies (useful when debugging the environment).
pd.show_versions()

In [None]:
# imbalanced-learn provides the `imblearn` package imported in the next cell.
%pip install imbalanced-learn

In [None]:
import imblearn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Independent deep copy of the cleaned frame for the resampling experiments below.
data_sample_1 = data.copy(deep=True)

In [None]:
# X should contain the features, y should contain the labels
# Splitting data into features and labels
X = data_sample_1.drop('f_purchase_lh', axis=1)
y = data_sample_1['f_purchase_lh']

# Sanity check: X has one column fewer than the full frame, y has one entry per row.
print(data_sample_1.shape)
print(X.shape)
print(y.shape)  # was `Y.shape`, which raised NameError (the label variable is lowercase `y`)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline(numbers prolly to be adjusted)
# For a binary target, a float sampling_strategy is the desired ratio
# (minority samples / majority samples) after that resampling step:
#   - over-sampling duplicates minority rows until minority = 0.5 * majority
#   - under-sampling then removes majority rows until minority = 0.8 * majority
# random_state is fixed so the resampled training set is identical on every run.
over_sampler = RandomOverSampler(sampling_strategy=0.5, random_state=42)
under_sampler = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

pipeline = Pipeline([('over_sampler', over_sampler),('under_sampler', under_sampler)])

# Apply the pipeline to the training data only
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

print(X_train_resampled.shape)
print(y_train_resampled.shape)

# Class balance after resampling
print(y_train_resampled.value_counts())

## Decision tree experiment (abandoned; the cells below are exploratory only)

In [None]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Imported here because this cell is the only place it is needed.
from sklearn.impute import SimpleImputer

# define model
model = DecisionTreeClassifier(random_state=1)

# A scikit-learn decision tree cannot be fitted on string columns, so only the
# numeric features are used; missing values are median-imputed inside the pipeline
# so the fit does not depend on the estimator's own NaN handling.
numeric_cols = X_train.select_dtypes(include='number').columns

# The resampling steps are part of the pipeline so that, in every CV split, they are
# applied to the training folds only. Cross-validating data that was resampled
# beforehand puts duplicated minority rows into both the training and the validation
# fold and inflates the score.
eval_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('over_sampler', RandomOverSampler(sampling_strategy=0.5, random_state=1)),
    ('under_sampler', RandomUnderSampler(sampling_strategy=0.8, random_state=1)),
    ('model', model),
])

# Define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model on the original (un-resampled) training split
scores = cross_val_score(eval_pipeline, X_train[numeric_cols], y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

In [None]:
# Simpler method from the second link (still incomplete)
#Separating the data into success and fail
success_df = data_sample_1[data_sample_1["f_purchase_lh"] == 1]
fail_df = data_sample_1[data_sample_1["f_purchase_lh"] == 0]
#Checking if value counts match
print(success_df.shape)
print(fail_df.shape)
print(data["f_purchase_lh"].value_counts()) 

#Downsampling with sklearn
from sklearn.utils import resample

# Downsampling keeps a random subset of the `fail` rows, so it must draw WITHOUT
# replacement; replace=True would bootstrap the rows and put duplicates into the
# reduced set while discarding more distinct rows than necessary.
fail_downsample = resample(fail_df, replace=False, n_samples=len(success_df), random_state=42)

# Both numbers should now be equal
print(len(success_df))
print(fail_downsample.shape)


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data passed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
# Reload the training file and strip the label column so the frame has the same
# layout as the hidden data, then run the prediction function on it.
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!