In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the row-level train and test frames from the "non-null" parquet exports.
# Paths are relative to the notebook's working directory.
train_data = pd.read_parquet('../../data/non-null/train_data_12_months_unprocessed_non_null.parquet')
test_data = pd.read_parquet('../../data/non-null/test_data_12_months_unprocessed_non_null.parquet')


In [3]:
# Customer IDs that define the train / validation split, read from the saved
# y_train / y_val parquet files (only the customer_ID column is used here).
train_ids = pd.read_parquet('../../data/initial_modelling/y_train.parquet')['customer_ID'].values
val_ids = pd.read_parquet('../../data/initial_modelling/y_val.parquet')['customer_ID'].values

In [4]:
# Both splits are carved out of train_data: a row is kept when its customer_ID
# appears in the corresponding ID array.
# NOTE(review): nothing here checks that train_ids and val_ids are disjoint — confirm upstream.
train_data_filtered = train_data[train_data['customer_ID'].isin(train_ids)]
val_data_filtered = train_data[train_data['customer_ID'].isin(val_ids)]

In [5]:
# Per-column dtype and non-null counts of the training split before imputation.
train_data_filtered.info(verbose=True, show_counts=True, memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3242676 entries, 0 to 4632395
Data columns (total 95 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   customer_ID   3242676 non-null  object        
 1   S_2           3242676 non-null  datetime64[ns]
 2   D_39          3242676 non-null  float32       
 3   B_1           3242676 non-null  float32       
 4   B_2           3242542 non-null  float32       
 5   R_1           3242676 non-null  float32       
 6   D_41          3242542 non-null  float32       
 7   B_3           3242542 non-null  float32       
 8   B_4           3242676 non-null  float32       
 9   D_45          3242542 non-null  float32       
 10  B_5           3242676 non-null  float32       
 11  R_2           3242676 non-null  float32       
 12  D_47          3242676 non-null  float32       
 13  B_6           3242676 non-null  float32       
 14  B_7           3242676 non-null  float32       
 15  D_5

In [6]:
# Replace every remaining null with 0 in all three splits (all columns, no per-column strategy).
# fillna returns a new frame, so the names are rebound to fresh objects; test_data itself
# is left untouched and the filled copy is stored as test_data_filtered.
# NOTE(review): the blanket fill also applies to the columns later treated as categorical
# (B_38, D_63, B_30, D_126); if any of them had nulls, those rows are merged into the
# 0 category — confirm this is acceptable.
train_data_filtered = train_data_filtered.fillna(0)
val_data_filtered = val_data_filtered.fillna(0)
test_data_filtered = test_data.fillna(0)

In [7]:
# Sanity check: total number of remaining nulls in train and val (test is not checked here).
train_data_filtered.isna().sum().sum(), val_data_filtered.isna().sum().sum()

(0, 0)

In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

def one_hot_encode_categories(df, categorical_columns=None, drop_original=True, handle_unknown='error', encoder=None):
    """
    One-hot encode categorical columns in a DataFrame.

    With the default ``encoder=None`` a new ``OneHotEncoder`` is fitted on ``df``.
    When a fitted encoder is supplied it is only applied (``transform``), so the
    same category layout can be reused on validation / test data.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame containing categorical columns. It is never modified.
    categorical_columns : str, sequence of str or None
        Column name(s) to encode. If None, the columns are taken from
        ``encoder.feature_names_in_`` when a fitted encoder is given, otherwise
        every column of dtype 'object' or 'category' is used. Note that the
        automatic detection also picks up identifier-like object columns; the
        encoder output is dense, so pass the columns explicitly on wide data.
    drop_original : bool
        Whether to drop the original categorical columns
    handle_unknown : str
        Strategy for handling unknown categories in new data: 'error', 'ignore' or 'infrequent_if_exist'.
        Only used when a new encoder is created (i.e. ``encoder`` is None).
    encoder : fitted encoder or None
        Optional already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out``. If None, a new encoder is created and fitted.

    Returns:
    --------
    pandas.DataFrame
        DataFrame with one-hot encoded columns appended (same index as ``df``)
    OneHotEncoder
        The encoder that produced the columns; the ``encoder`` argument itself
        (None by default) when there was nothing to encode.
    """
    # Normalise the column selection to a plain list so that a pandas Index,
    # a NumPy array or a single column name all behave like a list of names.
    if categorical_columns is None:
        if encoder is not None and hasattr(encoder, 'feature_names_in_'):
            categorical_columns = list(encoder.feature_names_in_)
        else:
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    elif isinstance(categorical_columns, str):
        categorical_columns = [categorical_columns]
    else:
        categorical_columns = list(categorical_columns)

    if not categorical_columns:
        print("No categorical columns found to encode.")
        # Hand back a copy so callers never receive the input object itself.
        return df.copy(), encoder

    if encoder is None:
        # Fit a fresh encoder on this frame.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown, drop='if_binary')
        encoded_array = encoder.fit_transform(df[categorical_columns])
    else:
        # Reuse the categories learned elsewhere; no refitting.
        encoded_array = encoder.transform(df[categorical_columns])

    # A caller-supplied encoder may emit a sparse matrix; densify it for the DataFrame.
    if hasattr(encoded_array, 'toarray'):
        encoded_array = encoded_array.toarray()

    # Get feature names
    feature_names = encoder.get_feature_names_out(categorical_columns)

    # Keep the source index so the concat below aligns row by row.
    encoded_df = pd.DataFrame(encoded_array, columns=feature_names, index=df.index)

    # drop() and concat() both return new objects, so no up-front copy of the
    # (potentially very large) input frame is needed to protect it.
    base_df = df.drop(columns=categorical_columns) if drop_original else df
    result_df = pd.concat([base_df, encoded_df], axis=1)

    print(f"One-hot encoded {len(categorical_columns)} categorical columns into {len(feature_names)} binary features.")

    return result_df, encoder

In [9]:
# Columns treated as categorical; cast them to pandas 'category' dtype in every split.
cat_cols = ['B_38', 'D_63', 'B_30', 'D_126']
for _split_frame in (train_data_filtered, val_data_filtered, test_data_filtered):
    # Item assignment updates the frame each name already points to.
    _split_frame[cat_cols] = _split_frame[cat_cols].astype('category')

In [10]:
# Exploratory check: fit a standalone encoder on the training categoricals and display
# the resulting dense array. This `ohe` object is separate from the `cat_encoder`
# returned by one_hot_encode_categories further down.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='if_binary')
ohe.fit_transform(train_data_filtered[cat_cols])

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [11]:
# Names of the one-hot columns produced by the exploratory encoder.
ohe.get_feature_names_out(cat_cols)

array(['B_38_0.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0',
       'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_63_CL', 'D_63_CO',
       'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'B_30_0.0', 'B_30_1.0',
       'B_30_2.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0'], dtype=object)

In [12]:
# Fit the one-hot encoder on the training split only and keep it (cat_encoder) for the
# other splits. handle_unknown='ignore' is passed through to the encoder so categories
# not seen during fitting do not raise at transform time.
train_data_filtered_oh, cat_encoder = one_hot_encode_categories(train_data_filtered, handle_unknown='ignore', categorical_columns=cat_cols)

One-hot encoded 4 categorical columns into 20 binary features.


In [13]:
def _encode_with_fitted(frame, fitted_encoder):
    """Replace the encoder's input columns in `frame` with their one-hot columns (transform only)."""
    source_cols = fitted_encoder.feature_names_in_
    onehot_part = pd.DataFrame(
        fitted_encoder.transform(frame[source_cols]),
        columns=fitted_encoder.get_feature_names_out(),
        index=frame.index,  # keep the row index so the concat aligns row by row
    )
    return pd.concat([frame.drop(columns=source_cols), onehot_part], axis=1)


# Apply the encoder fitted on the training split to validation and test.
val_data_filtered_oh = _encode_with_fitted(val_data_filtered, cat_encoder)
test_data_filtered_oh = _encode_with_fitted(test_data_filtered, cat_encoder)

In [14]:
# Display the one-hot encoded test frame (the notebook renders a truncated preview).
test_data_filtered_oh

Unnamed: 0,customer_ID,S_2,D_39,B_1,B_2,R_1,D_41,B_3,B_4,D_45,...,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,B_30_0.0,B_30_1.0,B_30_2.0,D_126_-1.0,D_126_0.0,D_126_1.0
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.005775,0.004923,1.000653,0.006151,0.000798,0.002714,0.069419,0.712795,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.091505,0.021655,1.009672,0.006815,0.007598,0.009423,0.068839,0.720884,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.002455,0.013683,1.002700,0.001373,0.000685,0.005531,0.055630,0.723997,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.002483,0.015193,1.000727,0.007605,0.004653,0.009312,0.038862,0.720619,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-08-04,0.001746,0.007863,1.005006,0.004220,0.009857,0.009866,0.027265,0.721371,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4696183,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-11-05,0.416013,0.020818,0.828199,0.003487,0.005340,0.025139,0.000029,0.738777,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4696184,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-12-23,0.296536,0.007209,0.812610,0.005904,0.002243,0.023691,0.014354,0.744180,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4696185,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-01-06,0.443984,0.013151,0.815422,0.003457,0.002111,0.012343,0.016425,0.747021,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4696186,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-02-06,0.442553,0.009855,1.003541,0.005117,0.009930,0.008578,0.004424,0.748190,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [15]:
# Display the one-hot encoded validation frame (the notebook renders a truncated preview).
val_data_filtered_oh

Unnamed: 0,customer_ID,S_2,D_39,B_1,B_2,R_1,D_41,B_3,B_4,D_45,...,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,B_30_0.0,B_30_1.0,B_30_2.0,D_126_-1.0,D_126_0.0,D_126_1.0
12,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-03-01,0.382790,0.025782,1.002647,0.005515,0.001479,0.005830,0.021776,0.239459,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
13,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-04-16,0.002224,0.006806,1.008186,0.003287,0.007831,0.000861,0.012991,0.247009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
14,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-05-07,0.567403,0.033713,1.007497,0.005594,0.003162,0.010466,0.015447,0.243068,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
15,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-06-25,0.213193,0.031170,1.008534,0.008996,0.009513,0.005360,0.015686,0.254167,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
16,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-07-30,0.325467,0.109644,1.005098,0.008041,0.000732,0.005484,0.046556,0.248585,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4632403,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-10-20,0.002230,0.022695,0.557029,0.008897,0.000880,0.045471,0.006506,0.744973,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4632404,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-11-05,0.416013,0.020818,0.828199,0.003487,0.005340,0.025139,0.000029,0.738777,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4632405,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-12-23,0.296536,0.007209,0.812610,0.005904,0.002243,0.023691,0.014354,0.744180,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4632406,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-01-06,0.443984,0.013151,0.815422,0.003457,0.002111,0.012343,0.016425,0.747021,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Per-column dtype and non-null counts of the training frame after one-hot encoding.
train_data_filtered_oh.info(verbose=True, show_counts=True, memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3242676 entries, 0 to 4632395
Data columns (total 111 columns):
 #    Column        Non-Null Count    Dtype         
---   ------        --------------    -----         
 0    customer_ID   3242676 non-null  object        
 1    S_2           3242676 non-null  datetime64[ns]
 2    D_39          3242676 non-null  float32       
 3    B_1           3242676 non-null  float32       
 4    B_2           3242676 non-null  float32       
 5    R_1           3242676 non-null  float32       
 6    D_41          3242676 non-null  float32       
 7    B_3           3242676 non-null  float32       
 8    B_4           3242676 non-null  float32       
 9    D_45          3242676 non-null  float32       
 10   B_5           3242676 non-null  float32       
 11   R_2           3242676 non-null  float32       
 12   D_47          3242676 non-null  float32       
 13   B_6           3242676 non-null  float32       
 14   B_7           3242676 non-null  float

In [17]:
# Columns removed from the feature frames: the S_2 date column and the label.
cols_drop = ['S_2', 'target']

In [18]:
# Keep the keys and the label of every split in separate frames. This has to run
# before 'target' is dropped from the *_oh frames.
# NOTE(review): these frames keep one row per row of the source frame (one per
# customer_ID / end_of_month pair) in the source row order, while the 3D arrays built
# later hold one entry per customer after sorting — confirm how consumers align them.
train_targets = train_data_filtered_oh[['customer_ID', 'end_of_month','target']]
val_targets = val_data_filtered_oh[['customer_ID', 'end_of_month','target']]
test_targets = test_data_filtered_oh[['customer_ID', 'end_of_month','target']]

In [19]:
# Remove the columns listed in cols_drop from each one-hot frame, rebinding every
# name to the trimmed result.
train_data_filtered_oh = train_data_filtered_oh.drop(columns=cols_drop)
val_data_filtered_oh = val_data_filtered_oh.drop(columns=cols_drop)
test_data_filtered_oh = test_data_filtered_oh.drop(columns=cols_drop)

In [20]:
# Per-column overview of the training frame once S_2 and target have been removed.
train_data_filtered_oh.info(verbose=True, show_counts=True, memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3242676 entries, 0 to 4632395
Data columns (total 109 columns):
 #    Column        Non-Null Count    Dtype         
---   ------        --------------    -----         
 0    customer_ID   3242676 non-null  object        
 1    D_39          3242676 non-null  float32       
 2    B_1           3242676 non-null  float32       
 3    B_2           3242676 non-null  float32       
 4    R_1           3242676 non-null  float32       
 5    D_41          3242676 non-null  float32       
 6    B_3           3242676 non-null  float32       
 7    B_4           3242676 non-null  float32       
 8    D_45          3242676 non-null  float32       
 9    B_5           3242676 non-null  float32       
 10   R_2           3242676 non-null  float32       
 11   D_47          3242676 non-null  float32       
 12   B_6           3242676 non-null  float32       
 13   B_7           3242676 non-null  float32       
 14   D_51          3242676 non-null  float

In [22]:
# Columns excluded from numeric scaling: the two key columns followed by every
# one-hot indicator column produced by cat_encoder.
num_scale_drop_cols = ['customer_ID', 'end_of_month', *cat_encoder.get_feature_names_out()]
print(num_scale_drop_cols)

['customer_ID', 'end_of_month', 'B_38_0.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0', 'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'B_30_0.0', 'B_30_1.0', 'B_30_2.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0']


In [27]:
# Number of columns that bypass the scaler (2 key columns + one-hot indicators).
len(num_scale_drop_cols)

22

In [23]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the training frame only, after removing the key columns and the
# one-hot indicator columns listed in num_scale_drop_cols. The same fitted scaler is
# applied to train, val and test in the following cells.
rb_scaler = RobustScaler()
rb_scaler.fit(train_data_filtered_oh.drop(columns=num_scale_drop_cols))

In [28]:
# Train data - numerically scaled.
# Build the numeric-only view once; the previous version dropped the same columns
# twice, materialising the multi-million-row frame a second time just to read .columns.
_train_numeric = train_data_filtered_oh.drop(columns=num_scale_drop_cols)
train_data_filtered_num_scaled = pd.DataFrame(
    rb_scaler.transform(_train_numeric),
    columns=_train_numeric.columns,
    index=_train_numeric.index,  # same index as train_data_filtered_oh; drop() keeps rows
)
del _train_numeric  # release the temporary copy

# Re-attach the unscaled key and one-hot columns in front of the scaled numeric columns.
train_data_filtered_scaled = pd.concat([train_data_filtered_oh[num_scale_drop_cols], train_data_filtered_num_scaled], axis=1)

In [33]:
# Validation data - scaled with the scaler fitted on the training frame.
# Build the numeric-only view once; the previous version dropped the same columns
# twice, materialising the frame a second time just to read .columns.
_val_numeric = val_data_filtered_oh.drop(columns=num_scale_drop_cols)
val_data_filtered_num_scaled = pd.DataFrame(
    rb_scaler.transform(_val_numeric),
    columns=_val_numeric.columns,
    index=_val_numeric.index,  # same index as val_data_filtered_oh; drop() keeps rows
)
del _val_numeric  # release the temporary copy

# Re-attach the unscaled key and one-hot columns in front of the scaled numeric columns.
val_data_filtered_scaled = pd.concat([val_data_filtered_oh[num_scale_drop_cols], val_data_filtered_num_scaled], axis=1)

In [34]:
# Test data - scaled with the scaler fitted on the training frame.
# Build the numeric-only view once; the previous version dropped the same columns
# twice, materialising the frame a second time just to read .columns.
_test_numeric = test_data_filtered_oh.drop(columns=num_scale_drop_cols)
test_data_filtered_num_scaled = pd.DataFrame(
    rb_scaler.transform(_test_numeric),
    columns=_test_numeric.columns,
    index=_test_numeric.index,  # same index as test_data_filtered_oh; drop() keeps rows
)
del _test_numeric  # release the temporary copy

# Re-attach the unscaled key and one-hot columns in front of the scaled numeric columns.
test_data_filtered_scaled = pd.concat([test_data_filtered_oh[num_scale_drop_cols], test_data_filtered_num_scaled], axis=1)

In [31]:
# Preview the first rows of the scaled training frame (keys and one-hot columns first).
train_data_filtered_scaled.head()

Unnamed: 0,customer_ID,end_of_month,B_38_0.0,B_38_1.0,B_38_2.0,B_38_3.0,B_38_4.0,B_38_5.0,B_38_6.0,B_38_7.0,...,R_24,R_25,D_96,D_102,B_36,D_127,D_133,R_28,D_140,D_144
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-31,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.219306,-0.27265,-0.041485,2.311544,0.981211,178.048201,-0.195475,-0.693695,-0.270751,-0.884744
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-30,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.650272,0.76484,-0.384333,2.37047,-0.221135,178.953927,0.382777,-0.014753,-0.376212,-0.001863
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-31,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.511399,0.948012,0.052165,2.434015,-0.749395,179.116713,0.700725,0.823031,0.437769,0.268263
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-30,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.680948,-0.03126,-0.988148,2.438205,-0.458049,177.754051,0.329728,-0.518947,-0.112386,0.18535
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-31,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.420909,-0.405085,-0.896716,2.481113,0.987143,179.22006,0.165614,-0.108503,-0.028332,0.474512


In [35]:
# Index the one-hot encoded training frame by (customer_ID, end_of_month) and sort it,
# so rows are ordered by customer and then by month ahead of the 3D reshape.
# NOTE(review): the source here is train_data_filtered_oh, not train_data_filtered_scaled,
# so the RobustScaler output never reaches the 3D array — confirm whether that is intended.
train_data_3d = train_data_filtered_oh.set_index(['customer_ID', 'end_of_month']).sort_index()

In [36]:
# Display the training targets frame (customer_ID, end_of_month, target).
train_targets

Unnamed: 0,customer_ID,end_of_month,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-31,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-30,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-31,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-30,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-31,0
...,...,...,...
4632391,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2017-10-31,1
4632392,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2017-11-30,1
4632393,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2017-12-31,1
4632394,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2018-01-31,1


In [37]:
# Display the sorted, MultiIndexed training frame that feeds the reshape below.
train_data_3d

Unnamed: 0_level_0,Unnamed: 1_level_0,D_39,B_1,B_2,R_1,D_41,B_3,B_4,D_45,B_5,R_2,...,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,B_30_0.0,B_30_1.0,B_30_2.0,D_126_-1.0,D_126_0.0,D_126_1.0
customer_ID,end_of_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-03-31,0.001733,0.008724,1.006838,0.009228,0.008771,0.004709,0.080986,0.708906,0.170600,0.006204,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-04-30,0.005775,0.004923,1.000653,0.006151,0.000798,0.002714,0.069419,0.712795,0.113239,0.006206,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-05-31,0.091505,0.021655,1.009672,0.006815,0.007598,0.009423,0.068839,0.720884,0.060492,0.003259,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-06-30,0.002455,0.013683,1.002700,0.001373,0.000685,0.005531,0.055630,0.723997,0.166782,0.009918,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-07-31,0.002483,0.015193,1.000727,0.007605,0.004653,0.009312,0.038862,0.720619,0.143630,0.006667,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,2017-10-31,0.002767,0.025240,1.004200,0.009835,0.006601,0.095549,0.154402,0.135532,0.015174,0.001389,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,2017-11-30,0.004490,0.022121,1.001141,0.003452,0.004425,0.044108,0.163308,0.066010,0.007564,0.006320,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,2017-12-31,0.001332,0.029211,1.005610,0.002905,0.001444,0.039642,0.160784,0.062111,0.005290,0.006462,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,2018-01-31,0.030946,0.015508,0.209527,0.008004,0.008975,0.083690,0.183500,0.070414,0.009461,0.006594,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [38]:
# Convert to a 3D numpy array of shape (customers, months, features).
_n_train_customers = len(train_data_3d.index.levels[0])  # Number of unique customer_IDs
_n_train_months = len(train_data_3d.index.levels[1])     # Number of unique end_of_month entries
_n_train_features = train_data_3d.shape[1]               # Number of features

# The reshape only yields a valid (customer, month) grid when every customer has exactly
# one row per month. Check that explicitly: with a trailing -1 a frame with missing or
# duplicated rows could still reshape "successfully" and silently mix customers.
if not train_data_3d.index.is_unique:
    raise ValueError("train_data_3d contains duplicate (customer_ID, end_of_month) rows")
if len(train_data_3d) != _n_train_customers * _n_train_months:
    raise ValueError(
        f"train_data_3d has {len(train_data_3d)} rows, expected "
        f"{_n_train_customers} customers x {_n_train_months} months"
    )

train_data_3d_array = train_data_3d.to_numpy().reshape(
    _n_train_customers,
    _n_train_months,
    _n_train_features,
)

print(train_data_3d_array.shape)

(270223, 12, 107)


In [39]:
# Feature vector at the first customer / first month position of the training array.
train_data_3d_array[0,0,:]

array([1.73333904e-03, 8.72445107e-03, 1.00683820e+00, 9.22772195e-03,
       8.77113175e-03, 4.70924051e-03, 8.09863359e-02, 7.08906293e-01,
       1.70600235e-01, 6.20403141e-03, 5.25351048e-01, 6.39022142e-02,
       5.94157316e-02, 1.33585584e+00, 8.20673909e-03, 1.42250210e-03,
       9.62188095e-02, 2.33811215e-02, 2.76806159e-03, 8.32164567e-03,
       1.00151896e+00, 8.29843525e-03, 1.48266062e-01, 9.22998071e-01,
       1.18075132e-01, 1.88179361e-03, 1.58611953e-01, 1.83846187e-02,
       1.99617013e-01, 4.01618570e-01, 7.12616276e-03, 7.66526628e-03,
       6.52984440e-01, 8.52043927e-03, 4.72982554e-03, 2.72007585e-01,
       8.36253911e-03, 5.15222073e-01, 2.64402619e-03, 4.80751088e-03,
       1.19403206e-01, 1.08271115e-01, 5.08818515e-02, 7.55443284e-03,
       6.90667853e-02, 4.32678638e-03, 7.56244501e-03, 2.71827972e-04,
       1.43398868e-03, 2.27093743e-03, 7.12108519e-03, 2.45606480e-03,
       2.31029745e-03, 8.03302322e-03, 1.00982463e+00, 8.46826136e-02,
      

In [40]:
# Index the validation frame by (customer_ID, end_of_month), sort it, and reshape it
# into a 3D array of shape (customers, months, features).
val_data_3d = val_data_filtered_oh.set_index(['customer_ID', 'end_of_month']).sort_index()

_n_val_customers = len(val_data_3d.index.levels[0])  # Number of unique customer_IDs
_n_val_months = len(val_data_3d.index.levels[1])     # Number of unique end_of_month entries
_n_val_features = val_data_3d.shape[1]               # Number of features

# The reshape only yields a valid (customer, month) grid when every customer has exactly
# one row per month. Check that explicitly: with a trailing -1 a frame with missing or
# duplicated rows could still reshape "successfully" and silently mix customers.
if not val_data_3d.index.is_unique:
    raise ValueError("val_data_3d contains duplicate (customer_ID, end_of_month) rows")
if len(val_data_3d) != _n_val_customers * _n_val_months:
    raise ValueError(
        f"val_data_3d has {len(val_data_3d)} rows, expected "
        f"{_n_val_customers} customers x {_n_val_months} months"
    )

val_data_3d_array = val_data_3d.to_numpy().reshape(
    _n_val_customers,
    _n_val_months,
    _n_val_features,
)

print(val_data_3d_array.shape)

(115811, 12, 107)


In [41]:
# Index the test frame by (customer_ID, end_of_month), sort it, and reshape it
# into a 3D array of shape (customers, months, features).
test_data_3d = test_data_filtered_oh.set_index(['customer_ID', 'end_of_month']).sort_index()

_n_test_customers = len(test_data_3d.index.levels[0])  # Number of unique customer_IDs
_n_test_months = len(test_data_3d.index.levels[1])     # Number of unique end_of_month entries
_n_test_features = test_data_3d.shape[1]               # Number of features

# The reshape only yields a valid (customer, month) grid when every customer has exactly
# one row per month. Check that explicitly: with a trailing -1 a frame with missing or
# duplicated rows could still reshape "successfully" and silently mix customers.
if not test_data_3d.index.is_unique:
    raise ValueError("test_data_3d contains duplicate (customer_ID, end_of_month) rows")
if len(test_data_3d) != _n_test_customers * _n_test_months:
    raise ValueError(
        f"test_data_3d has {len(test_data_3d)} rows, expected "
        f"{_n_test_customers} customers x {_n_test_months} months"
    )

test_data_3d_array = test_data_3d.to_numpy().reshape(
    _n_test_customers,
    _n_test_months,
    _n_test_features,
)
print(test_data_3d_array.shape)

(391349, 12, 107)


In [36]:
# Save the 3D arrays in compressed .npz format. Each array is passed positionally,
# so it is stored under NumPy's default key 'arr_0' inside its file.
np.savez_compressed('../../data/3d_array/train_data_3d.npz', train_data_3d_array)
np.savez_compressed('../../data/3d_array/val_data_3d.npz', val_data_3d_array)
# The test split is written as well so the .npz export covers the same three splits
# as the parquet targets and the HDF5 export.
np.savez_compressed('../../data/3d_array/test_data_3d.npz', test_data_3d_array)

In [43]:
# Persist the target frames (customer_ID, end_of_month, target) next to the 3D arrays.
# index=False: the original row index is not written to the files.
train_targets.to_parquet('../../data/3d_array/train_targets.parquet', index=False)
val_targets.to_parquet('../../data/3d_array/val_targets.parquet', index=False)
test_targets.to_parquet('../../data/3d_array/test_targets.parquet', index=False)

In [42]:
import h5py

def _write_h5_dataset(path, dataset_name, array):
    """Create (or overwrite) the HDF5 file at `path` with one gzip-compressed dataset."""
    with h5py.File(path, 'w') as h5_file:
        h5_file.create_dataset(dataset_name, data=array, compression='gzip')


# One file per split, each holding a single dataset named after the split.
_write_h5_dataset('../../data/3d_array/train_data_3d_h5.h5', 'train_data_3d', train_data_3d_array)
_write_h5_dataset('../../data/3d_array/val_data_3d_h5.h5', 'val_data_3d', val_data_3d_array)
_write_h5_dataset('../../data/3d_array/test_data_3d_h5.h5', 'test_data_3d', test_data_3d_array)