# AMEX - Aggregated Dataset 🦝
## Predict if a customer will default in the future...

#### Notebook Goal (Work in Progress...)
The objective of this Notebook is to create a more complex aggregated dataset to train models on; **so far it has been quite challenging not to run out of memory**...
I will keep trying new ways to optimize the memory utilization...


#### Dataset
The objective of this competition is to predict the probability that a customer does not pay back their credit card balance amount in the future based on their monthly customer profile. The target binary variable is calculated by observing 18 months performance window after the latest credit card statement, and if the customer does not pay due amount in 120 days after their latest statement date it is considered a default event.







#### Resources
* https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
* https://waylonwalker.com/reset-ipython/ 

# 1.0 Loading Model Libraries...

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc as gc # garbage collector

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

---

# 2.0 Setting the Notebook Parameters and Default Configuration...

In [None]:
%%time
# I like to disable my Notebook Warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Notebook Configuration...

# Amount of data we want to load into the Model...
DATA_ROWS = None
# Dataframe, the amount of rows and cols to visualize...
NROWS = 50
NCOLS = 15
# Main data location path...
BASE_PATH = '...'

In [None]:
%%time
# Configure notebook display settings to use 5 decimal places, tables look nicer.
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_columns', NCOLS) 
pd.set_option('display.max_rows', NROWS)

---

# 3.0 Loading the Train Dataset Information (Using Feather)...

In [None]:
%%time
# Load the train data (Feather) and the train labels (CSV) into Pandas DataFrames...
trn_data = pd.read_feather('../input/parquet-files-amexdefault-prediction/train_data.ftr')
trn_lbls = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')

---

# 4.0 Exploring the Dataset, Quick EDA...

In [None]:
%%time
# Explore the shape of the DataFrame...
# trn_data.shape

In [None]:
%%time
# Display simple information of the variables in the dataset...
# trn_data.info()

In [None]:
%%time
# Display the first few rows of the DataFrame...
# trn_data.head()

In [None]:
%%time
# Display the Min Date...
# trn_data['S_2'].min()

In [None]:
%%time
# Display the Max Date...
# trn_data['S_2'].max()

In [None]:
%%time
# Generate a simple statistical summary of the DataFrame, Only Numerical...
# trn_data.describe() # I believe it consume significant memory

In [None]:
%%time
# Calculates the total number of missing values...
# trn_data.isnull().sum().sum()

In [None]:
%%time
# Display the number of missing values by variable...
# trn_data.isnull().sum()

In [None]:
%%time
# Display the number of unique values for each variable...
# trn_data.nunique()

---

# 6.0 Structuring data for the Model (Aggregations and More)

In [None]:
%%time
remove = ['customer_ID', 'S_2', 'target']
features = [f for f in trn_data.columns if f not in remove]

In [None]:
# Based on the dataset description, these are the Categorical Variables...
cat_variables_dataset = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [None]:
%%time
# Extract the names of categorical variables based on the number of unique values...
cutoff = 64
cat_variables_calc = [f for f in trn_data.columns if trn_data[f].nunique() < cutoff] 

In [None]:
%%time
num_features_calc = [col for col in features if col not in cat_variables_calc]

In [None]:
%%time
# Print the number of categorical variables identified
print(f'Categorical Variables Dataset:{len(cat_variables_dataset)}')
print(f'Categorical Variables Base on Calculations: {len(cat_variables_calc)}')

---

# 7.0 Defining an Aggregation function.

In [None]:
%%time
def agg_data(df, features, agg_calcs = None):
    '''
    Aggregate feature columns per customer into one row per customer_ID.

    Parameters
    ----------
    df : DataFrame containing a "customer_ID" column plus the feature columns.
    features : column name, or list of column names, to aggregate.
    agg_calcs : aggregation name, or list of aggregations, passed to
        GroupBy.agg. Defaults to ['mean'] when omitted.

    Returns
    -------
    DataFrame with "customer_ID" as a regular column and one column per
    (feature, aggregation) pair named '<feature>_<aggregation>'.
    '''

    # Use None as the default so no list object is shared between calls.
    if agg_calcs is None:
        agg_calcs = ['mean']
    elif isinstance(agg_calcs, str):
        # A bare string would produce single-level columns, and joining a
        # plain column name would split it into characters.
        agg_calcs = [agg_calcs]

    # Same reasoning for a single feature name: keep the result two-level.
    if isinstance(features, str):
        features = [features]

    tmp = df.groupby("customer_ID")[features].agg(agg_calcs)

    # Flatten (feature, aggregation) pairs; leave already-flat columns alone.
    if tmp.columns.nlevels > 1:
        tmp.columns = ['_'.join(map(str, col)) for col in tmp.columns]

    return tmp.reset_index()

---

# 8.0 Aggregating the Train Dataset.

## 8.1 Aggregating Numerical Variables

In [None]:
%%time
agg_calculations = ['mean', 'std', 'min', 'max', 'last',]
trn_num_agg = agg_data(trn_data, num_features_calc, agg_calculations)

In [None]:
%%time
# Join the labels on customer_ID instead of by row position, so each label is matched
# to its own customer by key and the key column is not duplicated in the result.
trn_agg_dataset = trn_num_agg.merge(trn_lbls, on = 'customer_ID', how = 'left')

In [None]:
%%time
del trn_num_agg, trn_lbls
gc.collect()

## 8.2 Aggregating Categorical Variables

In [None]:
%%time
agg_calculations = ['count', 'last', 'nunique']
trn_cat_agg = agg_data(trn_data, cat_variables_dataset, agg_calculations)

In [None]:
%%time
# trn_cat_agg comes from the same groupby on trn_data as the numeric aggregate, so rows
# are expected to line up by position; drop its key so this step does not add another
# customer_ID column.
trn_agg_dataset = pd.concat([trn_agg_dataset, trn_cat_agg.drop(columns = 'customer_ID')], axis = 1)

In [None]:
%%time
del trn_cat_agg
gc.collect()

## 8.3 Reviewing the Agg Dataset

In [None]:
%%time
trn_agg_dataset.head()

## 8.4 Destroying some of the datasets to release some memory

In [None]:
%%time
del trn_data
gc.collect()

## 8.5 Creating a Pickle file Backup

In [None]:
%%time
trn_agg_dataset.to_pickle('trn_agg_dataset.pkl', compression = 'gzip')

In [None]:
%%time
del trn_agg_dataset
gc.collect()

In [None]:
%reset -f

---

# 9.0 Loading Model Libraries One More Time (Due to Reset)...

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc as gc # garbage collector

# 10.0 Loading the Test Dataset Information (Using Feather)...

In [None]:
%%time
#tst_data = pd.read_feather('../input/parquet-files-amexdefault-prediction/test_data.ftr')

# 11.0 Structuring Data for the Model (Aggregations and More)

In [None]:
%%time
# Columns excluded from the feature list.
remove = ['customer_ID', 'S_2', 'target']
# NOTE(review): tst_data is only defined if the read_feather call in section 10.0 is
# uncommented; after the preceding %reset this line raises NameError as the notebook stands.
features = [f for f in tst_data.columns if f not in remove]

In [None]:
# Based on the dataset description, these are the Categorical Variables...
cat_variables_dataset = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [None]:
%%time
# Extract the names of categorical variables based on the number of unique values...
# Columns with fewer than `cutoff` distinct values are collected as categorical.
cutoff = 64
# NOTE(review): depends on tst_data, whose load in section 10.0 is currently commented out.
cat_variables_calc = [f for f in tst_data.columns if tst_data[f].nunique() < cutoff] 

In [None]:
%%time
num_features_calc = [col for col in features if col not in cat_variables_calc]

In [None]:
%%time
# Print the number of categorical variables identified
print(f'Categorical Variables Dataset:{len(cat_variables_dataset)}')
print(f'Categorical Variables Base on Calculations: {len(cat_variables_calc)}')

# 12.0 Defining an Aggregation function.

In [None]:
%%time
def agg_data(df, features, agg_calcs = None):
    '''
    Aggregate feature columns per customer into one row per customer_ID.

    Parameters
    ----------
    df : DataFrame containing a "customer_ID" column plus the feature columns.
    features : column name, or list of column names, to aggregate.
    agg_calcs : aggregation name, or list of aggregations, passed to
        GroupBy.agg. Defaults to ['mean'] when omitted.

    Returns
    -------
    DataFrame with "customer_ID" as a regular column and one column per
    (feature, aggregation) pair named '<feature>_<aggregation>'.
    '''

    # Use None as the default so no list object is shared between calls.
    if agg_calcs is None:
        agg_calcs = ['mean']
    elif isinstance(agg_calcs, str):
        # A bare string would produce single-level columns, and joining a
        # plain column name would split it into characters.
        agg_calcs = [agg_calcs]

    # Same reasoning for a single feature name: keep the result two-level.
    if isinstance(features, str):
        features = [features]

    tmp = df.groupby("customer_ID")[features].agg(agg_calcs)

    # Flatten (feature, aggregation) pairs; leave already-flat columns alone.
    if tmp.columns.nlevels > 1:
        tmp.columns = ['_'.join(map(str, col)) for col in tmp.columns]

    return tmp.reset_index()

---

# 13.0 Aggregating the Test Dataset.

## 13.1 Aggregating Numerical Variables

In [None]:
%%time
#agg_calculations = ['mean', 'std', 'min', 'max', 'last',]
#tst_num_agg = agg_data(tst_data, num_features_calc, agg_calculations)

In [None]:
%%time
#tst_agg_dataset = pd.concat([tst_num_agg, trn_lbls], axis = 1)

In [None]:
%%time
#del tst_num_agg
#gc.collect()

## 13.2 Aggregating Categorical Variables

In [None]:
%%time
#agg_calculations = ['count', 'last', 'nunique']
#tst_cat_agg = agg_data(tst_data, cat_variables_dataset, agg_calculations)

In [None]:
%%time
#tst_agg_dataset = pd.concat([tst_agg_dataset, tst_cat_agg], axis = 1)

In [None]:
%%time
#del tst_cat_agg
#gc.collect()

## 13.3 Reviewing the Agg Dataset

In [None]:
%%time
#tst_agg_dataset.head()

## 13.4 Destroying some of the datasets to release some memory

In [None]:
#del tst_data
#gc.collect()

## 13.5 Creating a Pickle file Backup

In [None]:
%%time
#tst_agg_dataset.to_pickle('tst_agg_dataset.pkl', compression = 'gzip')

In [None]:
#del tst_agg_dataset
#gc.collect()

In [None]:
%reset -f

---