# <span style="color:DarkCyan">Incremental Principal Component Analysis (IPCA)</span> | American Express - Default Prediction

Thank you for viewing my notebook; I hope you enjoy it 📊<br>
Please don't hesitate to leave feedback 😉

# Overview

<span style="font-size:22px"><span style="color:DarkCyan">Incremental Principal Component Analysis (IPCA)</span> is an alternative to standard principal component analysis (PCA).<br>
<span style="color:DarkCyan">IPCA</span> allows us to decompose large datasets that do not fit in memory and therefore cannot be fitted with ordinary PCA.<br><br>
The original dataset is 16 GB in size, so we read it in chunks and fit the IPCA one chunk at a time.<br><br></span>

In [None]:
import numpy as np
import pandas as pd
import os
import pickle as pk
from sklearn.decomposition import IncrementalPCA
import matplotlib.pyplot as plt

## Define dtypes for the pandas DataFrame

In [None]:
# Locate the training CSV and load a 100,000-row sample; the sample is used
# only to infer column dtypes and is deleted again below.
path = '/kaggle/input/amex-default-prediction'
train_path = os.path.join(path, 'train_data.csv')
train_df = pd.read_csv(train_path, nrows=100_000)

# Columns inferred as integer in the sample are later read as bool,
# columns inferred as float are later read as float32.
bools = train_df.select_dtypes(include=[int])
floats = train_df.select_dtypes(include=[float])
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
# Columns fed to the PCA: every column except the categorical ones,
# 'customer_ID' and 'S_2'.
pca_cols = set(train_df.columns) - set(categorical_cols) - {'customer_ID', 'S_2'}
del train_df

# Build the column -> dtype mapping passed to pd.read_csv.
# Iterating a DataFrame yields its column labels, so each comprehension maps
# every column of the selection (the previous zip()-based version sized its
# value list by the number of rows and relied on zip truncation).
# Update order matters: categorical columns that were inferred as float or
# int are overridden with 'category' last.
dtypes = {col: np.float32 for col in floats}
dtypes.update({col: bool for col in bools})
dtypes.update({col: 'category' for col in categorical_cols})

## Create pandas DataFrame iterator - read data in chunks

In [None]:
# Stream the training CSV lazily instead of loading it at once: iterating
# `df` yields DataFrames of at most 100 rows that contain only the columns
# in `pca_cols`, parsed with the `dtypes` mapping.
# NOTE(review): IncrementalPCA is created without n_components, and the
# scikit-learn docs state it then defaults to min(n_samples, n_features) of
# the data it is fitted on — a 100-row first chunk may therefore cap the
# number of components at 100. Confirm this chunk size is intended.
df = pd.read_csv(
    train_path,
    usecols=pca_cols,
    dtype=dtypes,
    chunksize=100,
    iterator=True,
)

## Initialize Incremental PCA

In [None]:
# Incremental PCA estimator; n_components is left at its default.
# NOTE(review): per the scikit-learn docs, batch_size is only used by fit(),
# while this notebook trains through partial_fit() — confirm it is still wanted.
ipca = IncrementalPCA(batch_size=10)

## Fit Incremental PCA with chunks

In [None]:
# Update the estimator with one chunk of the CSV reader at a time.
# Missing values in each chunk are replaced with 0 before it is fitted.
for chunk in df:
    ipca.partial_fit(X=chunk.fillna(value=0))

## Visualize results: Explained Variance Ratio

In [None]:
# Running total of the per-component explained-variance ratios.
cum_expl_var_ratio = ipca.explained_variance_ratio_.cumsum()

In [None]:
threshold = 0.99  # Cumulative explained-variance ratio the retained components must exceed

In [None]:
# Cumulative explained-variance ratio: entry i covers components 0..i.
cum_var = np.cumsum(ipca.explained_variance_ratio_)

# How many leading PCs are needed for the cumulative variance to exceed
# `threshold`? `k` is the 0-based index of the first component at which that
# happens, so the number of components is k + 1.
exceeds = cum_var > threshold
if exceeds.any():
    k = np.argmax(exceeds)
    print(f'Number of components explaining {threshold:.0%} variance: {k + 1}')
else:
    # np.argmax returns 0 for an all-False array, which would wrongly report
    # that the first component alone suffices; fall back to the last index.
    k = len(cum_var) - 1
    print(f'Cumulative explained variance never exceeds {threshold:.0%}; '
          f'all {k + 1} components are kept')
print('\n')

plt.figure(figsize=[10,5])
plt.title('Cumulative Explained Variance explained by the components')
plt.ylabel('Cumulative Explained Variance')
plt.xlabel('Principal components')
# The curve is plotted against 0-based component indices, so x=k marks the
# component at which the threshold is crossed.
plt.axvline(x=k, color="k", linestyle="--")
plt.axhline(y=threshold, color="r", linestyle="--")
ax = plt.plot(cum_var)

In [None]:
# Bar chart of the individual explained-variance ratio of the leading
# components. `k` is the 0-based index of the component at which the
# cumulative variance crosses the threshold, so k + 1 bars are drawn to
# include that component as well.
plt.figure(figsize=(12, 4))
plt.bar(range(k + 1), ipca.explained_variance_ratio_[:k + 1], alpha=0.5, align='center',
        label='individual explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()

## Save Incremental PCA

In [None]:
# Persist the fitted estimator to "pca.pkl". The context manager ensures the
# file is flushed and closed even if pickling raises.
with open("pca.pkl", "wb") as pca_file:
    pk.dump(ipca, pca_file)