# Cross-Validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load Data

In [2]:
# Read the raw CSV into `df`; every later cell in this notebook works off this frame.
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

### Data Pre-processing

In [7]:
# Column names, dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
# Preview the first rows to sanity-check the parsed values.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Cross Validation

In [8]:
# Numeric (float64) columns that are selected as the model inputs `X` further down.
features = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest'
]

### Train-Test-Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Feature matrix and binary target taken from the loaded frame.
X, y = df[features], df['isFraud']

# Single random hold-out: 33% of the rows go to the test set; the fixed
# random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [11]:
# Display the training features (pandas truncates the rendering of large frames).
X_train

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
1186641,186994.49,4980.00,191974.49,2565463.66,2225790.42
1199598,86377.61,31480.00,0.00,0.00,86377.61
1185636,215489.19,21518.00,0.00,6345756.55,6794954.89
6211424,697319.21,25493.00,0.00,2150239.95,2847559.16
4452703,59243.58,29879.00,0.00,0.00,0.00
...,...,...,...,...,...
1570006,129715.85,5054252.83,5183968.68,246692.94,116977.09
2234489,2459.70,0.00,0.00,0.00,0.00
4926484,10579.16,59279.00,48699.84,322754.16,333333.32
4304572,73020.76,20289.00,0.00,256102.84,329123.61


### K-Fold Cross Validation

In [12]:
from sklearn.model_selection import KFold

In [13]:
# Two folds, no shuffling requested; the split output below shows contiguous index blocks.
kf = KFold(n_splits=2)
# Number of splitting iterations the splitter will produce (2 here).
kf.get_n_splits(X)

2

In [32]:
# Maps fold number (1-based) -> (train rows, test rows) taken from `df`.
folds = {}

# enumerate() supplies the running fold number. Initialising the counter inside
# the loop body would reset it on every iteration, so each fold would be stored
# under key 1 and only the last split would survive in `folds`.
for fold_num, (train, test) in enumerate(kf.split(X), start=1):
    # kf.split yields positional indices, hence .iloc
    fold_train, fold_test = df.iloc[train], df.iloc[test]

    ## Saving the fold
    folds[fold_num] = (fold_train, fold_test)
    print('train: %s, test: %s' % (fold_train, fold_test))

train:          step      type      amount     nameOrig  oldbalanceOrg  \
3181310   239  CASH_OUT    64425.92   C846324167           0.00   
3181311   239  CASH_OUT   135675.56   C558686155           0.00   
3181312   239  CASH_OUT    80869.52   C181219207           0.00   
3181313   239  CASH_OUT    51326.53  C1118966972           0.00   
3181314   239   PAYMENT     1827.59  C2146482932       10916.00   
...       ...       ...         ...          ...            ...   
6362615   743  CASH_OUT   339682.13   C786484425      339682.13   
6362616   743  TRANSFER  6311409.28  C1529008245     6311409.28   
6362617   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
6362618   743  TRANSFER   850002.52  C1685995037      850002.52   
6362619   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
3181310            0.00  C2127689854       541170.89       605596.80        0   
3181311            0.00  C

### Time Series Cross Validation

In [6]:
import random
import time
    
def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
    """Draw ``n`` random points in time between ``start`` and ``end``.

    ``start.value`` / ``end.value`` are Unix timestamps in nanoseconds. They are
    floor-divided down to the sampling resolution, integers are drawn with
    ``np.random.randint`` (NumPy's global RNG; upper bound exclusive), and the
    integers are converted back with ``pd.to_datetime`` using the matching unit.

    Parameters
    ----------
    start, end : objects exposing ``.value`` in nanoseconds (e.g. ``pd.Timestamp``)
        Bounds of the sampling window.
    out_format : str, default 'datetime'
        ``'datetime'`` samples at one-second resolution; any other value
        samples at whole-day resolution.
    n : int, default 10
        Number of values to draw.

    Returns
    -------
    The result of ``pd.to_datetime`` applied to the ``n`` sampled integers.
    """
    if out_format == 'datetime':
        # nanoseconds per second
        ns_per_unit, unit = 10**9, 's'
    else:
        # nanoseconds per day
        ns_per_unit, unit = 24 * 60 * 60 * 10**9, 'D'

    low = start.value // ns_per_unit
    high = end.value // ns_per_unit

    samples = np.random.randint(low, high, n)
    return pd.to_datetime(samples, unit=unit)

In [48]:
# The loaded frame has no calendar-date column, so a synthetic one is attached:
# one random timestamp per row, drawn between `start` and `end`.
start = pd.to_datetime('2021-01-01')
end = pd.to_datetime('2022-09-01')

# random_datetimes_or_dates draws from NumPy's global RNG; seed it so the
# generated dates (and every date-based split built on them) are reproducible
# after a kernel restart.
np.random.seed(42)
dates_vals = random_datetimes_or_dates(start, end, out_format='datetime', n=len(df))

# Keep only the calendar day. DatetimeIndex.date yields an array of
# datetime.date objects, the same values a row-wise `.apply(lambda x: x.date())`
# would produce, without the per-row Python call.
df['date'] = dates_vals.date

### Date Split

In [46]:
DATE = '2021-12-31'

# df['date'] stores datetime.date objects; comparing those directly with a str
# raises TypeError. Parse both sides to pandas datetimes before comparing.
row_dates = pd.to_datetime(df['date'])
cutoff = pd.Timestamp(DATE)

# Rows strictly before the cutoff train the model; the cutoff day onwards is held out.
train_df = df[row_dates < cutoff].copy()
test_df = df[row_dates >= cutoff].copy()

### Time Series KFold

In [60]:
from sklearn.model_selection import TimeSeriesSplit

# Default TimeSeriesSplit configuration; the output below shows five splits in
# which the training block grows and the test block is the chunk right after it.
tscv = TimeSeriesSplit()

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional indices -> rows. Note these names overwrite the earlier
    # train_test_split results and hold the last split once the loop ends.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

TRAIN: [      0       1       2 ... 1060437 1060438 1060439] TEST: [1060440 1060441 1060442 ... 2120873 2120874 2120875]
TRAIN: [      0       1       2 ... 2120873 2120874 2120875] TEST: [2120876 2120877 2120878 ... 3181309 3181310 3181311]
TRAIN: [      0       1       2 ... 3181309 3181310 3181311] TEST: [3181312 3181313 3181314 ... 4241745 4241746 4241747]
TRAIN: [      0       1       2 ... 4241745 4241746 4241747] TEST: [4241748 4241749 4241750 ... 5302181 5302182 5302183]
TRAIN: [      0       1       2 ... 5302181 5302182 5302183] TEST: [5302184 5302185 5302186 ... 6362617 6362618 6362619]


### Expanding Window

In [3]:
class ExpandingWindowCV:
    """Expanding-window splitter driven by a date column.

    For every cutoff date, ``split`` yields the rows dated on or before the
    cutoff as the training frame and the rows dated after it as the test frame.
    Dates are compared as strings, so the column's string form and the cutoffs
    are expected to be in a lexicographically sortable format such as
    ``YYYY-MM-DD``.

    Parameters
    ----------
    date_col : str
        Name of the column holding the dates.
    date_range : sequence of two date-likes, optional
        ``(start, end)``; one cutoff is generated per day via ``pd.date_range``.
    custom_range : iterable of str, optional
        Explicit cutoff strings, used in the order given.

    If neither ``date_range`` nor ``custom_range`` is given, every distinct
    date present in the column is used as a cutoff, in ascending order.

    Raises
    ------
    ValueError
        If both ``date_range`` and ``custom_range`` are supplied.
    """

    def __init__(self, date_col, date_range = None, custom_range = None):
        # The two ways of specifying cutoffs are mutually exclusive.
        if date_range is not None and custom_range is not None:
            raise ValueError("Specify only one of date_range and custom_range, not both.")

        self.date_col = date_col
        self.date_range = date_range
        self.custom_range = custom_range

    def split(self, df):
        """Yield ``(df_train, df_test)`` copies of ``df`` for each cutoff date."""
        # Stringify the date column once instead of twice per fold.
        date_strs = df[self.date_col].astype(str)

        if self.custom_range is not None:
            dates = self.custom_range
        elif self.date_range is not None:
            dates = pd.date_range(start=self.date_range[0], end=self.date_range[1])
            dates = [str(d.date()) for d in dates]
        else:
            # A set has no defined order; sort so the window expands
            # chronologically from one fold to the next.
            dates = sorted(set(date_strs))

        for d in dates:
            df_train = df[date_strs <= d].copy()
            df_test = df[date_strs > d].copy()
            yield df_train, df_test

In [83]:
# ExpandingWindowCV takes its configuration in the constructor (date_col is
# required) and has no fit() method.
ew = ExpandingWindowCV(date_col = 'date', date_range = ['2022-01-02','2022-01-08'])

# split() is a generator: nothing is computed until it is iterated, e.g.
# `for df_train, df_test in ew.split(df): ...`
ew.split(df)

<generator object ExpandingWindow.split at 0x1601572e0>

### Monte Carlo Cross Validation

In [7]:
from sklearn.model_selection import ShuffleSplit

# Monte Carlo CV: 5 random train/test partitions with 25% of the rows held out
# each time; random_state pins the shuffles.
rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
rs.get_n_splits(df)  # result is not displayed: it is not the cell's last expression

# Only the positional indices are printed; use df.iloc[...] to materialise a split.
for train_index, test_index in rs.split(df):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [2709048 3571212 3854856 ... 2215104 1484405 4500015] TEST: [4644207 3800666 4426240 ... 1073982 2701914 4267490]
TRAIN: [2330462 3787202 5883603 ... 1223827 5080219 4649937] TEST: [1041639 2927469 5636415 ... 5408133 2161008 3408050]
TRAIN: [4920355 4264750 2193193 ... 2632093 3621905  962318] TEST: [ 581011 1549467 3755681 ... 2072810 3746000  479231]
TRAIN: [1586464  469627 2243918 ... 5925382 1533923 4925594] TEST: [2971947 2049983 5821566 ... 1072730 2843610 5701307]
TRAIN: [5353038 3842368 4444265 ... 2799028  632353 5982630] TEST: [2081746 1395342  307074 ... 5615815 2871877 6079963]
