# Split

In [13]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Suppress every warning for the rest of the session. Note that this is a blanket
# filter: deprecation and runtime warnings from the libraries above are hidden too.
warnings.filterwarnings('ignore')

# define functions

In [14]:
# During sampling, we check that each numeric variable of the sample is distributed like the
# original data, using the two-sample Kolmogorov-Smirnov test at a chosen significance level.

def get_sample(df, significance = 0.05, sample_size = 5000, iterations = 100, random_state = None):
    """Draw a random sample of ``df`` whose numeric columns resemble the full data.

    Up to ``iterations`` candidate samples are drawn. A candidate is accepted when,
    for every numeric column, the two-sample Kolmogorov-Smirnov test between the
    full column and the sampled column yields a p-value that is not below
    ``significance``. Missing values are dropped before testing, because NaNs would
    otherwise make the test result meaningless. Non-numeric columns are skipped:
    the KS test compares ordered numeric distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from.
    significance : float
        A column is considered to differ when its KS p-value is below this level.
    sample_size : int
        Number of rows to draw (without replacement).
    iterations : int
        Maximum number of candidate samples to try.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) making the search reproducible. ``None`` keeps the
        previous behaviour of relying on NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The first candidate sample that passes the check on every tested column.

    Raises
    ------
    ValueError
        If no candidate passes within ``iterations`` attempts (including
        ``iterations=0``).
    """
    # Imported locally so the function works no matter which notebook cells ran before it.
    from scipy.stats import ks_2samp

    # A single generator is shared by all attempts: re-seeding with the same integer on
    # every attempt would draw the identical sample each time.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Clean the full-data columns once instead of on every attempt. Columns are addressed
    # by position so duplicated column labels cannot cause ambiguity.
    reference = {
        pos: df.iloc[:, pos].dropna().to_numpy(dtype=float)
        for pos in range(df.shape[1])
        if pd.api.types.is_numeric_dtype(df.iloc[:, pos])
    }

    for attempt in range(iterations):
        sample = df.sample(sample_size, random_state=rng)
        representative = True
        for pos, full_values in reference.items():
            if full_values.size == 0:
                # Column is entirely missing in the source data: nothing to compare.
                continue
            sample_values = sample.iloc[:, pos].dropna().to_numpy(dtype=float)
            if sample_values.size == 0:
                # The sample lost every observed value of this column.
                representative = False
                break
            # Compare the unrounded p-value; rounding could push a value that is
            # slightly below the threshold onto it and wrongly accept the sample.
            if ks_2samp(full_values, sample_values).pvalue < significance:
                representative = False
                break
        if representative:
            print('found sample after {} iterations'.format(attempt+1) )
            return sample

    # Reaching this point means no candidate was accepted (or no attempt was allowed).
    raise ValueError("Could not build samples with {} iterations, significane={}, and sample_size={}"
                     .format(iterations,significance,sample_size))

# Define paths and capture data

In [15]:
# Every stage folder hangs off the shared ../data directory. Inputs and outputs
# deliberately point at the same intermediate stage; reports have their own folder.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '02_intermediate')
outputs = os.path.join(data_root, '02_intermediate')
reports = os.path.join(data_root, '06_reporting')

# Read the dataset, using the 'id' column as the row index.
data_file = os.path.join(inputs, 'data.csv')
data = pd.read_csv(data_file, index_col='id')

In [16]:
# Report the (rows, columns) of the loaded frame, then preview its first rows.
print('Dataset dimensions:', data.shape)
data.head()

Dataset dimensions: (10000, 54)


Unnamed: 0_level_0,y,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,...,d15,m1,m2,m3,m4,m5,m6,m7,m8,m9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3486774,0,13107389,38.056,C,9633,130.0,185.0,visa,138.0,debit,...,0.0,,,,M2,,,,,
3062695,0,1650884,150.0,R,15063,514.0,150.0,visa,226.0,credit,...,,,,,,,,,,
3273443,0,7048761,56.5,W,9006,555.0,143.0,mastercard,224.0,debit,...,2.0,1.0,1.0,1.0,M0,0.0,,0.0,0.0,1.0
3384445,0,10011292,8.459,C,11201,103.0,185.0,visa,226.0,debit,...,0.0,,,,M2,,,,,
3489059,0,13159069,77.95,W,7919,194.0,150.0,mastercard,166.0,debit,...,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0,0.0


# data sampling
If `sampling_data` is `True`, the project runs faster, but every subsequent step works on a sample of the data instead of the full dataset.

During sampling, we need to check that the distributions in the sample are similar to those of the original data, by setting a significance level and applying the Kolmogorov-Smirnov test. See the `get_sample` function defined at the beginning of the notebook.

In [17]:
from scipy.stats import ks_2samp

In [18]:
# Switch for the optional sampling step: when True, the following cell replaces
# `data` with the sample returned by get_sample.
sampling_data = False

In [19]:
# Optionally swap the full dataset for a sample validated by get_sample, then show
# the resulting dimensions. The flag is tested directly instead of being compared to True.
if sampling_data:
    data = get_sample(data, significance=0.05, sample_size=5000, iterations=10)
data.shape

(10000, 54)

# final description

In [20]:
# Preview the last rows of the (possibly sampled) dataset.
data.tail()

Unnamed: 0_level_0,y,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,...,d15,m1,m2,m3,m4,m5,m6,m7,m8,m9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3160187,0,3766034,310.0,W,12695,490.0,150.0,visa,226.0,debit,...,11.0,1.0,1.0,1.0,M1,1.0,1.0,1.0,1.0,1.0
3536048,0,14499172,50.0,R,8695,170.0,150.0,visa,226.0,credit,...,,,,,,,,,,
3306856,0,7947694,29.0,W,12582,298.0,150.0,visa,226.0,debit,...,74.0,1.0,1.0,1.0,M0,0.0,0.0,,,
3298800,0,7763089,20.301,C,15885,545.0,185.0,visa,138.0,debit,...,0.0,,,,M2,,,,,
3358202,0,9234794,134.95,W,6936,225.0,150.0,mastercard,224.0,debit,...,488.0,,,,,,1.0,,,


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,y,transactiondt,transactionamt,card1,card2,card3,card5,addr1,addr2,dist1,...,d14,d15,m1,m2,m3,m5,m6,m7,m8,m9
count,10000.0,10000.0,10000.0,10000.0,9842.0,9978.0,9934.0,8849.0,8849.0,4111.0,...,1036.0,8511.0,5476.0,5476.0,5476.0,4029.0,7126.0,4162.0,4162.0,4162.0
mean,0.0363,7378482.0,137.002317,9879.69,361.055375,153.33614,198.852829,289.112668,86.820093,118.68037,...,54.573359,164.542004,0.999817,0.892805,0.776662,0.441301,0.4659,0.129265,0.354877,0.840221
std,0.187045,4599073.0,247.033617,4917.656486,158.014321,11.351212,41.632274,101.642908,2.41921,363.370928,...,130.815766,203.495484,0.013514,0.309389,0.416521,0.496604,0.498871,0.335533,0.478534,0.366445
min,0.0,88410.0,0.272,1001.0,100.0,100.0,100.0,110.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3030122.0,42.95,6019.0,206.0,150.0,166.0,204.0,87.0,3.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,7327499.0,67.95,9633.0,361.0,150.0,226.0,299.0,87.0,9.0,...,0.0,55.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,11234400.0,122.0,14276.0,512.0,150.0,226.0,330.0,87.0,25.0,...,1.0,313.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,1.0,15810120.0,4463.95,18390.0,600.0,226.0,237.0,536.0,96.0,4966.0,...,730.0,807.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split data
The appropriate `test_size` depends on the size of the data. For instance, with 1 million entries, `test_size=0.1` would work fine.

In [22]:
# Predictors: every column except the target.
X = data.drop(columns='y')
print('dimensions of X:', X.shape)

# Target: the 'y' column, cast to float in the same step.
y = data['y'].astype('float')
print('dimensions of y:', y.shape)

dimensions of X: (10000, 53)
dimensions of y: (10000,)


In [23]:
# Hold out 30% of the rows for testing. Shuffling is disabled, so the rows are not
# reordered before being split.
# NOTE(review): random_state presumably has no effect while shuffle=False - confirm
# against the scikit-learn documentation before relying on it for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

# Echo the shape of each partition as a quick sanity check.
print('dimensions of X_train:', X_train.shape)
print('dimensions of y_train:', y_train.shape)
print('dimensions of X_test:', X_test.shape)
print('dimensions of y_test:', y_test.shape)

dimensions of X_train: (7000, 53)
dimensions of y_train: (7000,)
dimensions of X_test: (3000, 53)
dimensions of y_test: (3000,)


# save train and test sets

In [24]:
# Wrap each target in a single-column frame labelled 'y' before saving.
y_train = pd.DataFrame(y_train, columns=['y'])
y_test = pd.DataFrame(y_test, columns=['y'])

# Write the four partitions to the output folder, in the same order as before.
for frame, filename in (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    frame.to_csv(os.path.join(outputs, filename))