Load necessary libraries and set data path

In [1]:
import numpy as np
import cudf, cuml, cuxfilter
from utils import label_dist

DATA_PATH = './data/'

Read from file and explore

In [2]:
df = cudf.read_csv(DATA_PATH+'creditcard.csv',dtype=np.float64)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [3]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.725702,0.62446,0.605647,0.521278,0.482227,0.403632,0.330083,250.120109,0.041527
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,54201.5,-0.920373,-0.59855,-0.890365,-0.84864,-0.691597,-0.768296,-0.554076,-0.20863,-0.643098,...,-0.228395,-0.54235,-0.161846,-0.354586,-0.317145,-0.326984,-0.07084,-0.05296,5.6,0.0
50%,84692.0,0.018109,0.065486,0.179846,-0.019847,-0.054336,-0.274187,0.040103,0.022358,-0.051429,...,-0.02945,0.006782,-0.011193,0.040976,0.016594,-0.052139,0.001342,0.011244,22.0,0.0
75%,139320.5,1.315642,0.803724,1.027196,0.743341,0.611926,0.398565,0.570436,0.327346,0.597139,...,0.186377,0.528554,0.147642,0.439527,0.350716,0.240952,0.091045,0.07828,77.165,0.0
max,172792.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,...,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0


Check for null values

In [4]:
null_values = df.isnull().sum().sum()
print("Count of null values:", null_values)

Count of null values: 0


In [5]:
label_dist(df)

cux_df = cuxfilter.DataFrame.from_dataframe(df)
bar_chart = cuxfilter.charts.bar('Class', 'Class', add_interaction=False, width=800, height=500)
d = cux_df.dashboard([bar_chart])
bar_chart.view()

Non-fraud count:284315, percentage:0.9983
Fraud count:492, percentage:0.0017


Standartize amount(pca should have centered data before using svd) and drop time

In [6]:
scaler = cuml.preprocessing.StandardScaler()
df['amount_scaled'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))

df.drop(['Time', 'Amount'], axis=1, inplace=True)

In [7]:
df['amount_scaled'].describe()

count    284807.000000
mean          0.000000
std           1.000002
min          -0.353229
25%          -0.330840
50%          -0.265271
75%          -0.044717
max         102.362243
Name: amount_scaled, dtype: float64

Split data into 70/10/20 train/valid/test sets

In [8]:
X_train, X_test, y_train, y_test = cuml.model_selection.train_test_split(df, 'Class')
train_df = cudf.concat([X_train,y_train],axis=1)
X_train, X_valid, y_train, y_valid = cuml.model_selection.train_test_split(train_df, 'Class', train_size = 0.875)

Check to see if proportion of fraud/non-fraud stays the same

In [11]:
train_df = cudf.concat([X_train,y_train],axis=1)
valid_df = cudf.concat([X_valid,y_valid],axis=1)
test_df = cudf.concat([X_test,y_test],axis=1)

label_dist(train_df)
label_dist(valid_df)
label_dist(test_df)

print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

Non-fraud count:186588, percentage:0.9983
Fraud count:316, percentage:0.0017
Non-fraud count:26656, percentage:0.9983
Fraud count:45, percentage:0.0017
Non-fraud count:71071, percentage:0.9982
Fraud count:131, percentage:0.0018
(186904, 30)
(26701, 30)
(71202, 30)


Write sets to files

In [10]:
train_df.to_parquet(DATA_PATH+'train.parquet',index=True)
valid_df.to_parquet(DATA_PATH+'valid.parquet',index=True)
test_df.to_parquet(DATA_PATH+'test.parquet',index=True)