In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the pickled transactions DataFrame; compression="infer" lets pandas pick
# the decompression codec from the file extension.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# its source is trusted.
df = pd.read_pickle('data/transactions.pkl.zip', compression="infer")
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False


In [10]:
# (rows, columns) of the loaded transactions frame.
df.shape

(786363, 29)

In [6]:
# Class balance of the target column, expressed as percentages with two decimals.
# Scale to percent first and round last: rounding the fraction and then
# multiplying by 100 can reintroduce floating-point noise into the displayed
# values (e.g. 0.57 * 100 -> 56.99999999999999).
expected_ratio = (
    df['isFraud'].value_counts(normalize=True) * 100
).round(2)
print(expected_ratio)

isFraud
False    98.42
True      1.58
Name: proportion, dtype: float64


In [7]:
# Turn the class-percentage Series into a one-column table labelled 'Expected',
# indexed by the isFraud class, and display it.
ratio_df = expected_ratio.to_frame(name='Expected')
ratio_df

Unnamed: 0_level_0,Expected
isFraud,Unnamed: 1_level_1
False,98.42
True,1.58


## Simple Random Sampling
Collect 5 random rows of Fraud-classified observations and 10 random rows of non-Fraud-classified observations.

In [27]:
# Fixed seed so "Restart Kernel -> Run All" draws the same rows every time.
SEED = 42
N_FRAUD = 5       # number of fraud rows to draw
N_NONFRAUD = 10   # number of non-fraud rows to draw

# Sample each class separately with a fixed row count, then stack the two draws
# (fraud rows first) into one frame that keeps the original row labels.
fraud_df = df[df["isFraud"].eq(True)].sample(n=N_FRAUD, random_state=SEED)
nonfraud_df = df[df["isFraud"].eq(False)].sample(n=N_NONFRAUD, random_state=SEED)
sample_df = pd.concat([fraud_df, nonfraud_df], axis=0)
sample_df

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
669566,527298269,527298269,15000.0,11845.01,2016-03-26T00:28:39,0.0,Starbucks #910811,US,US,9,...,,3154.99,,,,False,,,False,True
431869,419709514,419709514,5000.0,784.51,2016-11-16T01:20:39,361.23,ebay.com,US,US,2,...,,4215.49,,,,False,,,False,True
622607,588383631,588383631,5000.0,3295.4,2016-01-04T08:45:53,191.2,cheapfast.com,US,US,9,...,,1704.6,,,,False,,,False,True
544331,948004238,948004238,50000.0,10130.05,2016-10-14T12:33:02,452.28,apple.com,US,US,2,...,,39869.95,,,,False,,,False,True
234053,693502825,693502825,2500.0,148.11,2016-06-06T11:10:45,40.28,Convenient Auto Services,US,US,5,...,,2351.89,,,,True,,,False,True
402581,727645439,727645439,5000.0,2852.54,2016-07-19T06:00:14,454.78,Boston Cafe #105215,US,US,5,...,,2147.46,,,,True,,,False,False
592838,826422111,826422111,5000.0,3593.32,2016-05-13T07:56:30,34.09,Shell Gas #786931,US,US,9,...,,1406.68,,,,True,,,False,False
626679,247130864,247130864,5000.0,945.42,2016-02-15T13:19:34,136.92,Uber,US,US,9,...,,4054.58,,,,False,,,False,False
471686,454870319,454870319,20000.0,6827.04,2016-05-08T20:49:57,82.84,Five Guys #15552,US,US,5,...,,13172.96,,,,True,,,False,False
473451,102204865,102204865,2500.0,2155.07,2016-11-04T08:22:17,45.84,Renaissance Hotel #151808,US,US,9,...,,344.93,,,,True,,,False,False


In [29]:
# NOTE(review): presumably a one-off export of the sample as a test fixture.
# Kept disabled so re-running the notebook does not overwrite
# tests/sample_df.csv; uncomment deliberately to (re)write the file.
# sample_df.to_csv("tests/sample_df.csv", index=False)

## Try Stratified Sampling
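Proportional stratified sampling does not work well here because the dataset is too imbalanced: fraud makes up only about 1.58% of the rows, so a small proportional sample contains almost no fraud cases (39 non-fraud rows vs. 1 fraud row below).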

In [24]:
# Fixed seed so "Restart Kernel -> Run All" draws the same rows every time.
SEED = 42
SAMPLE_FRAC = 0.00005  # fraction of each isFraud class to draw

# Proportional stratified sample: draw the same fraction from every isFraud class.
# GroupBy.sample keeps the original row index and all columns (including
# 'isFraud'), so no droplevel() is needed. It also avoids
# groupby(...).apply(lambda ...), whose handling of the grouping column is
# deprecated in recent pandas releases.
stratified_sample = df.groupby('isFraud').sample(frac=SAMPLE_FRAC, random_state=SEED)

# Rows drawn per class — shows how few fraud rows a proportional sample yields.
print(stratified_sample["isFraud"].value_counts())
stratified_sample

isFraud
False    39
True      1
Name: count, dtype: int64


Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
33677,345663377,345663377,15000.0,12307.6,2016-05-11T00:11:44,330.29,Fresh Flowers,US,US,5,...,,2692.4,,,,False,,,False,False
522949,530741979,530741979,10000.0,3460.42,2016-07-11T22:21:15,0.0,AMC #191138,US,US,5,...,,6539.58,,,,False,,,False,False
166764,212548902,212548902,15000.0,14754.23,2016-01-18T01:20:48,7.23,Apple iTunes,US,US,9,...,,245.77,,,,False,,,False,False
164920,240966299,240966299,15000.0,9369.86,2016-08-15T02:09:52,20.76,netflix.com,US,US,5,...,,5630.14,,,,False,,,False,False
344890,211726796,211726796,5000.0,1362.28,2016-10-23T19:35:03,432.18,In-N-Out #65601,US,US,2,...,,3637.72,,,,True,,,False,False
376133,165980878,165980878,15000.0,14262.97,2016-02-13T14:17:27,217.88,Krispy Kreme #340971,US,US,5,...,,737.03,,,,True,,,False,False
618147,152973583,152973583,15000.0,12532.61,2016-11-23T20:30:57,95.38,Shake Shack #62182,US,US,5,...,,2467.39,,,,True,,,False,False
112881,952302633,952302633,2500.0,1284.1,2016-05-02T04:12:08,68.73,Universe Massage #357768,US,US,9,...,,1215.9,,,,True,,,False,False
296890,253508360,253508360,5000.0,4018.57,2016-07-20T09:27:42,748.31,South Steakhouse #796913,US,US,5,...,,981.43,,,,True,,,False,False
231063,601827864,601827864,7500.0,635.36,2016-08-07T08:02:14,47.74,EZ Putt Putt #804489,US,US,2,...,,6864.64,,,,True,,,False,False
