In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import autocorrelation_plot
import statsmodels
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from tqdm import tqdm

  from pandas.core import datetools


In [2]:
data = pd.read_csv('Paysim.csv')

In [3]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
len(data)

6362620

In [5]:
cu_data = data.iloc[3300000:, :]

In [6]:
cu_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3300000,252,CASH_OUT,96509.39,C1865620108,0.0,0.0,C1755326136,7222336.86,7318846.25,0,0
3300001,252,CASH_OUT,151609.48,C1161252418,0.0,0.0,C1255989746,4099567.9,4251177.38,0,0
3300002,252,CASH_OUT,260755.25,C142980523,0.0,0.0,C1580855586,2848164.42,3108919.66,0,0
3300003,252,CASH_OUT,169955.81,C689336560,0.0,0.0,C714841497,1914122.53,2084078.34,0,0
3300004,252,CASH_OUT,179762.23,C1577510948,0.0,0.0,C241591244,712939.7,892701.93,0,0


# Sanity check

In [7]:
cu_data.isnull().any()

step              False
type              False
amount            False
nameOrig          False
oldbalanceOrg     False
newbalanceOrig    False
nameDest          False
oldbalanceDest    False
newbalanceDest    False
isFraud           False
isFlaggedFraud    False
dtype: bool

In [8]:
tmp = cu_data.groupby('isFraud', as_index = False).count()
tmp

Unnamed: 0,isFraud,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,0,3057260,3057260,3057260,3057260,3057260,3057260,3057260,3057260,3057260,3057260
1,1,5360,5360,5360,5360,5360,5360,5360,5360,5360,5360


In [9]:
tmp['step'][1]/ tmp['step'][0]

0.0017532038491983018

# Simulation

### Bootstrap

In [10]:
fraud_data = cu_data[cu_data['isFraud'] == 1]
non_fraud_data = cu_data[cu_data['isFraud'] == 0]
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3320846,253,TRANSFER,690510.31,C1283193575,690510.31,0.0,C1577097476,0.0,0.0,1,0
3320847,253,CASH_OUT,690510.31,C1587643191,690510.31,0.0,C1346914355,53206.61,743716.92,1,0
3328118,253,TRANSFER,157628.35,C787232450,157628.35,0.0,C257156867,0.0,0.0,1,0
3328119,253,CASH_OUT,157628.35,C418787765,157628.35,0.0,C492603644,17051087.32,17208715.67,1,0
3339185,253,TRANSFER,168622.3,C1044222858,168622.3,0.0,C2146789865,0.0,0.0,1,0


In [11]:
print(len(fraud_data), fraud_data['nameOrig'].nunique())

5360 5360


In [13]:
print(len(non_fraud_data), non_fraud_data['nameOrig'].nunique())

3057260 3055149


In [12]:
# first, keep all the names in the original dataset that are not unique
tmp = non_fraud_data
ids = tmp['nameOrig']
keep = tmp[ids.isin(ids[ids.duplicated()])]

In [13]:
print(len(keep))
keep.head()

4221


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3300975,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0
3301327,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0
3302368,252,CASH_IN,104304.29,C289706812,6453.0,110757.29,C1149496263,0.0,0.0,0,0
3302637,252,CASH_OUT,202971.84,C1180617355,0.0,0.0,C1411872125,5908353.24,6111325.08,0,0
3302968,252,PAYMENT,9956.19,C48851076,138092.49,128136.3,M2128999421,0.0,0.0,0,0


In [14]:
keep.head(50)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3300975,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0
3301327,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0
3302368,252,CASH_IN,104304.29,C289706812,6453.0,110757.29,C1149496263,0.0,0.0,0,0
3302637,252,CASH_OUT,202971.84,C1180617355,0.0,0.0,C1411872125,5908353.24,6111325.08,0,0
3302968,252,PAYMENT,9956.19,C48851076,138092.49,128136.3,M2128999421,0.0,0.0,0,0
3305065,252,PAYMENT,7478.11,C1180173859,0.0,0.0,M60906360,0.0,0.0,0,0
3305962,252,CASH_OUT,24518.85,C648568063,951.0,0.0,C1837558204,1368054.75,1392573.6,0,0
3306004,252,CASH_IN,143083.39,C276386995,2567938.49,2711021.88,C37957335,2658301.97,2515218.58,0,0
3306333,252,PAYMENT,11002.05,C1727941582,82637.0,71634.95,M574418015,0.0,0.0,0,0
3306508,252,PAYMENT,13727.24,C1914256039,40397.0,26669.76,M1594323014,0.0,0.0,0,0


### keep a ratio of fraud and non fraud to be 1: 100 

In [16]:
# second, keep part of the non fraud data randomly with a reasonable ratio 1:100
size = len(fraud_data)*100 - len(keep)
#randindex = np.random.randint(3300000, size = size)
randindex = np.random.randint(3300000, high = 6362619,size = size)
keep2 = data.iloc[randindex, :]
keep2.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3760114,279,CASH_IN,216379.84,C1264380114,356415.04,572794.88,C13103044,1303928.78,1087548.94,0,0
4483716,324,CASH_OUT,52277.4,C1823846327,0.0,0.0,C319738216,718513.52,770790.92,0,0
3845432,282,CASH_IN,57876.78,C1153323012,122740.0,180616.78,C651306145,0.0,0.0,0,0
6199888,574,PAYMENT,3873.37,C1934701593,82860.57,78987.2,M971814,0.0,0.0,0,0
5577365,393,CASH_IN,62641.91,C1821257693,10540.0,73181.91,C1581641742,70144.38,7502.47,0,0


In [17]:
df_base = pd.concat([keep, keep2, fraud_data], ignore_index = True)
print(len(df_base))
df_base.head(2)

541360


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0
1,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0


In [16]:
# df_base.sample(frac = 0.01, replace = True)

In [18]:
np.random.rand(2) +1

array([1.48153603, 1.29599653])

In [19]:
# third, randomly sample from the base dataframe, to simulate more transactions for each person
for _ in tqdm(range(100)):
    tmp = df_base.sample(frac = 0.05, replace = True)
    # create random change to the amount, oldbalance, new balance, oldbalanceDest newbalanceDest
    multiplier = np.random.rand(len(tmp)) + 1
    perterbation = np.random.rand(len(tmp))*100
    
    tmp['amount'] = multiplier * tmp['amount'] + perterbation
    tmp['oldbalanceOrg'] = multiplier * tmp['oldbalanceOrg'] + perterbation
    tmp['newbalanceOrig'] = multiplier * tmp['newbalanceOrig'] + perterbation
    tmp['oldbalanceDest'] = multiplier * tmp['oldbalanceDest'] + perterbation
    tmp['newbalanceDest'] = multiplier * tmp['newbalanceDest'] + perterbation
    # 
    df_base = df_base.append(tmp)

100%|██████████| 100/100 [14:51<00:00,  8.91s/it]


In [20]:
print(len(df_base))
df_base.head()

71189353


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0
1,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0
2,252,CASH_IN,104304.29,C289706812,6453.0,110757.29,C1149496263,0.0,0.0,0,0
3,252,CASH_OUT,202971.84,C1180617355,0.0,0.0,C1411872125,5908353.24,6111325.08,0,0
4,252,PAYMENT,9956.19,C48851076,138092.49,128136.3,M2128999421,0.0,0.0,0,0


In [21]:
# percentage of names that correspond to only 1 transaction
df_base['nameOrig'].nunique()/ len(df_base)

0.006936346225818347

In [27]:
'''
HERE IS A DEMOSTRATION OF MULTIPLE TRANSACTIONS FROM THIS PERSON, THE VALUES ARE RANDOMIZED
'''
tmp_Demo = df_base[df_base['nameOrig'] == 'C1880614464']
print('number of transactions from C1880614464', len(tmp_Demo))
tmp_Demo.head()

number of transactions from C1880614464 1947


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
89821,377,TRANSFER,68881.11,C1880614464,107.0,0.0,C1271421975,42030.38,110911.49,0,0
491991,377,TRANSFER,68881.11,C1880614464,107.0,0.0,C1271421975,42030.38,110911.49,0,0
89821,377,TRANSFER,90979.719357,C1880614464,206.779993,65.553818,C1271421975,55540.224957,146454.390496,0,0
89821,377,TRANSFER,115863.59907,C1880614464,354.329296,174.617892,C1271421975,70766.567423,186455.5486,0,0
89821,377,TRANSFER,109127.714482,C1880614464,252.628436,83.238614,C1271421975,66620.795248,175665.271116,0,0


In [22]:
df_base.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
217827,355,CASH_IN,1328042.0,C1113159414,17923.12,1345417.0,C963817305,548.1525,548.1525,0,0
69462,259,DEBIT,46477.64,C514651022,9967.104,905.4062,C810609494,1669160.0,1714733.0,0,0
118898,406,PAYMENT,55758.19,C905854124,49877.48,827.8153,M2125090765,827.8153,827.8153,0,0
94802,403,CASH_IN,100500.6,C1908704672,5392289.0,5492692.0,C1886938557,4032575.0,3932172.0,0,0
28045,527,PAYMENT,28213.67,C1203585652,78471.83,50884.19,M866668491,626.0286,626.0286,0,0


## Simulate communities

# PLEASE SEE https://www.icij.org/blog/2018/01/how-to-explore-networks-and-entity-metadata-in-the-offshore-leaks-database/

In [23]:
df_base['receiver'] = np.nan

In [24]:
df_base.head(2)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
0,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0,
1,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0,


In [28]:
def set_community(ids, sz):
    '''
    Simulates communities
    ids: sequence of id's
    sz: largest size of a group
    '''
    n = len(ids)
    groups = []
    num_in_group = np.random.randint(sz, size = 50) + 2
    # assumes the sequence in the original dataset does not affect the result
    i = 0
    for j in num_in_group:
        a_group = ids[i : i+j]
        groups.append(a_group)
        i += j
    return groups


def search_community(group_list, curr_id):
    '''
    return the group that current id is in
    '''
    for sublist in group_list:
        if curr_id in sublist:
            return sublist
    return None

#### Randomly assign the non fraud communities

In [29]:
non_fraud_ids = df_base[df_base['isFraud'] == 0]['nameOrig'].unique()
non_fraud_groups = set_community(ids = non_fraud_ids, sz = 100)
non_fraud_ids[:5]

array(['C751395361', 'C855939807', 'C289706812', 'C1180617355',
       'C48851076'], dtype=object)

In [30]:
len(non_fraud_ids)

488436

#### Randomly assign the fraud communities

In [48]:
fraud_ids = df_base[df_base['isFraud'] == 1]['nameOrig'].unique()

In [49]:
fraud_ids[1:3]

array(['C797588864', 'C1666828573'], dtype=object)

In [174]:
# use a tmp dataframe to test
#tmp_df = df_base.iloc[10000:50000,:]
tmp_df = df_base.iloc[:,:]
tmp_df[tmp_df['isFraud'] == 1]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
4259,399,CASH_OUT,2.051703e+05,C1048601853,2.051703e+05,0.000000e+00,C1408847191,1.255559e+07,1.276076e+07,1,0,
4663,603,TRANSFER,2.319926e+06,C797588864,2.319926e+06,0.000000e+00,C510193996,0.000000e+00,0.000000e+00,1,0,
5068,620,TRANSFER,1.840516e+04,C1666828573,1.840516e+04,0.000000e+00,C886178722,0.000000e+00,0.000000e+00,1,0,
6244,278,TRANSFER,2.405804e+05,C338881900,2.405804e+05,0.000000e+00,C738144053,0.000000e+00,0.000000e+00,1,0,
7978,578,CASH_OUT,6.274753e+06,C789526780,6.274753e+06,0.000000e+00,C1905238884,0.000000e+00,6.274753e+06,1,0,
9573,438,TRANSFER,4.550147e+05,C312774959,4.550147e+05,0.000000e+00,C2129735982,0.000000e+00,0.000000e+00,1,0,
9969,531,CASH_OUT,2.963703e+04,C1157110061,2.963703e+04,0.000000e+00,C1111669331,0.000000e+00,2.963703e+04,1,0,
10149,327,TRANSFER,1.467297e+05,C1943481220,1.467297e+05,0.000000e+00,C676856624,0.000000e+00,0.000000e+00,1,0,
10353,465,CASH_OUT,4.494620e+04,C1797253023,4.494620e+04,0.000000e+00,C1822942612,1.136952e+06,1.181898e+06,1,0,
12312,343,TRANSFER,2.379436e+05,C421063586,2.379436e+05,0.000000e+00,C339061789,0.000000e+00,0.000000e+00,1,0,


In [175]:
fraud_ids = tmp_df[tmp_df['isFraud'] == 1]['nameOrig'].unique()
fraud_ids

array(['C1048601853', 'C797588864', 'C1666828573', ..., 'C1162922333',
       'C1685995037', 'C1280323807'], dtype=object)

In [176]:
tmp_df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
0,252,CASH_OUT,293487.78,C751395361,10770.0,0.0,C1683666309,1062447.43,1355935.2,0,0,
1,252,CASH_OUT,129436.02,C855939807,21456.0,0.0,C1143249588,137196.56,266632.58,0,0,
2,252,CASH_IN,104304.29,C289706812,6453.0,110757.29,C1149496263,0.0,0.0,0,0,
3,252,CASH_OUT,202971.84,C1180617355,0.0,0.0,C1411872125,5908353.24,6111325.08,0,0,
4,252,PAYMENT,9956.19,C48851076,138092.49,128136.3,M2128999421,0.0,0.0,0,0,


In [145]:
tmp_df.to_csv("tmp_df.csv")

In [179]:
# randomly set a community
fraud_groups = set_community(fraud_ids, sz = 10)

def fradu_dect(x):
    if x in fraud_ids:
        group = search_community(fraud_groups, x)
        group_choice = np.setdiff1d(group,x)
        receiver = np.random.choice(group_choice)
        return receiver
#tmp_df["receiver"] = tmp_df["nameOrig"].apply(fradu_dect)

In [180]:
tmp_df["receiver"] = tmp_df["nameOrig"].apply(fradu_dect)

In [181]:
tmp_df.receiver = np.where(tmp_df.receiver.isnull(), tmp_df.nameDest, tmp_df.receiver)
#tmp_df['receiver'] = tmp_df['receiver'].fillna(tmp_df['nameDest']) # Same way to do that

In [144]:
#tmp_df[tmp_df["receiver"].apply(lambda x:x == "C562635378")]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
17085,402,TRANSFER,3107315.03,C252212539,0.0,0.0,C562635378,8218143.23,11325458.26,0,0,C562635378
21189,395,CASH_IN,42047.38,C1223144886,2406963.43,2449010.82,C562635378,8260190.61,8218143.23,0,0,C562635378
23432,276,CASH_OUT,141809.25,C1168936304,395.0,0.0,C562635378,3480013.51,3621822.76,0,0,C562635378


In [182]:
tmp_df["receiver"].value_counts()

C1580898291    2426
C1805552012    2192
C1300753775    2081
C123584876     2042
C1205902111    1955
C1271421975    1947
C1670922644    1905
C1258713620    1898
C159568170     1887
C1744813930    1811
C386051745     1774
C599869540     1773
C1011327941    1758
C923763422     1744
C1409937101    1742
C1763040255    1735
C778788657     1687
C1081190270    1685
C94191262      1683
C1169233308    1683
C1982479597    1671
C205015475     1666
C1915071784    1664
C1776177253    1663
C2085373603    1660
C1075270504    1658
C1781907974    1654
C866410359     1653
C1405984618    1651
C534279474     1651
               ... 
C1862067577       1
C2025213143       1
M1621688753       1
M161954478        1
M882556567        1
C759116532        1
M1403360124       1
C1087468074       1
M1160934666       1
M182854508        1
C372296974        1
C131344324        1
C732104967        1
C721884636        1
C1698571173       1
C1514143343       1
C598186244        1
M1360604458       1
M1605038884       1


In [183]:
tmp_df.sort_values(['receiver'], ascending= False)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
528775,308,PAYMENT,3.002989e+04,C1211430763,587716.379587,5.580976e+05,M9999977,411.107065,411.107065,0,0,M9999977
528775,308,PAYMENT,1.304948e+04,C1211430763,257458.174393,2.444776e+05,M9999977,68.912610,68.912610,0,0,M9999977
528775,308,PAYMENT,1.609926e+04,C1211430763,316265.980996,3.003241e+05,M9999977,157.376867,157.376867,0,0,M9999977
528775,308,PAYMENT,1.524640e+04,C1211430763,300772.361544,2.856081e+05,M9999977,82.095480,82.095480,0,0,M9999977
528775,308,PAYMENT,8.975642e+03,C1211430763,176777.853059,1.678659e+05,M9999977,63.651531,63.651531,0,0,M9999977
528775,308,PAYMENT,2.014798e+04,C1211430763,394820.280911,3.749214e+05,M9999977,249.106320,249.106320,0,0,M9999977
528775,308,PAYMENT,5.263300e+03,C1211430763,104365.000000,9.910170e+04,M9999977,0.000000,0.000000,0,0,M9999977
528775,308,PAYMENT,2.099565e+04,C1211430763,411816.265775,3.910597e+05,M9999977,239.134989,239.134989,0,0,M9999977
528775,308,PAYMENT,1.167892e+04,C1211430763,228693.394245,2.171677e+05,M9999977,153.258314,153.258314,0,0,M9999977
528775,308,PAYMENT,1.781480e+04,C1211430763,349739.305974,3.321108e+05,M9999977,186.260668,186.260668,0,0,M9999977


In [198]:
tmp_df = tmp_df[1:5000000]

In [199]:
tmp_df[tmp_df['isFraud'] == 1].shape

(38337, 12)

In [200]:
import random
#找出有fraud的和没有fraud的数据
fraud_tmp = tmp_df[tmp_df["isFraud"]==1]
non_fraud_tmp = tmp_df[tmp_df["isFraud"]==0]

#找出出现一次的receiver
onetime_receiver = fraud_tmp["receiver"].value_counts()[fraud_tmp["receiver"].value_counts()==1].index.tolist()

#根据这些receiver，提取相应的列，组成dataframe，然后开始造数据
create_fraud_data= tmp_df[tmp_df["receiver"].apply(lambda x:x in onetime_receiver)]

#造数据
##制造随机值
choose_list = [1,2,3,4] #设置随机复制次数
type_list = create_fraud_data["type"].unique().tolist() # 设置type为随机出现

##构建空的dataframe
new_columns = create_fraud_data.columns.tolist()
new_data = pd.DataFrame(columns=new_columns)

for row in create_fraud_data.values:
    num = random.choice(choose_list)#设置编造数据的次数
    for i in range(num):
        new_step = row[0]
        new_type = random.choice(type_list)
        new_amount = row[2] + random.uniform(10,10000) * random.uniform(1,10)
        new_nameOrig = np.where(new_step % 5 != 0, random.choice(fraud_tmp["nameOrig"].tolist()),random.choice(non_fraud_tmp["nameOrig"].tolist()))
        new_nameOrig = str(new_nameOrig) # np.where返回类型为array，此处改为字符串
        new_oldbalanceOrg = row[4]+ new_amount - row[2]
        new_newbalanceOrig = row[5]
        new_nameDest = row[6]
        new_oldbalanceDest = row[7]
        new_newbalanceDest = row[8]
        new_isFraud = row[9]
        new_isFlaggedFraud = row[10]
        new_receiver = row[11]
        new_row = pd.DataFrame(data = [new_step,new_type,new_amount,new_nameOrig,new_oldbalanceOrg,new_newbalanceOrig,new_nameDest,new_oldbalanceDest,new_newbalanceDest,new_isFraud,new_isFlaggedFraud,new_receiver],index=new_columns).T
        new_data = pd.concat([new_data, new_row],ignore_index=True)


new_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,receiver
0,399,CASH_IN,183523,C141206514,55832.2,0.0,C1949981498,197624.0,275703.0,0,0,C1949981498
1,399,DEBIT,181349,C1907033447,53658.0,0.0,C1949981498,197624.0,275703.0,0,0,C1949981498
2,399,CASH_IN,205522,C818983201,77830.9,0.0,C1949981498,197624.0,275703.0,0,0,C1949981498
3,399,CASH_IN,188243,C525104705,60552.2,0.0,C1949981498,197624.0,275703.0,0,0,C1949981498
4,298,CASH_IN,408225,C1557521706,449134.0,40908.6,C1283529992,6045910.0,6401410.0,0,0,C1283529992


## Simulate additional attributes

#### First get random values assigned to each transaction
#### Then set all attributes for a single person(id) to be the same

In [202]:
# Add occupation
occ = np.random.randint(50, size=len(tmp_df))
tmp_df['occupation'] = occ.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [203]:
# Add account open country
country = np.random.randint(10, size=len(tmp_df))
tmp_df['country'] = country

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [204]:
# Add account type
tp = np.random.choice(['personal', 'business', 'other'], size = len(tmp_df))
tmp_df['account_type'] = tp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [205]:
# Add random key words from unstructured dataset
# Can augment and make this more realistic by linking NLP on tweets related to bank accounts
# But here just throw some key words that might be relevant
'''
Set some special key words to frauds,
some random key words to non-frauds
REMEMBER: These key words can be changed to anything realistic if given the real data
'''
fraud_key_words = ['Panama','linked','confident','movie','bitcoin','crypto','offshore','investment','shares','equity']
non_fraud_words = ['financing','loan','salary', 'routine', 'general', 'miscellaneous', 'payment', 'delaying',
                   'late', 'overdue','rent']

tmp_df['words'] = tmp_df["isFraud"].apply(lambda x:np.where(x == 1, np.random.choice(fraud_key_words), np.random.choice(non_fraud_words)))


# TODO: set a column called 'key_words', initialize with NULLS, then assign these key words randomly to the dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [64]:
df_base.iloc[1, :][['nameOrig','occupation']]

nameOrig      C1101598632
occupation              2
Name: 1, dtype: object

# TODO: Check this function works correctly!

In [None]:
def set_correct_attribute(df):
    '''
    set the same occupation, account type and country for each id 
    '''
    df = df.sort_values('nameOrig')
    for i in tqdm(range(1,len(df))):
        if df.iloc[i,:]['nameOrig'] == df.iloc[i-1,:]['nameOrig']:
            df.iloc[i, :]['occupation'] = df.iloc[i-1, :]['occupation']
            df.iloc[i, :]['country'] = df.iloc[i-1, :]['country']
            df.iloc[i, :]['account_type'] = df.iloc[i-1, :]['account_type']
            #df.iloc[i,:][['occupation','country','account_type']] = df.iloc[i-1,:][[['occupation','country','account_type']]]
    return df

In [None]:
set_correct_attribute(tmp_df)

In [89]:
set_correct_attribute(df_base.iloc[:1000, :])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
100%|████████████████████████████████████████████████████████████████████████████████| 999/999 [00:12<00:00, 77.43it/s]


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,occupation,country,account_type
473,15,CASH_OUT,237622.96,C1001890218,10171.00,0.00,C596168920,0.00,237622.96,0,0,33,6,other
673,18,CASH_IN,150783.67,C1004863218,1509405.92,1660189.59,C294136632,354792.45,204008.78,0,0,8,1,other
890,23,PAYMENT,37035.37,C1005468625,0.00,0.00,M747492202,0.00,0.00,0,0,6,3,personal
847,21,CASH_OUT,156163.47,C1005646800,22368.00,0.00,C663955611,853100.75,1009264.21,0,0,46,3,personal
748,19,CASH_OUT,356995.42,C1006323448,0.00,0.00,C845458098,1204544.49,1953114.44,0,0,25,9,other
144,10,CASH_IN,237438.75,C1007679575,11481467.98,11718906.72,C1616253157,414056.97,176618.22,0,0,4,5,personal
786,20,CASH_IN,36957.52,C1007785632,24562753.21,24599710.72,C1433779445,7757953.95,8317978.60,0,0,45,5,other
559,17,CASH_IN,217490.48,C1009421149,10578056.31,10795546.79,C406287897,508059.25,290568.77,0,0,27,2,business
785,20,TRANSFER,745288.75,C1012240507,26932.00,0.00,C1047579551,0.00,745288.75,0,0,38,8,other
95,9,CASH_IN,346471.91,C1012695456,32575830.26,32922302.17,C1582765215,5443067.26,4378039.17,0,0,12,4,personal


# Sanitity check on simulated data 

- save as csv

In [None]:
#print(len(df_base))
#df_base.head()

In [None]:
# df.to_csv('cu_data')

In [209]:
tmp_df.to_csv("data_simulated.csv")

In [207]:
tmp_df.shape

(3299999, 16)