# Library / Packages

In [None]:
# 
import pandas as pd
import numpy as np
from automate import auto_preparation, auto_statistic, auto_preprocessing

# 
from dotenv import dotenv_values

# Format

In [2]:
def lab_round(x, pos):
    """Render a numeric value as a label with a B / M / K magnitude suffix.

    The value is divided by the largest threshold (1e9, 1e6, 1e3) that its
    absolute value reaches and the matching suffix is appended; the quotient
    is printed without any rounding. Values below 1e3 are returned unchanged
    as a string.

    Args:
        x: The numeric value to label.
        pos: Unused. The ``(x, pos)`` signature looks like a matplotlib
            tick-formatter callback - TODO confirm against the caller.

    Returns:
        str: The formatted label, e.g. ``'2.5 K'``.
    """
    magnitude = abs(x)
    # Ordered from largest to smallest so the first match is the right scale.
    for divisor, suffix in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if magnitude >= divisor:
            return f'{x/divisor} {suffix}'
    return f'{x}'
    
def val_round(x):
    """Render a numeric value with two decimals and a B / M / K suffix.

    The value is divided by the largest threshold (1e9, 1e6, 1e3) that its
    absolute value reaches, formatted with two decimal places, and the
    matching suffix is appended. Values below 1e3 are formatted with two
    decimals and no suffix.

    Args:
        x: The numeric value to format.

    Returns:
        str: The formatted value, e.g. ``'1.23 K'``.
    """
    magnitude = abs(x)
    # Ordered from largest to smallest so the first match is the right scale.
    for divisor, suffix in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if magnitude >= divisor:
            return f'{x/divisor:.2f} {suffix}'
    return f'{x:.2f}'

# Sampling Data

Jika data berada pada dataset atau database terpisah, lakukan merging terlebih dahulu. Berikut merupakan data sintetis yang sudah melalui proses merging data.

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [4]:
# Number of synthetic transactions to generate
num_samples = 100000

# Fixed seed so that Restart Kernel -> Run All reproduces the same synthetic
# data (and therefore the same downstream outputs) on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Candidate locations as (city, state, zipcode)
locations = [
    ("New York", "NY", 10001),
    ("Los Angeles", "CA", 90001),
    ("Chicago", "IL", 60601),
    ("Houston", "TX", 77001),
    ("Phoenix", "AZ", 85001),
    ("Philadelphia", "PA", 19101),
    ("San Antonio", "TX", 78201),
    ("San Diego", "CA", 92101),
    ("Dallas", "TX", 75201),
    ("San Jose", "CA", 95101)
]

# Every minute in the sampling window; transaction timestamps are drawn from it.
minute_grid = pd.date_range("2015-01-01", "2016-12-31", freq="min").to_numpy()

# Generate the synthetic transactions.
# NOTE(review): card numbers are drawn independently per row from a very large
# range, so repeated cards are extremely unlikely. Any per-card history
# features computed downstream may therefore be degenerate - confirm whether a
# fixed pool of cards is intended.
df_synthetic = pd.DataFrame({
    "credit_card": rng.integers(10**15, 10**16, num_samples, dtype=np.int64),
    "datetime": rng.choice(minute_grid, num_samples),
    "transaction_dollar_amount": np.round(rng.uniform(5, 500, num_samples), 2),
    "long": np.round(rng.uniform(-125, -65, num_samples), 6),
    "lat": np.round(rng.uniform(25, 50, num_samples), 6),
})

# Pick one location per transaction at random. The coordinates above are drawn
# independently of the chosen city.
selected_indices = rng.integers(0, len(locations), num_samples)
df_synthetic["city"] = [locations[i][0] for i in selected_indices]
df_synthetic["state"] = [locations[i][1] for i in selected_indices]
df_synthetic["zipcode"] = [locations[i][2] for i in selected_indices]

# Draw a credit card limit for every transaction
df_synthetic["credit_card_limit"] = rng.choice([5000, 10000, 15000, 20000, 25000, 30000], num_samples)

# Preview the result
df_synthetic.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit
0,1191540631854938,2015-09-21 13:18:00,349.47,-90.458542,27.055849,Dallas,TX,75201,20000
1,8662811733784080,2015-08-21 17:16:00,290.62,-121.914416,33.942142,Philadelphia,PA,19101,10000
2,6910218654961236,2016-03-24 20:40:00,423.4,-105.03683,42.08556,San Diego,CA,92101,5000
3,7218355942214491,2016-12-02 23:34:00,118.09,-120.355727,43.239657,San Diego,CA,92101,30000
4,8578100930790344,2016-08-05 07:41:00,207.54,-76.964329,28.772639,New York,NY,10001,10000


# Data Preparation

In [5]:
# Apply the data-preparation step from the project's `automate` package to the
# synthetic transactions. Judging by the cell output it also prints
# missing-value, duplicate and fraud-label summaries - confirm in the module.
df_prepare = auto_preparation.preprocess_credit_card_data(df_synthetic)

Missing values before processing:
credit_card                  0
datetime                     0
transaction_dollar_amount    0
long                         0
lat                          0
city                         0
state                        0
zipcode                      0
credit_card_limit            0
dtype: int64

Total duplicate data before processing: 0
Total duplicate transactions: 0

Fraud cases percentage:
fraud
not_fraud    95.0
fraud         5.0
Name: proportion, dtype: float64

Fraud cases count:
fraud
not_fraud    95000
fraud         5000
Name: count, dtype: int64


In [6]:
# Column names, non-null counts and dtypes of the prepared frame.
df_prepare.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                100000 non-null  int64         
 1   datetime                   100000 non-null  datetime64[ns]
 2   transaction_dollar_amount  100000 non-null  float64       
 3   long                       100000 non-null  float64       
 4   lat                        100000 non-null  float64       
 5   city                       100000 non-null  object        
 6   state                      100000 non-null  object        
 7   zipcode                    100000 non-null  int64         
 8   credit_card_limit          100000 non-null  int32         
 9   date                       100000 non-null  object        
 10  year                       100000 non-null  int32         
 11  month                      100000 non-null  object   

In [7]:
# Preview the first rows of the prepared frame, including the derived columns.
df_prepare.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,date,year,month,quarter,day,time,week_cat,season,limit_cat,transaction_count,prev_time,time_diff_hour,prev_long,prev_lat,distance_km,geo_cat,speed_km/h,fraud
0,1191540631854938,2015-09-21 13:18:00,349.47,-90.458542,27.055849,dallas,tx,75201,20000,2015-09-21,2015,september,2015Q3,monday,13:18:00,weekday,fall,medium,1.0,2015-09-21 13:18:00,0.0,0.0,0.0,10060.787646,anomaly,0.0,not_fraud
1,8662811733784080,2015-08-21 17:16:00,290.62,-121.914416,33.942142,philadelphia,pa,19101,10000,2015-08-21,2015,august,2015Q3,friday,17:16:00,weekday,summer,low,1.0,2015-08-21 17:16:00,0.0,0.0,0.0,12911.010729,anomaly,0.0,not_fraud
2,6910218654961236,2016-03-24 20:40:00,423.4,-105.03683,42.08556,san diego,ca,92101,5000,2016-03-24,2016,march,2016Q1,thursday,20:40:00,weekday,spring,very_low,1.0,2016-03-24 20:40:00,0.0,0.0,0.0,11248.58128,anomaly,0.0,not_fraud
3,7218355942214491,2016-12-02 23:34:00,118.09,-120.355727,43.239657,san diego,ca,92101,30000,2016-12-02,2016,december,2016Q4,friday,23:34:00,weekday,winter,high,1.0,2016-12-02 23:34:00,0.0,0.0,0.0,12418.153529,anomaly,0.0,not_fraud
4,8578100930790344,2016-08-05 07:41:00,207.54,-76.964329,28.772639,new york,ny,10001,10000,2016-08-05,2016,august,2016Q3,friday,07:41:00,weekday,summer,low,1.0,2016-08-05 07:41:00,0.0,0.0,0.0,8744.361001,anomaly,0.0,not_fraud


In [8]:
# Row count of the prepared frame.
len(df_prepare)

100000

# Data Statistics

In [None]:
# Run the numeric-feature analysis from the project's `automate` package on the
# prepared data. Judging by the cell output it prints per-column normality
# test results - confirm in the module.
df_stats = auto_statistic.analyze_numeric_features(df_prepare)

===== Uji Normalitas Data =====
Column: transaction_dollar_amount
Statistic: 1118.7624, Critical Value (5%): 0.7870
Column transaction_dollar_amount: Data tidak berdistribusi normal

Column: transaction_dollar_amount, Statistics = 90104.8220, p = 0.0000
Column transaction_dollar_amount: Data tidak berdistribusi normal

Column: credit_card_limit
Statistic: 2796.2343, Critical Value (5%): 0.7870
Column credit_card_limit: Data tidak berdistribusi normal

Column: credit_card_limit, Statistics = 178732.1147, p = 0.0000
Column credit_card_limit: Data tidak berdistribusi normal

Column transaction_count: Variansi nol, tidak dapat diuji.

Column: transaction_count, Statistics = nan, p = 1.0000
Column transaction_count: Data berdistribusi normal

Column time_diff_hour: Variansi nol, tidak dapat diuji.

Column: time_diff_hour, Statistics = nan, p = 1.0000
Column time_diff_hour: Data berdistribusi normal

Column prev_long: Variansi nol, tidak dapat diuji.

Column: prev_long, Statistics = nan, p =

# Data Preprocessing

In [None]:
df_preprocessing = auto_preprocessing.