# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# data scoring
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# data tuning   
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Data Source

In [5]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# read pickle
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                283712 non-null  int64         
 1   datetime                   283712 non-null  datetime64[ns]
 2   long                       283712 non-null  float64       
 3   lat                        283712 non-null  float64       
 4   zipcode                    283712 non-null  int64         
 5   state                      283712 non-null  object        
 6   city                       283712 non-null  object        
 7   year                       283712 non-null  int32         
 8   quarter                    283712 non-null  object        
 9   month                      283712 non-null  object        
 10  season                     283712 non-null  object        
 11  week_cat                   283712 non-null  object  

In [6]:
cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,year,quarter,month,season,week_cat,day,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,time_diff,prev_long,prev_lat,distance,geo_cat,fraud_status,cc_id,trx_id
0,9484591448272784,2015-07-31 09:39:48,-90.045639,29.889039,70112,la,new orleans,2015,2015Q3,july,summer,weekday,friday,4000,very_low,17.99,1.0,-7642455.0,-90.151504,29.945202,11.969568,normal,not_fraud,31f99a9aebe0137a2b83f01f8b5161916620b6a6df34e9...,8463fecb2d6b556ab586ab045ec71e8482c8499cf63d7c...
1,7053196367895112,2015-07-31 11:03:48,-74.027561,40.689615,10001,ny,new york,2015,2015Q3,july,summer,weekday,friday,18000,low,12.09,1.0,-2527299.0,-73.927029,40.806511,15.51121,normal,not_fraud,8293222133c756307f0a81cc840abfbe8ab91d0380d6d5...,399cc2b39574803782377b8fea885c7dc98175c8545b99...
2,9528285469413252,2015-07-31 11:10:14,-72.139485,43.1081,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,40000,very_high,78.21,1.0,-6508550.0,-72.064113,43.172281,9.404226,normal,not_fraud,ee31e149528e6d268ed6e46111f0b80081283e078a354f...,514f845ac781eb6631cc7f179d460e73e001dda448e9d1...
3,1845720274833905,2015-07-31 11:28:55,-89.002148,40.804323,61738,il,el paso,2015,2015Q3,july,summer,weekday,friday,20000,medium,74.41,1.0,-2534699.0,-88.974492,40.720877,9.556419,normal,not_fraud,cb528e873c8279472ac90696f1920a97427f06e43bad1f...,01f40d52c00ea974bfae194525de524892ddc1f4f0eef0...
4,7850942767136368,2015-07-31 11:38:51,-72.025675,43.210753,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,4000,very_low,54.89,1.0,-1785659.0,-72.125392,43.219223,8.15713,normal,not_fraud,c02fbffcf91d7e423cbbb0cd04b2adec539b4e6be3e9be...,7684f22bce701314124d22e6e4d3764fdb7c4a45982ad9...


# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Drop kolom non-numerik
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Inisialisasi VarianceThreshold (misalnya, ambang batas 0.01)
selector = VarianceThreshold(threshold = 0.01)
df_var_selected = selector.fit_transform(df_numeric)

# Fitur yang dipertahankan
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipilih:", selected_features)

# Fitur yang akan digunakan pada modeling


numeric columns: Index(['credit_card', 'long', 'lat', 'zipcode', 'year', 'credit_card_limit',
       'transaction_dollar_amount', 'transaction_count', 'time_diff',
       'prev_long', 'prev_lat', 'distance'],
      dtype='object')

Fitur yang dipilih: Index(['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit',
       'transaction_dollar_amount', 'time_diff', 'prev_long', 'prev_lat',
       'distance'],
      dtype='object')


#### Check Relevant Column

In [8]:
# Check Column Category
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{'-' * 50} \n')

STATE 	: ['la' 'ny' 'nh' 'il' 'pa' 'nj' 'mo' 'md' 'ca' 'tx' 'me' 'vt' 'al' 'wv'
 'pr' 'wa' 'nc' 'ga' 'ma' 'ok' 'mi' 'ut' 'fl' 'hi' 'ia' 'nm' 'oh' 'az'
 'va' 'in' 'ri' 'id' 'co' 'ct' 'ks'] 

-------------------------------------------------- 

CITY 	: ['new orleans' 'new york' 'washington' 'el paso' 'dallas' 'houston'
 'birmingham' 'kansas city' 'austin' 'pasadena' 'los angeles' 'fort worth'
 'jackson' 'pittsburgh' 'portland' 'albany' 'charlotte' 'huntsville'
 'madison' 'orlando' 'san antonio' 'seattle' 'minneapolis' 'sacramento'
 'san francisco' 'memphis' 'dayton' 'denver' 'milwaukee' 'omaha' 'trenton'
 'springfield' 'oklahoma city' 'charleston' 'miami' 'long beach' 'quitman'
 'saint louis' 'friendship' 'chicago' 'salt lake city' 'richmond'
 'pensacola' 'san diego' 'atlanta' 'honolulu' 'greensboro' 'newark'
 'rochester' 'lafayette' 'columbus' 'staten island' 'des moines'
 'las vegas' 'chester' 'cincinnati' 'hillsboro' 'tucson' 'buffalo'
 'arlington' 'shreveport' 'philadelphia' 'tulsa' 

In [9]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'limit_cat'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
ordinal_cols = []
one_hot_cols = []
numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in cc_df.columns:
    if cc_df[col].dtype in ['int', 'float']:
        numeric_cols.append(col)

    elif cc_df[col].dtype == 'object':
        if col in encoding_set:
            ordinal_cols.append(col)

        else:
            one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", ordinal_cols)
print("One-Hot Encoding Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

Ordinal Encoding Columns: ['limit_cat']
One-Hot Encoding Columns: ['state', 'city', 'quarter', 'month', 'season', 'week_cat', 'day', 'geo_cat', 'fraud_status', 'cc_id', 'trx_id']
Numeric Columns: ['long', 'lat', 'year', 'transaction_dollar_amount', 'transaction_count', 'time_diff', 'prev_long', 'prev_lat', 'distance']


In [10]:
# Transformasi
numerical_transformer = StandardScaler()
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = True, max_categories = 50)), 
    ('svd', TruncatedSVD(n_components = 100))  # Mengurangi dimensi fitur kategori
])
ordinal_transformer = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)

# Column Transformer
prep_stage_1 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_cols), 
        ("ord", ordinal_transformer, ordinal_cols)
    ], remainder = "passthrough")

#### sample

In [11]:
# Transform data menggunakan fit_transform pada tahap 1
cc_df = prep_stage_1.fit_transform(cc_df)

# Columns After: ubah kembali ke DataFrame dengan kolom dari prep_stage_1
cc_df = pd.DataFrame(cc_df, columns = prep_stage_1.get_feature_names_out())

# Hilangkan prefix (misalnya, "num__", "cat__", "out__")
clean_columns = [col.split("__", 1)[-1] for col in cc_df.columns]
cc_df.columns = clean_columns

In [12]:
# Menampilkan total null pada setiap kolom
null_columns = cc_df.isnull().sum()[cc_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns} \n')
cc_df.info()

Total null columns: Series([], dtype: int64) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Columns: 114 entries, long to credit_card_limit
dtypes: datetime64[ns](1), object(113)
memory usage: 246.8+ MB


In [13]:
# change object after transform
cc_df = convert_object_columns_to_numeric(cc_df)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Columns: 114 entries, long to credit_card_limit
dtypes: datetime64[ns](1), float64(108), int32(5)
memory usage: 241.3 MB


In [14]:
cc_df.head()

Unnamed: 0,long,lat,year,transaction_dollar_amount,transaction_count,time_diff,prev_long,prev_lat,distance,truncatedsvd0,truncatedsvd1,truncatedsvd2,truncatedsvd3,truncatedsvd4,truncatedsvd5,truncatedsvd6,truncatedsvd7,truncatedsvd8,truncatedsvd9,truncatedsvd10,truncatedsvd11,truncatedsvd12,truncatedsvd13,truncatedsvd14,truncatedsvd15,truncatedsvd16,truncatedsvd17,truncatedsvd18,truncatedsvd19,truncatedsvd20,truncatedsvd21,truncatedsvd22,truncatedsvd23,truncatedsvd24,truncatedsvd25,truncatedsvd26,truncatedsvd27,truncatedsvd28,truncatedsvd29,truncatedsvd30,truncatedsvd31,truncatedsvd32,truncatedsvd33,truncatedsvd34,truncatedsvd35,truncatedsvd36,truncatedsvd37,truncatedsvd38,truncatedsvd39,truncatedsvd40,truncatedsvd41,truncatedsvd42,truncatedsvd43,truncatedsvd44,truncatedsvd45,truncatedsvd46,truncatedsvd47,truncatedsvd48,truncatedsvd49,truncatedsvd50,truncatedsvd51,truncatedsvd52,truncatedsvd53,truncatedsvd54,truncatedsvd55,truncatedsvd56,truncatedsvd57,truncatedsvd58,truncatedsvd59,truncatedsvd60,truncatedsvd61,truncatedsvd62,truncatedsvd63,truncatedsvd64,truncatedsvd65,truncatedsvd66,truncatedsvd67,truncatedsvd68,truncatedsvd69,truncatedsvd70,truncatedsvd71,truncatedsvd72,truncatedsvd73,truncatedsvd74,truncatedsvd75,truncatedsvd76,truncatedsvd77,truncatedsvd78,truncatedsvd79,truncatedsvd80,truncatedsvd81,truncatedsvd82,truncatedsvd83,truncatedsvd84,truncatedsvd85,truncatedsvd86,truncatedsvd87,truncatedsvd88,truncatedsvd89,truncatedsvd90,truncatedsvd91,truncatedsvd92,truncatedsvd93,truncatedsvd94,truncatedsvd95,truncatedsvd96,truncatedsvd97,truncatedsvd98,truncatedsvd99,limit_cat,credit_card,datetime,zipcode,credit_card_limit
0,-0.684286,-2.044195,0,-0.980072,-0.010948,-2.394849,-0.686047,-1.84212,-0.169832,2.372568,0.78735,-0.518556,-0.484932,-0.485623,0.790154,-0.337573,0.283287,0.004126,-0.522616,-0.079971,-0.550065,-0.106706,0.373447,-0.225585,0.060054,0.02686,0.092936,0.051575,-0.08313,0.35942,1.125401,-0.085217,0.065949,0.000331,0.04565,0.018911,0.000932,0.0203,0.006688,0.015101,-0.006451,-0.003048,-0.006232,-0.055961,-0.01969,0.02529,0.008606,0.003974,-0.031918,-0.004391,0.006828,0.003955,0.005099,0.00151,-0.002901,0.036922,0.02606,0.004297,-0.005434,-0.001548,-0.002637,-0.034087,0.028,-0.034341,0.020602,-0.010024,0.051251,-0.000986,0.035173,-0.002816,-0.048843,0.04536,0.263414,0.617227,-0.64454,-0.278313,-0.012394,-0.075016,0.016628,-0.061599,-0.052296,0.000885,-0.006138,0.007595,0.008803,-0.010135,-0.004366,0.001009,0.000278,0.000709,0.003561,0.001918,0.004988,0.000848,0.001928,0.002226,-0.001672,0.001266,-0.000129,-0.000341,6.7e-05,2.075466e-05,9.279399e-05,1.33364e-05,4.1e-05,7.7e-05,0.000236,7e-06,4.7e-05,4,-2011353200,2015-07-31 09:39:48,70112,4000
1,0.111486,-0.044188,0,-1.088841,-0.010948,-0.792138,0.101297,0.002508,-0.167818,2.389811,0.788431,-0.530527,-0.481475,-0.553596,0.784632,-0.69382,0.255521,0.005932,1.039869,-0.095343,-0.032296,-0.053017,0.136939,-0.146003,0.140455,0.084895,0.085831,0.091676,0.029212,0.482916,1.080775,-0.002436,0.321407,-0.001701,0.019964,0.00278,0.01242,0.011119,0.02132,0.003202,-0.002104,-0.000169,-0.002086,0.006889,-0.001199,0.000208,-8e-06,0.033264,-0.009173,0.001846,-0.00019,0.00305,-0.00136,0.00368,-0.001024,0.003143,0.001386,0.00778,-0.001833,0.000751,-0.001495,0.004742,0.005053,0.00148,0.000152,0.002412,-0.000292,-0.000517,0.000352,-0.001016,-0.000139,0.000256,-0.000306,-0.000666,-0.000473,0.000446,0.001133,-0.000399,0.000364,0.000669,0.000817,0.000811,-0.000449,-0.000102,-0.000233,-0.001236,9.1e-05,-0.000343,0.000694,0.000328,0.000271,0.002969,-0.000508,0.000343,-0.000175,0.000208,0.000212,0.000284,-0.000285,-3.8e-05,2.6e-05,-8.339092e-06,-5.075389e-07,-3.282499e-06,1.9e-05,3.3e-05,4.6e-05,-2.6e-05,2.3e-05,1,1074403912,2015-07-31 11:03:48,10001,18000
2,0.205285,0.403657,0,0.13011,-0.010948,-2.039567,0.191701,0.404298,-0.171291,2.661801,0.792547,-0.195061,-0.546193,0.892559,0.795553,0.050462,0.271276,-0.003891,0.027866,-0.084113,0.002702,-0.046229,-0.006001,-0.14316,0.048929,0.028176,0.031403,0.035286,0.01522,0.383121,1.120097,-0.071925,0.036734,-0.001152,0.012733,-0.008156,0.006401,0.00389,-0.003368,0.002792,-0.002468,-0.00026,-0.002116,0.007206,-0.001135,0.001453,0.003741,-0.001267,-0.004741,0.002284,-0.001056,0.002983,-0.00144,0.003293,-0.003722,0.003082,0.001632,0.000459,-0.001817,0.000745,-0.001519,0.002483,0.001548,0.001407,0.000329,-0.001146,-0.001332,0.000268,0.000343,0.000442,-0.000143,0.000222,-0.000358,-0.0007,-0.000481,0.000424,0.001131,-0.00041,0.000355,0.000696,0.000992,0.000638,-0.000559,-8e-05,-0.000297,-0.001081,9.1e-05,-0.000463,-3.8e-05,-0.00043,-0.000539,0.000123,6.3e-05,0.000384,9.2e-05,0.000298,0.000211,0.000332,-0.000273,-5.7e-05,2.9e-05,2.209628e-07,1.879806e-05,-1.203654e-05,-5e-06,2e-05,3.6e-05,-1e-06,2.2e-05,3,-692514940,2015-07-31 11:10:14,3280,40000
3,-0.632446,-0.022947,0,0.060055,-0.010948,-0.794456,-0.628929,-0.012035,-0.171205,2.370907,0.787479,-0.519526,-0.487023,-0.478131,0.784423,-0.324123,0.277429,0.006107,-0.457613,-0.070586,1.151395,0.110252,0.372924,-0.157183,0.19668,0.113393,0.099624,0.082881,0.011819,0.404455,1.115674,-0.06605,0.029484,-0.000787,0.012904,-0.010939,0.002035,2.7e-05,0.009517,-0.030257,-0.033244,-0.156727,0.002863,0.020543,0.003754,0.001813,0.003141,-0.001277,-0.004665,0.002274,-0.001163,0.002963,-0.001342,0.00265,-0.00116,0.002976,0.001536,0.000491,-0.001979,0.000645,-0.001913,0.00205,0.00186,0.000537,0.000493,-0.00103,-0.001584,0.00026,-2.3e-05,0.000445,0.000115,-0.000688,-0.001436,-0.001315,-0.000616,2.5e-05,0.001111,-0.00062,0.000432,0.000349,0.001023,0.000216,-0.000873,6.1e-05,-0.000246,-0.000975,0.000287,-0.000135,0.00072,7.5e-05,-0.00017,0.000623,0.000761,0.000464,0.004654,3.4e-05,0.000249,0.00045,-0.000385,-3.3e-05,3.5e-05,1.096275e-06,8.05959e-06,2.429957e-07,9e-06,3.8e-05,6.1e-05,-5e-06,2.2e-05,2,1029050865,2015-07-31 11:28:55,61738,20000
4,0.210939,0.422666,0,-0.299804,-0.010948,-0.559763,0.188727,0.412271,-0.172001,2.661801,0.792547,-0.195061,-0.546193,0.892559,0.795553,0.050462,0.271276,-0.003891,0.027866,-0.084113,0.002702,-0.046229,-0.006001,-0.14316,0.048929,0.028176,0.031403,0.035286,0.01522,0.383121,1.120097,-0.071925,0.036734,-0.001152,0.012733,-0.008156,0.006401,0.00389,-0.003368,0.002792,-0.002468,-0.00026,-0.002116,0.007206,-0.001135,0.001453,0.003741,-0.001267,-0.004741,0.002284,-0.001056,0.002983,-0.00144,0.003293,-0.003722,0.003082,0.001632,0.000459,-0.001817,0.000745,-0.001519,0.002483,0.001548,0.001407,0.000329,-0.001146,-0.001332,0.000268,0.000343,0.000442,-0.000143,0.000222,-0.000358,-0.0007,-0.000481,0.000424,0.001131,-0.00041,0.000355,0.000696,0.000992,0.000638,-0.000559,-8e-05,-0.000297,-0.001081,9.1e-05,-0.000463,-3.8e-05,-0.00043,-0.000539,0.000123,6.3e-05,0.000384,9.2e-05,0.000298,0.000211,0.000332,-0.000273,-5.7e-05,2.9e-05,2.209628e-07,1.879806e-05,-1.203654e-05,-5e-06,2e-05,3.6e-05,-1e-06,2.2e-05,4,248086128,2015-07-31 11:38:51,3280,4000
