# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# data scoring
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# data tuning   
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Data Source

In [5]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# read pickle
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                283712 non-null  int64         
 1   datetime                   283712 non-null  datetime64[ns]
 2   long                       283712 non-null  float64       
 3   lat                        283712 non-null  float64       
 4   zipcode                    283712 non-null  int64         
 5   state                      283712 non-null  object        
 6   city                       283712 non-null  object        
 7   year                       283712 non-null  int32         
 8   quarter                    283712 non-null  object        
 9   month                      283712 non-null  object        
 10  season                     283712 non-null  object        
 11  week_cat                   283712 non-null  object  

In [6]:
cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,year,quarter,month,season,week_cat,day,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,time_diff,prev_long,prev_lat,distance,geo_cat,fraud_status,cc_id,trx_id
0,9484591448272784,2015-07-31 09:39:48,-90.045639,29.889039,70112,la,new orleans,2015,2015Q3,july,summer,weekday,friday,4000,very_low,17.99,1.0,-7642455.0,-90.151504,29.945202,11.969568,normal,not_fraud,5db09b1ec5ad96a2712386f2b4182cd037f260554e2bb0...,e0d9b92eee611e3975acf1bd772917e76c7f681f927f37...
1,7053196367895112,2015-07-31 11:03:48,-74.027561,40.689615,10001,ny,new york,2015,2015Q3,july,summer,weekday,friday,18000,low,12.09,1.0,-2527299.0,-73.927029,40.806511,15.51121,normal,not_fraud,b7846e01cf198b78cdf813937a81304a0b18fa55267331...,84e727d1fdf6e3ab4cfb7cf03d790efce6e7728ca4679a...
2,9528285469413252,2015-07-31 11:10:14,-72.139485,43.1081,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,40000,very_high,78.21,1.0,-6508550.0,-72.064113,43.172281,9.404226,normal,not_fraud,59514fdc67e3525d63fc4e282a812eb40719a774b2cb81...,53edf5aa52320a20ad091759c9c9685a8c5aae2ff1637e...
3,1845720274833905,2015-07-31 11:28:55,-89.002148,40.804323,61738,il,el paso,2015,2015Q3,july,summer,weekday,friday,20000,medium,74.41,1.0,-2534699.0,-88.974492,40.720877,9.556419,normal,not_fraud,607b6c0a24fb22f056c2ebe27a03512894cdce5ff4667b...,7657ddc4b2b2a94ac89ccdd56b92141d893f431a9f978e...
4,7850942767136368,2015-07-31 11:38:51,-72.025675,43.210753,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,4000,very_low,54.89,1.0,-1785659.0,-72.125392,43.219223,8.15713,normal,not_fraud,3f58d63f98573e14e51a02d98c664bd09e3ffe23641eb9...,f21cc793c1a4c5584c700c1d36f515ed17648fd88c463e...


# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Drop kolom non-numerik
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Inisialisasi VarianceThreshold (misalnya, ambang batas 0.01)
selector = VarianceThreshold(threshold = 0.01)
df_var_selected = selector.fit_transform(df_numeric)

# Fitur yang dipertahankan
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipertahankan:", selected_features)

numeric columns: Index(['credit_card', 'long', 'lat', 'zipcode', 'year', 'credit_card_limit',
       'transaction_dollar_amount', 'transaction_count', 'time_diff',
       'prev_long', 'prev_lat', 'distance'],
      dtype='object')

Fitur yang dipertahankan: Index(['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit',
       'transaction_dollar_amount', 'time_diff', 'prev_long', 'prev_lat',
       'distance'],
      dtype='object')


In [8]:
# Seleceted numeric columns
filter_numeric = ['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit', 'prev_long', 'prev_lat']
selected_numeric = selected_features.drop(filter_numeric)

#
print("Numeric column untuk modeling:", selected_numeric)

Numeric column untuk modeling: Index(['transaction_dollar_amount', 'time_diff', 'distance'], dtype='object')


#### Check Relevant Column

In [9]:
# Check Column Category
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{'-' * 50} \n')

STATE 	: ['la' 'ny' 'nh' 'il' 'pa' 'nj' 'mo' 'md' 'ca' 'tx' 'me' 'vt' 'al' 'wv'
 'pr' 'wa' 'nc' 'ga' 'ma' 'ok' 'mi' 'ut' 'fl' 'hi' 'ia' 'nm' 'oh' 'az'
 'va' 'in' 'ri' 'id' 'co' 'ct' 'ks'] 

-------------------------------------------------- 

CITY 	: ['new orleans' 'new york' 'washington' 'el paso' 'dallas' 'houston'
 'birmingham' 'kansas city' 'austin' 'pasadena' 'los angeles' 'fort worth'
 'jackson' 'pittsburgh' 'portland' 'albany' 'charlotte' 'huntsville'
 'madison' 'orlando' 'san antonio' 'seattle' 'minneapolis' 'sacramento'
 'san francisco' 'memphis' 'dayton' 'denver' 'milwaukee' 'omaha' 'trenton'
 'springfield' 'oklahoma city' 'charleston' 'miami' 'long beach' 'quitman'
 'saint louis' 'friendship' 'chicago' 'salt lake city' 'richmond'
 'pensacola' 'san diego' 'atlanta' 'honolulu' 'greensboro' 'newark'
 'rochester' 'lafayette' 'columbus' 'staten island' 'des moines'
 'las vegas' 'chester' 'cincinnati' 'hillsboro' 'tucson' 'buffalo'
 'arlington' 'shreveport' 'philadelphia' 'tulsa' 

In [10]:
# Drop kolom numerik
df_obj = cc_df.select_dtypes(include = ['object'])
print(f'objetc columns: {df_obj.columns}\n')

# selected object columns
filter_obj = ['limit_cat', 'geo_cat', 'fraud_status']
selected_object = df_obj[filter_obj].columns

#
print("Object column untuk modeling:", selected_object)

objetc columns: Index(['state', 'city', 'quarter', 'month', 'season', 'week_cat', 'day',
       'limit_cat', 'geo_cat', 'fraud_status', 'cc_id', 'trx_id'],
      dtype='object')

Object column untuk modeling: Index(['limit_cat', 'geo_cat', 'fraud_status'], dtype='object')


In [11]:
# 
model_col = selected_numeric.append(selected_object)

# 
model_df = cc_df[model_col]
model_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff,distance,limit_cat,geo_cat,fraud_status
0,17.99,-7642455.0,11.969568,very_low,normal,not_fraud
1,12.09,-2527299.0,15.51121,low,normal,not_fraud
2,78.21,-6508550.0,9.404226,very_high,normal,not_fraud
3,74.41,-2534699.0,9.556419,medium,normal,not_fraud
4,54.89,-1785659.0,8.15713,very_low,normal,not_fraud


# CHECK TIME DIFF

## Transform Data

In [12]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'limit_cat'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
ordinal_cols = []
one_hot_cols = []
numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in model_df.columns:
    if cc_df[col].dtype in ['int', 'float']:
        numeric_cols.append(col)

    elif cc_df[col].dtype == 'object':
        if col in encoding_set:
            ordinal_cols.append(col)

        else:
            one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", ordinal_cols)
print("One-Hot Encoding Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

Ordinal Encoding Columns: ['limit_cat']
One-Hot Encoding Columns: ['geo_cat', 'fraud_status']
Numeric Columns: ['transaction_dollar_amount', 'time_diff', 'distance']


In [13]:
# Transformasi
numerical_transformer = StandardScaler()
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = True, max_categories = 50)), 
    # ('svd', TruncatedSVD(n_components = 100))  # Mengurangi dimensi fitur kategori
])
ordinal_transformer = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)

# Column Transformer
prep_stage_1 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_cols), 
        ("ord", ordinal_transformer, ordinal_cols)
    ], remainder = "passthrough")

In [14]:
# Transform data menggunakan fit_transform pada tahap 1
model_df = prep_stage_1.fit_transform(model_df)

# Columns After: ubah kembali ke DataFrame dengan kolom dari prep_stage_1
model_df = pd.DataFrame(model_df, columns = prep_stage_1.get_feature_names_out())

# Hilangkan prefix (misalnya, "num__", "cat__", "out__")
clean_columns = [col.split("__", 1)[-1] for col in model_df.columns]
model_df.columns = clean_columns

In [15]:
# Menampilkan total null pada setiap kolom
null_columns = model_df.isnull().sum()[model_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns} \n')

model_df.info()

Total null columns: Series([], dtype: int64) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_dollar_amount  283712 non-null  float64
 1   time_diff                  283712 non-null  float64
 2   distance                   283712 non-null  float64
 3   geo_cat_anomaly            283712 non-null  float64
 4   geo_cat_normal             283712 non-null  float64
 5   fraud_status_fraud         283712 non-null  float64
 6   fraud_status_not_fraud     283712 non-null  float64
 7   limit_cat                  283712 non-null  float64
dtypes: float64(8)
memory usage: 17.3 MB


In [16]:
# change object after transform
model_df = convert_object_columns_to_numeric(model_df)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_dollar_amount  283712 non-null  float64
 1   time_diff                  283712 non-null  float64
 2   distance                   283712 non-null  float64
 3   geo_cat_anomaly            283712 non-null  float64
 4   geo_cat_normal             283712 non-null  float64
 5   fraud_status_fraud         283712 non-null  float64
 6   fraud_status_not_fraud     283712 non-null  float64
 7   limit_cat                  283712 non-null  float64
dtypes: float64(8)
memory usage: 17.3 MB


In [17]:
model_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff,distance,geo_cat_anomaly,geo_cat_normal,fraud_status_fraud,fraud_status_not_fraud,limit_cat
0,-0.980072,-2.394849,-0.169832,0.0,1.0,0.0,1.0,4.0
1,-1.088841,-0.792138,-0.167818,0.0,1.0,0.0,1.0,1.0
2,0.13011,-2.039567,-0.171291,0.0,1.0,0.0,1.0,3.0
3,0.060055,-0.794456,-0.171205,0.0,1.0,0.0,1.0,2.0
4,-0.299804,-0.559763,-0.172001,0.0,1.0,0.0,1.0,4.0
