# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# based on geolocation
from geopy.distance import geodesic

# pickle and .env
from dotenv import dotenv_values
import pickle

# Dataset

In [2]:
# 
pd.set_option('display.max_columns', None)

## Credit Info

In [3]:
# parameter
link = {**dotenv_values('../.env.secret')}
path = link['RAW_CREDIT_INFO']

# filter missing values
missing_val = ['N/a', 'n/a', 'No', 'N\a', 'na', 'NA', np.nan]

# load data to df
cc_info_df = pd.read_csv(path, sep = ',', na_values = missing_val, low_memory = False)
cc_info_df.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit
0,1280981422329509,Dallas,PA,18612,6000
1,9737219864179988,Houston,PA,15342,16000
2,4749889059323202,Auburn,MA,1501,14000
3,9591503562024072,Orlando,WV,26412,18000
4,2095640259001271,New York,NY,10001,20000


In [4]:
# check data type
cc_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   credit_card        984 non-null    int64 
 1   city               984 non-null    object
 2   state              984 non-null    object
 3   zipcode            984 non-null    int64 
 4   credit_card_limit  984 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 38.6+ KB


### Data Cleaning

#### Category Check

In [5]:
check_cat = cc_info_df.select_dtypes(include = ['object'])
check_cat.head()

Unnamed: 0,city,state
0,Dallas,PA
1,Houston,PA
2,Auburn,MA
3,Orlando,WV
4,New York,NY


In [6]:
for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{check_cat[i].value_counts()} \n')
    print(f'{"-" * 50} \n')

CITY 	: ['Dallas' 'Houston' 'Auburn' 'Orlando' 'New York' 'Atlanta' 'Pittsburgh'
 'Birmingham' 'Washington' 'Los Angeles' 'Phoenix' 'Colorado Springs'
 'San Francisco' 'Columbus' 'Topeka' 'El Paso' 'Corpus Christi' 'Richmond'
 'Sacramento' 'Arlington' 'Cleveland' 'Tacoma' 'San Diego' 'Springfield'
 'Chicago' 'Seattle' 'Albany' 'Kansas City' 'Denver' 'Oklahoma City'
 'Las Vegas' 'Raleigh' 'Pasadena' 'Memphis' 'Des Moines' 'Miami'
 'Portland' 'Buffalo' 'Long Beach' 'Newark' 'San Antonio' 'Indianapolis'
 'Fort Worth' 'Akron' 'New Orleans' 'Saint Louis' 'Salt Lake City'
 'Fresno' 'Cincinnati' 'Austin' 'Dover' 'San Jose' 'Trenton' 'Oakland'
 'Gretna' 'Clinton' 'Honolulu' 'Charleston' 'Dayton' 'Mobile' 'Charlotte'
 'Philadelphia' 'Madison' 'Newport' 'Lafayette' 'Pensacola' 'Jacksonville'
 'Greensboro' 'Shreveport' 'Saint Paul' 'Huntington' 'Jackson' 'Lexington'
 'Rochester' 'Minneapolis' 'Columbia' 'Louisville' 'Boston' 'New Haven'
 'Wichita' 'Bristol' 'Hillsboro' 'Hartford' 'Friendship' 'Om

#### Data Duplicate

In [7]:
# check general duplicate
print(f"Total General Duplicate: {cc_info_df.duplicated().sum()}")

Total General Duplicate: 0


In [8]:
# check specific duplicate
print(f"Total Specific Duplicate: {cc_info_df['credit_card'].duplicated().sum()}")

Total Specific Duplicate: 0


#### Null Checking

In [9]:
# check null
cc_info_df.isnull().sum()

credit_card          0
city                 0
state                0
zipcode              0
credit_card_limit    0
dtype: int64

## Credit Transactions

In [10]:
# parameter
link = {**dotenv_values('../.env.secret')}
path = link['RAW_CREDIT_TRANSACTIONS']

# filter missing values
missing_val = ['N/a', 'n/a', 'No', 'N\a', 'na', 'NA', np.nan]

# load data to df
cc_trans_df = pd.read_csv(path, sep = ',', na_values = missing_val, low_memory = False)
cc_trans_df.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719


In [11]:
# 
cc_trans_df.columns = cc_trans_df.columns.str.lower().str.replace(' ', '_')
cc_trans_df = cc_trans_df.rename(columns = {'date': 'datetime'})
cc_trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   credit_card                294588 non-null  int64  
 1   datetime                   294588 non-null  object 
 2   transaction_dollar_amount  294588 non-null  float64
 3   long                       294588 non-null  float64
 4   lat                        294588 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 11.2+ MB


### Data Cleaning

#### Data Duplicate

In [12]:
# check general duplicate
print(f"Total General Duplicate: {cc_trans_df.duplicated().sum()}")

Total General Duplicate: 0


In [13]:
# check specific duplicate
print(f"Total Specific Duplicate: {cc_trans_df['datetime'].duplicated().sum()}")

Total Specific Duplicate: 13523


In [14]:
# see duplicate data
duplicates_subset = cc_trans_df[cc_trans_df.duplicated(subset = ['datetime', 'credit_card'], keep = False)]
duplicates_subset = duplicates_subset.sort_values(by = ['datetime', 'credit_card'], ascending = True)
duplicates_subset.head(10)

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat
69871,3253141560871065,2015-08-01 20:56:37,86.84,47.0974,26.228185
70172,3253141560871065,2015-08-01 20:56:37,32.62,-84.480998,39.119678
41771,2238144513466760,2015-08-01 22:16:36,31.57,-78.870381,42.847171
41933,2238144513466760,2015-08-01 22:16:36,25.1,-78.915551,42.888953
30944,1981294676906345,2015-08-07 19:49:41,50.82,-70.293991,43.587676
30961,1981294676906345,2015-08-07 19:49:41,38.61,-70.288511,43.662232
143776,5275410446848007,2015-08-13 19:52:24,68.29,-80.141105,40.25261
143933,5275410446848007,2015-08-13 19:52:24,83.2,-80.210411,40.279128
263107,8955392958618753,2015-08-21 16:32:58,92.85,-72.169272,43.169368
263346,8955392958618753,2015-08-21 16:32:58,56.04,81.594738,15.327278


#### Null Checking

In [15]:
# check null
cc_trans_df.isnull().sum()

credit_card                  0
datetime                     0
transaction_dollar_amount    0
long                         0
lat                          0
dtype: int64

# Combine Dataset

In [16]:
cc_df = cc_trans_df.merge(right = cc_info_df, how = 'inner', on = 'credit_card')
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   credit_card                294588 non-null  int64  
 1   datetime                   294588 non-null  object 
 2   transaction_dollar_amount  294588 non-null  float64
 3   long                       294588 non-null  float64
 4   lat                        294588 non-null  float64
 5   city                       294588 non-null  object 
 6   state                      294588 non-null  object 
 7   zipcode                    294588 non-null  int64  
 8   credit_card_limit          294588 non-null  int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 20.2+ MB


In [17]:
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,Houston,PA,15342,20000
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,Houston,PA,15342,20000
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,Houston,PA,15342,20000
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,Houston,PA,15342,20000
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,Houston,PA,15342,20000


In [18]:
check_cat = cc_df.select_dtypes(include = ['object'])
check_cat.head()

Unnamed: 0,datetime,city,state
0,2015-09-11 00:32:40,Houston,PA
1,2015-10-24 22:23:08,Houston,PA
2,2015-10-26 18:19:36,Houston,PA
3,2015-10-22 19:41:10,Houston,PA
4,2015-10-26 20:08:22,Houston,PA


In [19]:
for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{check_cat[i].value_counts()} \n')
    print(f'{"-" * 50} \n')

DATETIME 	: ['2015-09-11 00:32:40' '2015-10-24 22:23:08' '2015-10-26 18:19:36' ...
 '2015-08-06 21:00:13' '2015-09-22 16:15:47' '2015-08-27 18:08:24'] 

datetime
2015-09-12 19:16:55    4
2015-08-29 20:21:58    4
2015-08-21 19:48:43    4
2015-09-04 19:31:28    4
2015-09-24 17:37:09    4
                      ..
2015-09-18 15:12:33    1
2015-10-29 20:34:37    1
2015-08-05 20:55:49    1
2015-09-05 19:58:28    1
2015-08-27 18:08:24    1
Name: count, Length: 281065, dtype: int64 

-------------------------------------------------- 

CITY 	: ['Houston' 'Washington' 'Charlotte' 'Dallas' 'Seattle' 'Buffalo'
 'New York' 'El Paso' 'Louisville' 'Richmond' 'Miami' 'Tampa' 'Arlington'
 'Oakland' 'Columbus' 'Cedar Rapids' 'Atlanta' 'Memphis' 'Wichita'
 'Clinton' 'Pittsburgh' 'Oklahoma City' 'Cincinnati' 'Auburn' 'Charleston'
 'Kansas City' 'Springfield' 'Jackson' 'Sacramento' 'Rochester'
 'Las Vegas' 'Madison' 'Austin' 'Saint Paul' 'Boston' 'Roanoke' 'Portland'
 'Somerset' 'Quitman' 'San Francisco' 

## Data Manipulation

### Timestamp Derivative

In [20]:
# Change Data Type
cc_df['datetime'] = pd.to_datetime(cc_df['datetime'], format = 'mixed', dayfirst = True)

# add timestamp derivation
cc_df['date'] = cc_df['datetime'].dt.date
cc_df['year'] = cc_df['datetime'].dt.year
cc_df['month'] = cc_df['datetime'].dt.month_name().str.lower()

cc_df['quarter'] = pd.PeriodIndex(cc_df['datetime'], freq = 'Q')
cc_df['quarter'] = cc_df['quarter'].astype(str)

cc_df['day'] = cc_df['datetime'].dt.day_name().str.lower()

cc_df['time'] = cc_df['datetime'].dt.time

weekday_list = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
cc_df['week_cat'] = np.where(cc_df['day'].isin(weekday_list), 'weekday', 'weekend')

cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 16 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                294588 non-null  int64         
 1   datetime                   294588 non-null  datetime64[ns]
 2   transaction_dollar_amount  294588 non-null  float64       
 3   long                       294588 non-null  float64       
 4   lat                        294588 non-null  float64       
 5   city                       294588 non-null  object        
 6   state                      294588 non-null  object        
 7   zipcode                    294588 non-null  int64         
 8   credit_card_limit          294588 non-null  int64         
 9   date                       294588 non-null  object        
 10  year                       294588 non-null  int32         
 11  month                      294588 non-null  object  

In [21]:
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,date,year,month,quarter,day,time,week_cat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,Houston,PA,15342,20000,2015-09-11,2015,september,2015Q3,friday,00:32:40,weekday
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,Houston,PA,15342,20000,2015-10-24,2015,october,2015Q4,saturday,22:23:08,weekend
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,Houston,PA,15342,20000,2015-10-26,2015,october,2015Q4,monday,18:19:36,weekday
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,Houston,PA,15342,20000,2015-10-22,2015,october,2015Q4,thursday,19:41:10,weekday
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,Houston,PA,15342,20000,2015-10-26,2015,october,2015Q4,monday,20:08:22,weekday


### Data Enrichment

In [22]:
# Mapping for Seasons
season_mapping = {
    'spring': [('03-01', '05-31')],
    'summer': [('06-01', '08-31')],
    'fall': [('09-01', '11-30')],
    'winter': [('12-01', '12-31'), ('01-01', '02-29')]  # consider leap year
}

def get_season(date):
    month_day = date.strftime('%m-%d')
    
    for season, ranges in season_mapping.items():
        for start, end in ranges:
            if (month_day >= start and month_day <= end) or \
               (start > end and (month_day >= start or month_day <= end)):  # range that crosses the end of the year
                return season

cc_df['season'] = cc_df['datetime'].apply(get_season)

print(cc_df['season'].unique())
cc_df['season'].value_counts()

['fall' 'summer']


season
fall      191040
summer    103548
Name: count, dtype: int64

In [23]:
# Kategori berdasarkan limit
bins = [10000, 20000, 30000, 40000]
categories = ["very_low", "low", "medium", "high", "very_high"]

# Menggunakan np.digitize untuk mengkategorikan
credit_categories = np.digitize(cc_df['credit_card_limit'], bins)

# Mengubah indeks ke kategori yang sesuai
cc_df['limit_cat'] = np.array(categories)[credit_categories]
cc_df['limit_cat'].value_counts()

limit_cat
low          135810
medium        71883
very_low      64057
high          18674
very_high      4164
Name: count, dtype: int64

### Data Type Filter

In [24]:
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,date,year,month,quarter,day,time,week_cat,season,limit_cat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,Houston,PA,15342,20000,2015-09-11,2015,september,2015Q3,friday,00:32:40,weekday,fall,medium
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,Houston,PA,15342,20000,2015-10-24,2015,october,2015Q4,saturday,22:23:08,weekend,fall,medium
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,Houston,PA,15342,20000,2015-10-26,2015,october,2015Q4,monday,18:19:36,weekday,fall,medium
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,Houston,PA,15342,20000,2015-10-22,2015,october,2015Q4,thursday,19:41:10,weekday,fall,medium
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,Houston,PA,15342,20000,2015-10-26,2015,october,2015Q4,monday,20:08:22,weekday,fall,medium


In [25]:
# string
string_columns = cc_df.select_dtypes(include = ['object']).columns.tolist()

# Kecualikan kolom 'quartal' dan 'weekday'
excluded_columns = ['quarter', 'week_cat', 'season', 'limit_cat']

string_columns = [col for col in string_columns if col not in excluded_columns]
print(f'String Columns: \n{string_columns}')

String Columns: 
['city', 'state', 'date', 'month', 'day', 'time']


In [26]:
def process_strings(df, columns):
    for column in columns:
        if column in df.columns:
            df[column] = df[column].astype(str)  # Pastikan tipe datanya string
            df[column] = df[column].str.lower()  # Ubah semua karakter ke huruf kecil
            df[column] = df[column].str.replace(r'\s{2,}', ' ', regex = True)  # Ganti spasi berlebih dengan satu spasi
    return df

cc_df = process_strings(cc_df, string_columns)
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,date,year,month,quarter,day,time,week_cat,season,limit_cat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,houston,pa,15342,20000,2015-09-11,2015,september,2015Q3,friday,00:32:40,weekday,fall,medium
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,houston,pa,15342,20000,2015-10-24,2015,october,2015Q4,saturday,22:23:08,weekend,fall,medium
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,houston,pa,15342,20000,2015-10-26,2015,october,2015Q4,monday,18:19:36,weekday,fall,medium
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,houston,pa,15342,20000,2015-10-22,2015,october,2015Q4,thursday,19:41:10,weekday,fall,medium
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,houston,pa,15342,20000,2015-10-26,2015,october,2015Q4,monday,20:08:22,weekday,fall,medium


### Data Re-Structure

In [27]:
cc_df.columns

Index(['credit_card', 'datetime', 'transaction_dollar_amount', 'long', 'lat',
       'city', 'state', 'zipcode', 'credit_card_limit', 'date', 'year',
       'month', 'quarter', 'day', 'time', 'week_cat', 'season', 'limit_cat'],
      dtype='object')

In [28]:
re_col = [
    'credit_card', 
    'datetime', 
    'long', 
    'lat', 
    'zipcode', 
    'state', 
    'city', 
    'date', 
    'year', 
    'quarter', 
    'month', 
    'season', 
    'week_cat', 
    'day', 
    'time', 
    'credit_card_limit', 
    'limit_cat', 
    'transaction_dollar_amount'
]

cc_df = cc_df[re_col]
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                294588 non-null  int64         
 1   datetime                   294588 non-null  datetime64[ns]
 2   long                       294588 non-null  float64       
 3   lat                        294588 non-null  float64       
 4   zipcode                    294588 non-null  int64         
 5   state                      294588 non-null  object        
 6   city                       294588 non-null  object        
 7   date                       294588 non-null  object        
 8   year                       294588 non-null  int32         
 9   quarter                    294588 non-null  object        
 10  month                      294588 non-null  object        
 11  season                     294588 non-null  object  

## Fraud Indication

In [29]:
# see duplicate data
duplicates_subset = cc_df[cc_df.duplicated(subset = ['datetime', 'credit_card'], keep = False)]
duplicates_subset = duplicates_subset.sort_values(by = ['datetime', 'credit_card'], ascending = True)

print(f'Total duplicate data: {len(duplicates_subset)}')

Total duplicate data: 34


In [30]:
duplicates_subset.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount
69871,3253141560871065,2015-08-01 20:56:37,47.0974,26.228185,45201,oh,cincinnati,2015-08-01,2015,2015Q3,august,summer,weekend,saturday,20:56:37,30000,high,86.84
70172,3253141560871065,2015-08-01 20:56:37,-84.480998,39.119678,45201,oh,cincinnati,2015-08-01,2015,2015Q3,august,summer,weekend,saturday,20:56:37,30000,high,32.62
41771,2238144513466760,2015-08-01 22:16:36,-78.870381,42.847171,14201,ny,buffalo,2015-08-01,2015,2015Q3,august,summer,weekend,saturday,22:16:36,10000,low,31.57
41933,2238144513466760,2015-08-01 22:16:36,-78.915551,42.888953,14201,ny,buffalo,2015-08-01,2015,2015Q3,august,summer,weekend,saturday,22:16:36,10000,low,25.1
30944,1981294676906345,2015-08-07 19:49:41,-70.293991,43.587676,4101,me,portland,2015-08-07,2015,2015Q3,august,summer,weekday,friday,19:49:41,5000,very_low,50.82


### Check Transactions

In [31]:
# Check double transactions
cc_df['transaction_count'] = duplicates_subset.groupby(['credit_card', 'datetime'])['datetime'].transform('count')
cc_df['transaction_count'] = cc_df['transaction_count'].fillna(1)

# 
suspicious_transactions = cc_df.query('transaction_count > 1')
print(f'Total suspicious rows: {len(suspicious_transactions)}')

Total suspicious rows: 34


In [32]:
suspicious_transactions.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count
18252,1489635379898079,2015-09-19 21:00:51,-94.687903,39.122112,64101,mo,kansas city,2015-09-19,2015,2015Q3,september,fall,weekend,saturday,21:00:51,7000,very_low,57.55,2.0
18260,1489635379898079,2015-09-19 21:00:51,-94.585836,39.069871,64101,mo,kansas city,2015-09-19,2015,2015Q3,september,fall,weekend,saturday,21:00:51,7000,very_low,24.67,2.0
30944,1981294676906345,2015-08-07 19:49:41,-70.293991,43.587676,4101,me,portland,2015-08-07,2015,2015Q3,august,summer,weekday,friday,19:49:41,5000,very_low,50.82,2.0
30961,1981294676906345,2015-08-07 19:49:41,-70.288511,43.662232,4101,me,portland,2015-08-07,2015,2015Q3,august,summer,weekday,friday,19:49:41,5000,very_low,38.61,2.0
33143,2015751582522409,2015-10-17 19:05:15,-72.133272,43.115703,3280,nh,washington,2015-10-17,2015,2015Q4,october,fall,weekend,saturday,19:05:15,20000,medium,6.46,2.0


### Time Based

In [33]:
# 
cc_df['prev_time'] = cc_df.groupby('credit_card')['datetime'].shift(1)
cc_df['prev_time'] = cc_df['prev_time'].fillna(cc_df["datetime"])

# 
cc_df["time_diff_hour"] = abs(round(((cc_df["datetime"] - cc_df["prev_time"]).dt.total_seconds()) / 3600, 2))
cc_df["time_diff_hour"] = cc_df["time_diff_hour"].fillna(0)

cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,prev_time,time_diff_hour
0,1003715054175576,2015-09-11 00:32:40,-80.174132,40.26737,15342,pa,houston,2015-09-11,2015,2015Q3,september,fall,weekday,friday,00:32:40,20000,medium,43.78,1.0,2015-09-11 00:32:40,0.0
1,1003715054175576,2015-10-24 22:23:08,-80.19424,40.180114,15342,pa,houston,2015-10-24,2015,2015Q4,october,fall,weekend,saturday,22:23:08,20000,medium,103.15,1.0,2015-09-11 00:32:40,1053.84
2,1003715054175576,2015-10-26 18:19:36,-80.211033,40.313004,15342,pa,houston,2015-10-26,2015,2015Q4,october,fall,weekday,monday,18:19:36,20000,medium,48.55,1.0,2015-10-24 22:23:08,43.94
3,1003715054175576,2015-10-22 19:41:10,-80.174138,40.290895,15342,pa,houston,2015-10-22,2015,2015Q4,october,fall,weekday,thursday,19:41:10,20000,medium,136.18,1.0,2015-10-26 18:19:36,94.64
4,1003715054175576,2015-10-26 20:08:22,-80.23872,40.166719,15342,pa,houston,2015-10-26,2015,2015Q4,october,fall,weekday,monday,20:08:22,20000,medium,71.82,1.0,2015-10-22 19:41:10,96.45


### Geolocation Based

In [34]:
def calculate_distance(row):
    if pd.isnull(row['prev_long']) or pd.isnull(row['prev_lat']):
        return 0
    
    current = (row['lat'], row['long'])
    previous = (row['prev_lat'], row['prev_long'])
    
    return geodesic(current, previous).kilometers

# 
cc_df['prev_long'] = cc_df.groupby('credit_card')['long'].shift(1)
cc_df['prev_long'] = cc_df['prev_long'].fillna(0)

# 
cc_df['prev_lat'] = cc_df.groupby('credit_card')['lat'].shift(1)
cc_df['prev_lat'] = cc_df['prev_lat'].fillna(0)

In [35]:
# 
cc_df['distance_km'] = cc_df.apply(calculate_distance, axis = 1)
cc_df['distance_km'] = cc_df['distance_km'].fillna(0)

In [36]:
# 
cc_df['geo_cat'] = np.where((cc_df['distance_km'] > 500) | (cc_df["time_diff_hour"] < 1), 'anomaly', 'normal')
cc_df.tail()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,prev_time,time_diff_hour,prev_long,prev_lat,distance_km,geo_cat
294583,9999757432802760,2015-09-10 19:43:33,-82.360952,32.978497,30434,ga,louisville,2015-09-10,2015,2015Q3,september,fall,weekday,thursday,19:43:33,6000,very_low,127.23,1.0,2015-10-23 20:47:23,1033.06,-82.443294,32.991054,7.821455,normal
294584,9999757432802760,2015-08-06 21:00:13,-82.322721,33.061071,30434,ga,louisville,2015-08-06,2015,2015Q3,august,summer,weekday,thursday,21:00:13,6000,very_low,84.9,1.0,2015-09-10 19:43:33,838.72,-82.360952,32.978497,9.829853,normal
294585,9999757432802760,2015-09-22 16:15:47,-82.44213,32.949983,30434,ga,louisville,2015-09-22,2015,2015Q3,september,fall,weekday,tuesday,16:15:47,6000,very_low,77.54,1.0,2015-08-06 21:00:13,1123.26,-82.322721,33.061071,16.622254,normal
294586,9999757432802760,2015-08-27 18:08:24,-82.398587,32.976162,30434,ga,louisville,2015-08-27,2015,2015Q3,august,summer,weekday,thursday,18:08:24,6000,very_low,144.05,1.0,2015-09-22 16:15:47,622.12,-82.44213,32.949983,5.000261,normal
294587,9999757432802760,2015-08-22 00:14:52,-82.38968,33.068351,30434,ga,louisville,2015-08-22,2015,2015Q3,august,summer,weekend,saturday,00:14:52,6000,very_low,154.36,1.0,2015-08-27 18:08:24,137.89,-82.398587,32.976162,10.258021,normal


In [37]:
cc_df["speed_km/h"] = round(cc_df["distance_km"] / cc_df["time_diff_hour"], 2)  # km/jam
cc_df["speed_km/h"] = cc_df["speed_km/h"].fillna(0)

cc_df.tail()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,prev_time,time_diff_hour,prev_long,prev_lat,distance_km,geo_cat,speed_km/h
294583,9999757432802760,2015-09-10 19:43:33,-82.360952,32.978497,30434,ga,louisville,2015-09-10,2015,2015Q3,september,fall,weekday,thursday,19:43:33,6000,very_low,127.23,1.0,2015-10-23 20:47:23,1033.06,-82.443294,32.991054,7.821455,normal,0.01
294584,9999757432802760,2015-08-06 21:00:13,-82.322721,33.061071,30434,ga,louisville,2015-08-06,2015,2015Q3,august,summer,weekday,thursday,21:00:13,6000,very_low,84.9,1.0,2015-09-10 19:43:33,838.72,-82.360952,32.978497,9.829853,normal,0.01
294585,9999757432802760,2015-09-22 16:15:47,-82.44213,32.949983,30434,ga,louisville,2015-09-22,2015,2015Q3,september,fall,weekday,tuesday,16:15:47,6000,very_low,77.54,1.0,2015-08-06 21:00:13,1123.26,-82.322721,33.061071,16.622254,normal,0.01
294586,9999757432802760,2015-08-27 18:08:24,-82.398587,32.976162,30434,ga,louisville,2015-08-27,2015,2015Q3,august,summer,weekday,thursday,18:08:24,6000,very_low,144.05,1.0,2015-09-22 16:15:47,622.12,-82.44213,32.949983,5.000261,normal,0.01
294587,9999757432802760,2015-08-22 00:14:52,-82.38968,33.068351,30434,ga,louisville,2015-08-22,2015,2015Q3,august,summer,weekend,saturday,00:14:52,6000,very_low,154.36,1.0,2015-08-27 18:08:24,137.89,-82.398587,32.976162,10.258021,normal,0.07


In [38]:
# Melebihi 80% dari limit kartu
cc_limit_sus = cc_df['transaction_dollar_amount'] > cc_df['credit_card_limit'] * 0.8

# 5x lipat lebih besar dari rata-rata transaks
cc_trx_sus = cc_df['transaction_dollar_amount'] > cc_df['transaction_dollar_amount'].mean() * 5

In [39]:
# 
cc_df['fraud'] = np.where((cc_df['geo_cat'] == 'suspicious') | 
                          (cc_df["time_diff_hour"] < 1) | 
                          (cc_df['speed_km/h'] > 50) | 
                          (cc_limit_sus) | 
                          (cc_trx_sus), 
                          'fraud', 'not_fraud')

cc_df.tail()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,prev_time,time_diff_hour,prev_long,prev_lat,distance_km,geo_cat,speed_km/h,fraud
294583,9999757432802760,2015-09-10 19:43:33,-82.360952,32.978497,30434,ga,louisville,2015-09-10,2015,2015Q3,september,fall,weekday,thursday,19:43:33,6000,very_low,127.23,1.0,2015-10-23 20:47:23,1033.06,-82.443294,32.991054,7.821455,normal,0.01,not_fraud
294584,9999757432802760,2015-08-06 21:00:13,-82.322721,33.061071,30434,ga,louisville,2015-08-06,2015,2015Q3,august,summer,weekday,thursday,21:00:13,6000,very_low,84.9,1.0,2015-09-10 19:43:33,838.72,-82.360952,32.978497,9.829853,normal,0.01,not_fraud
294585,9999757432802760,2015-09-22 16:15:47,-82.44213,32.949983,30434,ga,louisville,2015-09-22,2015,2015Q3,september,fall,weekday,tuesday,16:15:47,6000,very_low,77.54,1.0,2015-08-06 21:00:13,1123.26,-82.322721,33.061071,16.622254,normal,0.01,not_fraud
294586,9999757432802760,2015-08-27 18:08:24,-82.398587,32.976162,30434,ga,louisville,2015-08-27,2015,2015Q3,august,summer,weekday,thursday,18:08:24,6000,very_low,144.05,1.0,2015-09-22 16:15:47,622.12,-82.44213,32.949983,5.000261,normal,0.01,not_fraud
294587,9999757432802760,2015-08-22 00:14:52,-82.38968,33.068351,30434,ga,louisville,2015-08-22,2015,2015Q3,august,summer,weekend,saturday,00:14:52,6000,very_low,154.36,1.0,2015-08-27 18:08:24,137.89,-82.398587,32.976162,10.258021,normal,0.07,not_fraud


In [40]:
# check null after data manipulations
cc_df.isnull().sum()

credit_card                  0
datetime                     0
long                         0
lat                          0
zipcode                      0
state                        0
city                         0
date                         0
year                         0
quarter                      0
month                        0
season                       0
week_cat                     0
day                          0
time                         0
credit_card_limit            0
limit_cat                    0
transaction_dollar_amount    0
transaction_count            0
prev_time                    0
time_diff_hour               0
prev_long                    0
prev_lat                     0
distance_km                  0
geo_cat                      0
speed_km/h                   0
fraud                        0
dtype: int64

In [41]:
# change all infinite number
cc_df.replace([np.inf, -np.inf], 0, inplace = True)

In [42]:
# check fraud data
fraud_df = cc_df.loc[cc_df['fraud'] == 'fraud']
fraud_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,date,year,quarter,month,season,week_cat,day,time,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,prev_time,time_diff_hour,prev_long,prev_lat,distance_km,geo_cat,speed_km/h,fraud
0,1003715054175576,2015-09-11 00:32:40,-80.174132,40.26737,15342,pa,houston,2015-09-11,2015,2015Q3,september,fall,weekday,friday,00:32:40,20000,medium,43.78,1.0,2015-09-11 00:32:40,0.0,0.0,0.0,9177.582847,anomaly,0.0,fraud
14,1003715054175576,2015-09-18 21:44:21,69.475216,25.125749,15342,pa,houston,2015-09-18,2015,2015Q3,september,fall,weekday,friday,21:44:21,20000,medium,146.75,1.0,2015-09-25 20:08:29,166.4,-80.279679,40.295635,12116.951132,anomaly,72.82,fraud
15,1003715054175576,2015-09-27 21:31:58,-80.145917,40.278721,15342,pa,houston,2015-09-27,2015,2015Q3,september,fall,weekend,sunday,21:31:58,20000,medium,111.84,1.0,2015-09-18 21:44:21,215.79,69.475216,25.125749,12113.120145,anomaly,56.13,fraud
84,1003715054175576,2015-09-11 19:50:02,-80.12676,40.225626,15342,pa,houston,2015-09-11,2015,2015Q3,september,fall,weekday,friday,19:50:02,20000,medium,995.35,1.0,2015-09-28 19:52:53,408.05,-80.144435,40.279707,6.190541,normal,0.02,fraud
114,1003715054175576,2015-09-05 20:12:57,-80.2313,40.244209,15342,pa,houston,2015-09-05,2015,2015Q3,september,fall,weekend,saturday,20:12:57,20000,medium,200.34,1.0,2015-09-08 16:43:42,68.51,34.429275,24.659874,10153.008882,anomaly,148.2,fraud


In [43]:
print(cc_df['fraud'].value_counts(normalize = True) * 100, '\n')
print(cc_df['fraud'].value_counts())

fraud
not_fraud    95.978791
fraud         4.021209
Name: proportion, dtype: float64 

fraud
not_fraud    282742
fraud         11846
Name: count, dtype: int64


# Write Data

In [44]:
cc_df.columns

Index(['credit_card', 'datetime', 'long', 'lat', 'zipcode', 'state', 'city',
       'date', 'year', 'quarter', 'month', 'season', 'week_cat', 'day', 'time',
       'credit_card_limit', 'limit_cat', 'transaction_dollar_amount',
       'transaction_count', 'prev_time', 'time_diff_hour', 'prev_long',
       'prev_lat', 'distance_km', 'geo_cat', 'speed_km/h', 'fraud'],
      dtype='object')

In [45]:
# drop unnecessary column
cc_df = cc_df.drop(columns = ['credit_card_limit', 'transaction_count'])
cc_df.columns

Index(['credit_card', 'datetime', 'long', 'lat', 'zipcode', 'state', 'city',
       'date', 'year', 'quarter', 'month', 'season', 'week_cat', 'day', 'time',
       'limit_cat', 'transaction_dollar_amount', 'prev_time', 'time_diff_hour',
       'prev_long', 'prev_lat', 'distance_km', 'geo_cat', 'speed_km/h',
       'fraud'],
      dtype='object')

## Pickle Data

In [46]:
# Tentukan folder tujuan
dir_name = 'datamart'
folder_path = f"../{dir_name}"

# Cek apakah folder sudah ada, jika belum buat foldernya
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
    print(f"Directory '{dir_name}' created successfully.")

else: 
    print(f'Directory has already been created.')

Directory has already been created.


In [47]:
# calculate dataframe size
total_size_bytes = sys.getsizeof(cc_df)

# Converse bytes to MB
total_size_mb = total_size_bytes / 1048576
print(f"Total size of DataFrame: {total_size_mb:.2f} MB")

Total size of DataFrame: 242.97 MB


In [48]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# save to pickle
with open(share['PREPARATION_DATA'], 'wb') as f:
    pickle.dump(cc_df, f)

print('Data has been saved')

Data has been saved
