# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# pickle and .env
from dotenv import dotenv_values
import pickle

# Dataset

In [2]:
# 
pd.set_option('display.max_columns', None)

## Credit Info

In [3]:
# parameter
link = {**dotenv_values('../.env.secret')}
path = link['RAW_CREDIT_INFO']

# filter missing values
missing_val = ['N/a', 'n/a', 'No', 'N\a', 'na', 'NA', np.nan]

# load data to df
cc_info_df = pd.read_csv(path, sep = ',', na_values = missing_val, low_memory = False)
cc_info_df.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit
0,1280981422329509,Dallas,PA,18612,6000
1,9737219864179988,Houston,PA,15342,16000
2,4749889059323202,Auburn,MA,1501,14000
3,9591503562024072,Orlando,WV,26412,18000
4,2095640259001271,New York,NY,10001,20000


In [4]:
# check data type
cc_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   credit_card        984 non-null    int64 
 1   city               984 non-null    object
 2   state              984 non-null    object
 3   zipcode            984 non-null    int64 
 4   credit_card_limit  984 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 38.6+ KB


### Data Cleaning

#### Category Check

In [5]:
check_cat = cc_info_df.select_dtypes(include = ['object'])
check_cat.head()

Unnamed: 0,city,state
0,Dallas,PA
1,Houston,PA
2,Auburn,MA
3,Orlando,WV
4,New York,NY


In [6]:
for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{check_cat[i].value_counts()} \n')
    print(f'{'-' * 50} \n')

CITY 	: ['Dallas' 'Houston' 'Auburn' 'Orlando' 'New York' 'Atlanta' 'Pittsburgh'
 'Birmingham' 'Washington' 'Los Angeles' 'Phoenix' 'Colorado Springs'
 'San Francisco' 'Columbus' 'Topeka' 'El Paso' 'Corpus Christi' 'Richmond'
 'Sacramento' 'Arlington' 'Cleveland' 'Tacoma' 'San Diego' 'Springfield'
 'Chicago' 'Seattle' 'Albany' 'Kansas City' 'Denver' 'Oklahoma City'
 'Las Vegas' 'Raleigh' 'Pasadena' 'Memphis' 'Des Moines' 'Miami'
 'Portland' 'Buffalo' 'Long Beach' 'Newark' 'San Antonio' 'Indianapolis'
 'Fort Worth' 'Akron' 'New Orleans' 'Saint Louis' 'Salt Lake City'
 'Fresno' 'Cincinnati' 'Austin' 'Dover' 'San Jose' 'Trenton' 'Oakland'
 'Gretna' 'Clinton' 'Honolulu' 'Charleston' 'Dayton' 'Mobile' 'Charlotte'
 'Philadelphia' 'Madison' 'Newport' 'Lafayette' 'Pensacola' 'Jacksonville'
 'Greensboro' 'Shreveport' 'Saint Paul' 'Huntington' 'Jackson' 'Lexington'
 'Rochester' 'Minneapolis' 'Columbia' 'Louisville' 'Boston' 'New Haven'
 'Wichita' 'Bristol' 'Hillsboro' 'Hartford' 'Friendship' 'Om

#### Data Duplicate

In [7]:
# check general duplicate
print(f"Total General Duplicate: {cc_info_df.duplicated().sum()}")

Total General Duplicate: 0


In [8]:
# check specific duplicate
print(f"Total Specific Duplicate: {cc_info_df['credit_card'].duplicated().sum()}")

Total Specific Duplicate: 0


#### Null Checking

In [9]:
# check null
cc_info_df.isnull().sum()

credit_card          0
city                 0
state                0
zipcode              0
credit_card_limit    0
dtype: int64

## Credit Transactions

In [10]:
# parameter
link = {**dotenv_values('../.env.secret')}
path = link['RAW_CREDIT_TRANSACTIONS']

# filter missing values
missing_val = ['N/a', 'n/a', 'No', 'N\a', 'na', 'NA', np.nan]

# load data to df
cc_trans_df = pd.read_csv(path, sep = ',', na_values = missing_val, low_memory = False)
cc_trans_df.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719


In [11]:
# 
cc_trans_df.columns = cc_trans_df.columns.str.lower().str.replace(' ', '_')
cc_trans_df = cc_trans_df.rename(columns = {'date': 'datetime'})
cc_trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   credit_card                294588 non-null  int64  
 1   datetime                   294588 non-null  object 
 2   transaction_dollar_amount  294588 non-null  float64
 3   long                       294588 non-null  float64
 4   lat                        294588 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 11.2+ MB


### Data Cleaning

#### Data Duplicate

In [12]:
# check general duplicate
print(f"Total General Duplicate: {cc_trans_df.duplicated().sum()}")

Total General Duplicate: 0


In [13]:
# check specific duplicate
print(f"Total Specific Duplicate: {cc_trans_df['datetime'].duplicated().sum()}")

Total Specific Duplicate: 13523


In [14]:
# see duplicate data
duplicates_subset = cc_trans_df[cc_trans_df.duplicated(subset = ['datetime', 'credit_card'], keep = False)]
duplicates_subset = duplicates_subset.sort_values(by = ['datetime', 'credit_card'], ascending = True)
duplicates_subset.head(10)

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat
69871,3253141560871065,2015-08-01 20:56:37,86.84,47.0974,26.228185
70172,3253141560871065,2015-08-01 20:56:37,32.62,-84.480998,39.119678
41771,2238144513466760,2015-08-01 22:16:36,31.57,-78.870381,42.847171
41933,2238144513466760,2015-08-01 22:16:36,25.1,-78.915551,42.888953
30944,1981294676906345,2015-08-07 19:49:41,50.82,-70.293991,43.587676
30961,1981294676906345,2015-08-07 19:49:41,38.61,-70.288511,43.662232
143776,5275410446848007,2015-08-13 19:52:24,68.29,-80.141105,40.25261
143933,5275410446848007,2015-08-13 19:52:24,83.2,-80.210411,40.279128
263107,8955392958618753,2015-08-21 16:32:58,92.85,-72.169272,43.169368
263346,8955392958618753,2015-08-21 16:32:58,56.04,81.594738,15.327278


#### Null Checking

In [15]:
# check null
cc_trans_df.isnull().sum()

credit_card                  0
datetime                     0
transaction_dollar_amount    0
long                         0
lat                          0
dtype: int64

# Combine Dataset

In [16]:
cc_df = cc_trans_df.merge(right = cc_info_df, how = 'inner', on = 'credit_card')
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   credit_card                294588 non-null  int64  
 1   datetime                   294588 non-null  object 
 2   transaction_dollar_amount  294588 non-null  float64
 3   long                       294588 non-null  float64
 4   lat                        294588 non-null  float64
 5   city                       294588 non-null  object 
 6   state                      294588 non-null  object 
 7   zipcode                    294588 non-null  int64  
 8   credit_card_limit          294588 non-null  int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 20.2+ MB


In [17]:
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,Houston,PA,15342,20000
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,Houston,PA,15342,20000
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,Houston,PA,15342,20000
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,Houston,PA,15342,20000
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,Houston,PA,15342,20000


In [18]:
check_cat = cc_df.select_dtypes(include = ['object'])
check_cat.head()

Unnamed: 0,datetime,city,state
0,2015-09-11 00:32:40,Houston,PA
1,2015-10-24 22:23:08,Houston,PA
2,2015-10-26 18:19:36,Houston,PA
3,2015-10-22 19:41:10,Houston,PA
4,2015-10-26 20:08:22,Houston,PA


In [19]:
for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{check_cat[i].value_counts()} \n')
    print(f'{'-' * 50} \n')

DATETIME 	: ['2015-09-11 00:32:40' '2015-10-24 22:23:08' '2015-10-26 18:19:36' ...
 '2015-08-06 21:00:13' '2015-09-22 16:15:47' '2015-08-27 18:08:24'] 

datetime
2015-09-12 19:16:55    4
2015-08-29 20:21:58    4
2015-08-21 19:48:43    4
2015-09-04 19:31:28    4
2015-09-24 17:37:09    4
                      ..
2015-09-18 15:12:33    1
2015-10-29 20:34:37    1
2015-08-05 20:55:49    1
2015-09-05 19:58:28    1
2015-08-27 18:08:24    1
Name: count, Length: 281065, dtype: int64 

-------------------------------------------------- 

CITY 	: ['Houston' 'Washington' 'Charlotte' 'Dallas' 'Seattle' 'Buffalo'
 'New York' 'El Paso' 'Louisville' 'Richmond' 'Miami' 'Tampa' 'Arlington'
 'Oakland' 'Columbus' 'Cedar Rapids' 'Atlanta' 'Memphis' 'Wichita'
 'Clinton' 'Pittsburgh' 'Oklahoma City' 'Cincinnati' 'Auburn' 'Charleston'
 'Kansas City' 'Springfield' 'Jackson' 'Sacramento' 'Rochester'
 'Las Vegas' 'Madison' 'Austin' 'Saint Paul' 'Boston' 'Roanoke' 'Portland'
 'Somerset' 'Quitman' 'San Francisco' 

In [20]:
# see duplicate data
duplicates_subset = cc_df[cc_df.duplicated(subset = ['datetime', 'credit_card'], keep = False)]
duplicates_subset = duplicates_subset.sort_values(by = ['datetime', 'credit_card'], ascending = True)
duplicates_subset.head(10)

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit
69871,3253141560871065,2015-08-01 20:56:37,86.84,47.0974,26.228185,Cincinnati,OH,45201,30000
70172,3253141560871065,2015-08-01 20:56:37,32.62,-84.480998,39.119678,Cincinnati,OH,45201,30000
41771,2238144513466760,2015-08-01 22:16:36,31.57,-78.870381,42.847171,Buffalo,NY,14201,10000
41933,2238144513466760,2015-08-01 22:16:36,25.1,-78.915551,42.888953,Buffalo,NY,14201,10000
30944,1981294676906345,2015-08-07 19:49:41,50.82,-70.293991,43.587676,Portland,ME,4101,5000
30961,1981294676906345,2015-08-07 19:49:41,38.61,-70.288511,43.662232,Portland,ME,4101,5000
143776,5275410446848007,2015-08-13 19:52:24,68.29,-80.141105,40.25261,Houston,PA,15342,18000
143933,5275410446848007,2015-08-13 19:52:24,83.2,-80.210411,40.279128,Houston,PA,15342,18000
263107,8955392958618753,2015-08-21 16:32:58,92.85,-72.169272,43.169368,Washington,NH,3280,20000
263346,8955392958618753,2015-08-21 16:32:58,56.04,81.594738,15.327278,Washington,NH,3280,20000


## Data Manipulation

### Timestamp Derivative

In [21]:
# Change Data Type
cc_df['datetime'] = pd.to_datetime(cc_df['datetime'], format = 'mixed', dayfirst = True)
cc_df['date'] = pd.to_datetime(cc_df['datetime'].dt.date, format = 'mixed', dayfirst = True)
cc_df['time'] = cc_df['datetime'].dt.strftime('%H:%M:%S')

# add timestamp derivation
cc_df['year'] = cc_df['date'].dt.year
cc_df['month'] = cc_df['date'].dt.month

cc_df['first_day_month'] = pd.to_datetime(cc_df[['year', 'month']].assign(DAY = 1))

cc_df['quarter'] = pd.PeriodIndex(cc_df['first_day_month'], freq = 'Q')
cc_df['quarter'] = cc_df['quarter'].astype(str)

cc_df['day_cat'] = cc_df['date'].dt.day_of_week

day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
cc_df['day'] = cc_df['day_cat'].map(day_mapping)

weekday_list = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
cc_df.loc[cc_df['day'].isin(weekday_list), 'week_cat'] = 'weekday'
cc_df.loc[~cc_df['day'].isin(weekday_list), 'week_cat'] = 'weekend'

cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                294588 non-null  int64         
 1   datetime                   294588 non-null  datetime64[ns]
 2   transaction_dollar_amount  294588 non-null  float64       
 3   long                       294588 non-null  float64       
 4   lat                        294588 non-null  float64       
 5   city                       294588 non-null  object        
 6   state                      294588 non-null  object        
 7   zipcode                    294588 non-null  int64         
 8   credit_card_limit          294588 non-null  int64         
 9   date                       294588 non-null  datetime64[ns]
 10  time                       294588 non-null  object        
 11  year                       294588 non-null  int32   

### Data Enrichment

In [22]:
# Mapping for Seasons
season_mapping = {
    'dry': [('04-10', '10-09')],
    'rainy': [('10-10', '12-31'), ('01-01', '04-09')]  # consider leap year
}

def get_season(date):
    month_day = date.strftime('%m-%d')
    
    for season, ranges in season_mapping.items():
        for start, end in ranges:
            if (month_day >= start and month_day <= end) or \
               (start > end and (month_day >= start or month_day <= end)):  # range that crosses the end of the year
                return season

cc_df['season'] = cc_df['date'].apply(get_season)

print(cc_df['season'].unique())
cc_df['season'].value_counts()

['dry' 'rainy']


season
dry      231290
rainy     63298
Name: count, dtype: int64

### Data Type Filter

In [23]:
# string
string_columns = cc_df.select_dtypes(include = ['object']).columns.tolist()

# Kecualikan kolom 'quartal' dan 'weekday'
excluded_columns = ['quarter', 'week_cat', 'season', 'cat_lev']

string_columns = [col for col in string_columns if col not in excluded_columns]
print(f'String Columns: \n{string_columns}')

String Columns: 
['city', 'state', 'time', 'day']


In [24]:
def process_strings(df, columns):
    for column in columns:
        if column in df.columns:
            df[column] = df[column].str.lower()  # Ubah semua karakter ke huruf kecil
            df[column] = df[column].str.replace(r'\s{2,}', ' ', regex = True)  # Ganti spasi berlebih dengan satu spasi
    return df

cc_df = process_strings(cc_df, string_columns)
cc_df.head()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,date,time,year,month,first_day_month,quarter,day_cat,day,week_cat,season
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,houston,pa,15342,20000,2015-09-11,00:32:40,2015,9,2015-09-01,2015Q3,4,friday,weekday,dry
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,houston,pa,15342,20000,2015-10-24,22:23:08,2015,10,2015-10-01,2015Q4,5,saturday,weekend,rainy
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,houston,pa,15342,20000,2015-10-26,18:19:36,2015,10,2015-10-01,2015Q4,0,monday,weekday,rainy
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,houston,pa,15342,20000,2015-10-22,19:41:10,2015,10,2015-10-01,2015Q4,3,thursday,weekday,rainy
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,houston,pa,15342,20000,2015-10-26,20:08:22,2015,10,2015-10-01,2015Q4,0,monday,weekday,rainy


### Data Re-Structure

In [25]:
re_col = [
    'credit_card', 
    'datetime', 
    'long', 
    'lat', 
    'zipcode', 
    'state', 
    'city', 
    'date', 
    'year', 
    'quarter', 
    'month', 
    'first_day_month', 
    'season', 
    'week_cat', 
    'day', 
    'time', 
    'credit_card_limit', 
    'transaction_dollar_amount'
]

cc_df = cc_df[re_col]
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                294588 non-null  int64         
 1   datetime                   294588 non-null  datetime64[ns]
 2   long                       294588 non-null  float64       
 3   lat                        294588 non-null  float64       
 4   zipcode                    294588 non-null  int64         
 5   state                      294588 non-null  object        
 6   city                       294588 non-null  object        
 7   date                       294588 non-null  datetime64[ns]
 8   year                       294588 non-null  int32         
 9   quarter                    294588 non-null  object        
 10  month                      294588 non-null  int32         
 11  first_day_month            294588 non-null  datetime

# Write Data

In [26]:
cc_df.columns

Index(['credit_card', 'datetime', 'long', 'lat', 'zipcode', 'state', 'city',
       'date', 'year', 'quarter', 'month', 'first_day_month', 'season',
       'week_cat', 'day', 'time', 'credit_card_limit',
       'transaction_dollar_amount'],
      dtype='object')

In [27]:
# Tentukan folder tujuan
dir_name = 'datamart'
folder_path = f"../{dir_name}"

# Cek apakah folder sudah ada, jika belum buat foldernya
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
    print(f"Directory '{dir_name}' created successfully.")

else: 
    print(f'Directory has already been created.')

Directory has already been created.


In [28]:
# calculate dataframe size
total_size_bytes = sys.getsizeof(cc_df)

# Converse bytes to MB
total_size_mb = total_size_bytes / 1048576
print(f"Total size of DataFrame: {total_size_mb:.2f} MB")

Total size of DataFrame: 130.77 MB


In [29]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# save to pickle
with open(share['PREPARATION_DATA'], 'wb') as f:
    pickle.dump(cc_df, f)

print('Data has been saved')

Data has been saved


# Another

In [30]:
# Menambahkan kolom penghitung transaksi berdasarkan credit_card dan transaction_dollar_amount
duplicates_subset['transaction_count'] = duplicates_subset.groupby(['credit_card', 'datetime'])['datetime'].transform('count')
duplicates_subset.head(10)

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,transaction_count
69871,3253141560871065,2015-08-01 20:56:37,86.84,47.0974,26.228185,Cincinnati,OH,45201,30000,2
70172,3253141560871065,2015-08-01 20:56:37,32.62,-84.480998,39.119678,Cincinnati,OH,45201,30000,2
41771,2238144513466760,2015-08-01 22:16:36,31.57,-78.870381,42.847171,Buffalo,NY,14201,10000,2
41933,2238144513466760,2015-08-01 22:16:36,25.1,-78.915551,42.888953,Buffalo,NY,14201,10000,2
30944,1981294676906345,2015-08-07 19:49:41,50.82,-70.293991,43.587676,Portland,ME,4101,5000,2
30961,1981294676906345,2015-08-07 19:49:41,38.61,-70.288511,43.662232,Portland,ME,4101,5000,2
143776,5275410446848007,2015-08-13 19:52:24,68.29,-80.141105,40.25261,Houston,PA,15342,18000,2
143933,5275410446848007,2015-08-13 19:52:24,83.2,-80.210411,40.279128,Houston,PA,15342,18000,2
263107,8955392958618753,2015-08-21 16:32:58,92.85,-72.169272,43.169368,Washington,NH,3280,20000,2
263346,8955392958618753,2015-08-21 16:32:58,56.04,81.594738,15.327278,Washington,NH,3280,20000,2


In [31]:
suspicious_df = duplicates_subset.query('transaction_count > 1')
suspicious_df = suspicious_df.reset_index(drop = True)

print(f'Total suspicious rows: {len(suspicious_df)}')
suspicious_df.tail()

Total suspicious rows: 34


Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,transaction_count
29,7338934618553557,2015-10-18 17:27:18,42.96,-67.064944,18.459796,San Antonio,PR,690,8000,2
30,2246920751887814,2015-10-21 18:15:59,49.46,-81.373006,38.148168,Miami,WV,25134,9000,2
31,2246920751887814,2015-10-21 18:15:59,59.42,-81.514148,38.21279,Miami,WV,25134,9000,2
32,7053196367895112,2015-10-28 21:34:58,107.37,-73.947176,40.796738,New York,NY,10001,18000,2
33,7053196367895112,2015-10-28 21:34:58,101.92,-73.924052,40.722311,New York,NY,10001,18000,2


In [32]:
# 
suspicious_df['datetime'] = pd.to_datetime(suspicious_df['datetime'], format = 'mixed', dayfirst = True)

# based on transactions durations
suspicious_df['time_diff'] = suspicious_df.groupby('credit_card')['datetime'].diff().dt.total_seconds()
suspicious_df['time_diff'] = suspicious_df['time_diff'].fillna(0)
suspicious_df.tail()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,transaction_count,time_diff
29,7338934618553557,2015-10-18 17:27:18,42.96,-67.064944,18.459796,San Antonio,PR,690,8000,2,0.0
30,2246920751887814,2015-10-21 18:15:59,49.46,-81.373006,38.148168,Miami,WV,25134,9000,2,0.0
31,2246920751887814,2015-10-21 18:15:59,59.42,-81.514148,38.21279,Miami,WV,25134,9000,2,0.0
32,7053196367895112,2015-10-28 21:34:58,107.37,-73.947176,40.796738,New York,NY,10001,18000,2,0.0
33,7053196367895112,2015-10-28 21:34:58,101.92,-73.924052,40.722311,New York,NY,10001,18000,2,0.0


In [33]:
# based on geolocation
from geopy.distance import geodesic

def calculate_distance(row):
    if pd.isnull(row['prev_long']) or pd.isnull(row['prev_lat']):
        return 0
    current = (row['lat'], row['long'])
    previous = (row['prev_lat'], row['prev_long'])
    return geodesic(current, previous).kilometers

suspicious_df['prev_long'] = suspicious_df.groupby('credit_card')['long'].shift(1)
suspicious_df['prev_long'] = suspicious_df['prev_long'].fillna(0)

suspicious_df['prev_lat'] = suspicious_df.groupby('credit_card')['lat'].shift(1)
suspicious_df['prev_lat'] = suspicious_df['prev_lat'].fillna(0)

suspicious_df['distance'] = suspicious_df.apply(calculate_distance, axis=1)
suspicious_df['distance'] = suspicious_df['distance'].fillna(0)

suspicious_df.tail()

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,transaction_count,time_diff,prev_long,prev_lat,distance
29,7338934618553557,2015-10-18 17:27:18,42.96,-67.064944,18.459796,San Antonio,PR,690,8000,2,0.0,-67.090815,18.407574,6.393787
30,2246920751887814,2015-10-21 18:15:59,49.46,-81.373006,38.148168,Miami,WV,25134,9000,2,0.0,0.0,0.0,9257.178155
31,2246920751887814,2015-10-21 18:15:59,59.42,-81.514148,38.21279,Miami,WV,25134,9000,2,0.0,-81.373006,38.148168,14.296202
32,7053196367895112,2015-10-28 21:34:58,107.37,-73.947176,40.796738,New York,NY,10001,18000,2,0.0,0.0,0.0,8664.232031
33,7053196367895112,2015-10-28 21:34:58,101.92,-73.924052,40.722311,New York,NY,10001,18000,2,0.0,-73.947176,40.796738,8.492578


In [34]:
anomaly_geo_transactions = suspicious_df[(suspicious_df['distance'] > 500) & (suspicious_df['time_diff'] < 3600)]
anomaly_geo_transactions

Unnamed: 0,credit_card,datetime,transaction_dollar_amount,long,lat,city,state,zipcode,credit_card_limit,transaction_count,time_diff,prev_long,prev_lat,distance
0,3253141560871065,2015-08-01 20:56:37,86.84,47.0974,26.228185,Cincinnati,OH,45201,30000,2,0.0,0.0,0.0,5821.115098
1,3253141560871065,2015-08-01 20:56:37,32.62,-84.480998,39.119678,Cincinnati,OH,45201,30000,2,0.0,47.0974,26.228185,11200.932054
2,2238144513466760,2015-08-01 22:16:36,31.57,-78.870381,42.847171,Buffalo,NY,14201,10000,2,0.0,0.0,0.0,9103.759742
4,1981294676906345,2015-08-07 19:49:41,50.82,-70.293991,43.587676,Portland,ME,4101,5000,2,0.0,0.0,0.0,8434.097291
6,5275410446848007,2015-08-13 19:52:24,68.29,-80.141105,40.25261,Houston,PA,15342,18000,2,0.0,0.0,0.0,9174.611611
8,8955392958618753,2015-08-21 16:32:58,92.85,-72.169272,43.169368,Washington,NH,3280,20000,2,0.0,0.0,0.0,8571.736999
9,8955392958618753,2015-08-21 16:32:58,56.04,81.594738,15.327278,Washington,NH,3280,20000,2,0.0,-72.169272,43.169368,12998.994072
10,9246395935309352,2015-08-22 21:08:34,118.2,-74.687392,40.289514,Trenton,NJ,8601,30000,2,0.0,0.0,0.0,8715.9837
12,8325153881531522,2015-08-22 22:07:54,152.74,-111.95135,40.770776,Salt Lake City,UT,84101,20000,2,0.0,0.0,0.0,11844.313373
14,5013753958362344,2015-09-04 18:52:22,78.56,-72.028454,43.117755,Washington,NH,3280,12000,2,0.0,0.0,0.0,8559.322692
