In [1]:
import pandas as pd
import json
import numpy as np  

In [None]:
# Read files


# ----- Load CSV file -----
transaction = pd.read_csv('csv_files/transactions_data_south_africa.csv')
user = pd.read_csv('csv_files/user_data_south_africa.csv')
card = pd.read_csv('csv_files/cards_data_south_africa.csv')

# ----- Load MCC JSON file -----
mcc_series = pd.read_json('json_files/mcc_codes.json', typ='series')
mcc_df = mcc_series.reset_index()
mcc_df.columns = ['mcc_code', 'description']
# save as csv
mcc_codes = mcc_df.to_csv('csv_files/mcc_codes.csv', index=False)
# Read csv
mcc_codes = pd.read_csv('csv_files/mcc_codes.csv')

# ----- Load fraud labels JSON -----
with open('json_files/train_fraud_labels.json', 'r') as f:
    raw_json_data = json.load(f)

transaction_labels_dict = raw_json_data['target']

train_fraud_labels = pd.Series(transaction_labels_dict).reset_index()
train_fraud_labels.columns = ['transaction_id', 'is_fraud']
train_fraud_labels['transaction_id'] = pd.to_numeric(train_fraud_labels['transaction_id'])
# Json to csv
train_fraud_labels.to_csv('csv_files/train_fraud_labels.csv', index=False)
# Read csv
train_fraud_labels = pd.read_csv('csv_files/train_fraud_labels.csv')

shape = {
        'Transactoins:' : transaction.shape, 
        'Users:': user.shape,
        'Cards:': card.shape,
        'Train raude label:' : train_fraud_labels.shape,
        'mcc Codes:': mcc_codes.shape
        }

# change mcc colomn to integer
transaction['mcc'] = pd.to_numeric(transaction['mcc'], errors='coerce').fillna(0).astype(np.int64)
#transaction.to_csv('transaction.csv', index=False)
transactions = pd.read_csv('transaction.csv')
shape_df = pd.DataFrame(shape, index=['Rows', 'Columns']).T
display(shape_df)

    

In [None]:
print(f"\nMissing values in transaction:")
print(transactions.isnull().sum())
print(transactions.columns.tolist())
print(transactions.info())
transactions.head()


Missing values in transaction:
id                       0
date                     0
client_id                0
card_id                  0
amount                   0
use_chip                 0
merchant_id              0
merchant_city            0
merchant_state     1509523
zip                1509523
mcc                      0
errors            12654526
dtype: int64
['id', 'date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12858754 entries, 0 to 12858753
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   id              int64  
 1   date            object 
 2   client_id       int64  
 3   card_id         int64  
 4   amount          float64
 5   use_chip        object 
 6   merchant_id     int64  
 7   merchant_city   object 
 8   merchant_state  object 
 9   zip             float64
 10  mcc             int64  
 11 

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors
0,7475327,2022-02-19 05:51:55,1556,2972,-1386.0,Swipe Transaction,59935,Kimberley,Northern Cape,8300.0,5499,
1,7475328,2023-01-13 02:58:58,561,4575,262.26,Swipe Transaction,67570,Pietermaritzburg,KwaZulu-Natal,3200.0,5311,
2,7475329,2024-07-03 23:41:24,1129,102,1440.0,Swipe Transaction,27092,Port Elizabeth,Eastern Cape,6000.0,4829,
3,7475331,2022-06-12 09:33:41,430,2860,3600.0,Swipe Transaction,27092,Bloemfontein,Free State,9300.0,4829,
4,7475332,2023-08-26 10:05:48,848,3915,835.38,Swipe Transaction,13051,Polokwane,Limpopo,700.0,5813,


In [None]:
print(f"\nMissing values in users:")
print(user.isnull().sum())
print(user.columns.tolist())
print(user.info())
display(user.head())


transactions['amount'] = (
    transactions['amount'].astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('R', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.strip()
)

transactions['amount'] = pd.to_numeric(transactions['amount'], errors='coerce').round(2)




Missing values in users:
id                   0
current_age          0
retirement_age       0
birth_year           0
birth_month          0
gender               0
address              0
per_capita_income    0
yearly_income        0
total_debt           0
credit_score         0
num_credit_cards     0
dtype: int64
['id', 'current_age', 'retirement_age', 'birth_year', 'birth_month', 'gender', 'address', 'per_capita_income', 'yearly_income', 'total_debt', 'credit_score', 'num_credit_cards']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 2000 non-null   int64 
 1   current_age        2000 non-null   int64 
 2   retirement_age     2000 non-null   int64 
 3   birth_year         2000 non-null   int64 
 4   birth_month        2000 non-null   int64 
 5   gender             2000 non-null   object
 6   address            

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,gender,address,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,825,53,66,1966,11,Female,"126 Main Road, Durban",527004,1074528,2297034,787,5
1,1746,53,68,1966,12,Female,"254 Van der Merwe Avenue, Cape Town",682038,1390572,3444282,701,5
2,1718,81,67,1938,11,Female,"856 King George Street, Pietermaritzburg",408258,602694,3528,698,5
3,708,63,63,1957,1,Female,"565 Pretoria Street, Durban",2936610,4498650,3641904,722,4
4,1164,43,70,1976,9,Male,"269 Swart Street, Pretoria",968346,1974366,3309390,675,1


In [None]:
print(f"\nMissing values in card:")
print(card.isnull().sum())
print(card.columns.tolist())
card.head()



Missing values in card:
id                       0
client_id                0
card_brand               0
card_type                0
card_number              0
expires                  0
cvv                      0
has_chip                 0
num_cards_issued         0
credit_limit             0
acct_open_date           0
year_pin_last_changed    0
card_on_dark_web         0
dtype: int64
['id', 'client_id', 'card_brand', 'card_type', 'card_number', 'expires', 'cvv', 'has_chip', 'num_cards_issued', 'credit_limit', 'acct_open_date', 'year_pin_last_changed', 'card_on_dark_web']


Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
0,4524,825,Mastercard,Debit,5561765976957072,04/2025,623,YES,2,437310,09/2002,2008,No
1,2731,825,Mastercard,Debit,5681404028657264,08/2025,393,YES,2,395424,04/2014,2014,No
2,3701,825,Visa,Debit,4715119191099084,08/2025,719,YES,2,835452,07/2003,2004,No
3,42,825,Visa,Credit,4133211808921544,12/2026,693,NO,1,223200,01/2003,2012,No
4,4659,825,Mastercard,Prepaid,5469974597438641,10/2026,75,YES,1,504,09/2008,2009,No


In [None]:
print(f"\nMissing values in users:")
print(train_fraud_labels.isnull().sum())
print(train_fraud_labels.columns.tolist())
train_fraud_labels.head()



Missing values in users:
transaction_id    0
is_fraud          0
dtype: int64
['transaction_id', 'is_fraud']


Unnamed: 0,transaction_id,is_fraud
0,10649266,No
1,23410063,No
2,9316588,No
3,12478022,No
4,9558530,No


In [None]:
print(f"\nMissing values in users:")
print(mcc_codes.isnull().sum())
print(mcc_codes.columns.tolist())
mcc_codes.head()


Missing values in users:
mcc_code       0
description    0
dtype: int64
['mcc_code', 'description']


Unnamed: 0,mcc_code,description
0,5812,Eating Places and Restaurants
1,5541,Service Stations
2,7996,"Amusement Parks, Carnivals, Circuses"
3,5411,"Grocery Stores, Supermarkets"
4,4784,Tolls and Bridge Fees
