In [1]:
import pandas as pd

In [2]:
csv = pd.read_csv('../data/mobile_money_transactions.csv')


In [7]:
print(csv.head())

   step transactionType    amount         initiator  oldBalInitiator  \
0     0        TRANSFER  19824.96  4537027967639631        187712.18   
1     0         PAYMENT    598.97  4296267625767470             8.92   
2     0         PAYMENT    545.85  4178224023847746            93.60   
3     0        TRANSFER  19847.01  4178224023847746          -452.25   
4     0         PAYMENT    546.89  4779013371563747        159148.76   

   newBalInitiator         recipient  oldBalRecipient  newBalRecipient  \
0        167887.22  4875702729424478             8.31         19833.27   
1             8.92        25-0000401             0.00             0.00   
2          -452.25        13-0001587             0.00           545.85   
3        -20299.26  4096920916696293          4011.72         23858.74   
4        158601.88        75-0003564             0.00           546.89   

   isFraud  
0        1  
1        0  
2        0  
3        1  
4        0  


In [8]:
"""Clean and enrich the raw mobile money dataset.

Steps (in simple language):
1. Remove bad rows (duplicates and missing values).
2. Add a country column (always Ghana).
3. Add gender using realistic percentages (a bit more men than women).
4. Add a realistic date and time for each transaction (more activity in the day).
5. Assign each transaction to a Ghana region, using realistic region shares.
6. Assign an age group to each transaction, using realistic age shares.
7. Print basic checks so we can see that everything looks OK.
"""

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# 1. Start from the raw CSV loaded above as `csv`
#    - Remove exact duplicate rows
#    - Drop rows that have missing values in any column
csv_cleaned = csv.drop_duplicates()
csv_cleaned = csv_cleaned.dropna()

# 2. Drop the 'step' column if it exists (it's just a technical index)
if 'step' in csv_cleaned.columns:
    csv_cleaned = csv_cleaned.drop(columns=['step'])

# 3. Add 'country' column and fill with 'Ghana' for all rows
csv_cleaned['country'] = 'Ghana'

# 4. Add gender with realistic percentages
#    Idea: in many studies, a little more men use mobile money than women.
#    We use 55% male (M) and 45% female (F).
gender_probs = [0.55, 0.45]
csv_cleaned['gender'] = np.random.choice(['M', 'F'], size=len(csv_cleaned), p=gender_probs)

# 5. Add a timestamp (date + time) for each transaction
#    First choose a random second between 1 Jan 2023 and 31 Jan 2025.
start_datetime = datetime(2023, 1, 1, 0, 0, 0)
end_datetime = datetime(2025, 1, 31, 23, 59, 59)

total_seconds = int((end_datetime - start_datetime).total_seconds())
random_seconds = np.random.randint(0, total_seconds, size=len(csv_cleaned))

csv_cleaned['timestamp'] = [
    start_datetime + timedelta(seconds=int(sec)) for sec in random_seconds
]

# Now adjust the hour of day so that:
# - Very few transactions happen at night (00:00–05:00)
# - Most transactions happen in the day (10:00–16:00)
# - Some transactions happen in the evening (17:00–21:00)

hours = np.arange(24)
hour_probs = np.array([
    0.005, 0.005, 0.005, 0.005, 0.005, 0.005,  # 0–5 very low
    0.04,  0.04,  0.04,  0.04,                 # 6–9 low/medium
    0.07,  0.07,  0.07,  0.07,  0.07,  0.07,  0.07,  # 10–16 high
    0.05,  0.05,  0.05,  0.05,  0.05,          # 17–21 medium/high
    0.01,  0.01                                # 22–23 low
])

# Make sure the probabilities add up to 1
hour_probs = hour_probs / hour_probs.sum()

random_hours = np.random.choice(hours, size=len(csv_cleaned), p=hour_probs)

# Keep the random date we chose earlier, but replace the hour with the weighted hour
csv_cleaned['timestamp'] = pd.to_datetime(csv_cleaned['timestamp']).dt.normalize() \
    + pd.to_timedelta(random_hours, unit='h')

# If a 'time/h' column exists in the original data, drop it (we now use 'timestamp')
if 'time/h' in csv_cleaned.columns:
    csv_cleaned = csv_cleaned.drop(columns=['time/h'])

# 6. Add a Ghana region to each transaction using realistic shares
#    Greater Accra and Ashanti get larger shares because they are more populated and active.
ghana_regions = [
    'Greater Accra', 'Ashanti', 'Eastern', 'Central', 'Western',
    'Bono', 'Bono East', 'Volta', 'Northern',
    'Upper East', 'Upper West', 'Ahafo', 'North East',
    'Savannah', 'Western North', 'Oti'
]

region_probs = [
    0.22,  # Greater Accra
    0.19,  # Ashanti
    0.09,  # Eastern
    0.08,  # Central
    0.07,  # Western
    0.04,  # Bono
    0.04,  # Bono East
    0.06,  # Volta
    0.06,  # Northern
    0.04,  # Upper East
    0.03,  # Upper West
    0.03,  # Ahafo
    0.02,  # North East
    0.02,  # Savannah
    0.02,  # Western North
    0.02   # Oti
]

region_probs = np.array(region_probs) / np.sum(region_probs)

csv_cleaned['region'] = np.random.choice(
    ghana_regions, size=len(csv_cleaned), p=region_probs
)

# 7. Add an age group using realistic Ghana-style age structure
#    Younger and working-age adults dominate mobile money usage.
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
age_probs = [0.22, 0.30, 0.22, 0.13, 0.08, 0.05]
age_probs = np.array(age_probs) / np.sum(age_probs)

csv_cleaned['age_group'] = np.random.choice(
    age_groups, size=len(csv_cleaned), p=age_probs
)

# 8. Print simple checks so we can see the result
print('Rows after cleaning:', len(csv_cleaned))
print('Missing values per column:')
print(csv_cleaned.isnull().sum())

csv_cleaned.head()

Rows after cleaning: 1720181
Missing values per column:
transactionType    0
amount             0
initiator          0
oldBalInitiator    0
newBalInitiator    0
recipient          0
oldBalRecipient    0
newBalRecipient    0
isFraud            0
country            0
gender             0
timestamp          0
region             0
age_group          0
dtype: int64
transactionType    0
amount             0
initiator          0
oldBalInitiator    0
newBalInitiator    0
recipient          0
oldBalRecipient    0
newBalRecipient    0
isFraud            0
country            0
gender             0
timestamp          0
region             0
age_group          0
dtype: int64


Unnamed: 0,transactionType,amount,initiator,oldBalInitiator,newBalInitiator,recipient,oldBalRecipient,newBalRecipient,isFraud,country,gender,timestamp,region,age_group
0,TRANSFER,19824.96,4537027967639631,187712.18,167887.22,4875702729424478,8.31,19833.27,1,Ghana,M,2023-06-25 00:00:00,Eastern,25-34
1,PAYMENT,598.97,4296267625767470,8.92,8.92,25-0000401,0.0,0.0,0,Ghana,M,2024-04-25 10:00:00,Northern,18-24
2,PAYMENT,545.85,4178224023847746,93.6,-452.25,13-0001587,0.0,545.85,0,Ghana,F,2023-07-09 14:00:00,North East,25-34
3,TRANSFER,19847.01,4178224023847746,-452.25,-20299.26,4096920916696293,4011.72,23858.74,1,Ghana,F,2024-05-22 19:00:00,Western,25-34
4,PAYMENT,546.89,4779013371563747,159148.76,158601.88,75-0003564,0.0,546.89,0,Ghana,F,2024-01-22 08:00:00,Western,25-34


In [9]:
print(csv_cleaned.tail())

        transactionType     amount         initiator  oldBalInitiator  \
1720176         DEPOSIT  189001.06  4663743651962693       3153757.97   
1720177         PAYMENT     700.80  4823452294389366       4949410.71   
1720178        TRANSFER   83799.76  4823452294389366       4948709.91   
1720179         PAYMENT     599.78  4118786864093625       3739281.18   
1720180        TRANSFER   17190.81  4118786864093625       3738681.40   

         newBalInitiator         recipient  oldBalRecipient  newBalRecipient  \
1720176       3342759.03        72-0003211        132293.79        132293.79   
1720177       4948709.91        23-0002032        143754.84        144455.64   
1720178       4864910.15  4271888998665790         16908.07        100707.82   
1720179       3738681.40        35-0003267        109393.87        109993.65   
1720180       3721490.60  4438993966712101         33807.10         50997.91   

         isFraud country gender           timestamp    region age_group  
172017

In [10]:
# Data cleaning validation checks
print('Missing values per column:')
print(csv_cleaned.isnull().sum())
print('\nNumber of rows:', len(csv_cleaned))
print('Number of duplicate rows:', csv_cleaned.duplicated().sum())
print('\nColumns in dataset:')
print(csv_cleaned.columns)
print('\nAmount column statistics:')
print(csv_cleaned['amount'].describe())
print('\nRegion value counts:')
print(csv_cleaned['region'].value_counts())
print('\nAge group value counts:')
print(csv_cleaned['age_group'].value_counts())
print('\nTimestamp sample:')
print(csv_cleaned['timestamp'].head())

Missing values per column:
transactionType    0
amount             0
initiator          0
oldBalInitiator    0
newBalInitiator    0
recipient          0
oldBalRecipient    0
newBalRecipient    0
isFraud            0
country            0
gender             0
timestamp          0
region             0
age_group          0
dtype: int64

Number of rows: 1720181
transactionType    0
amount             0
initiator          0
oldBalInitiator    0
newBalInitiator    0
recipient          0
oldBalRecipient    0
newBalRecipient    0
isFraud            0
country            0
gender             0
timestamp          0
region             0
age_group          0
dtype: int64

Number of rows: 1720181
Number of duplicate rows: 0

Columns in dataset:
Index(['transactionType', 'amount', 'initiator', 'oldBalInitiator',
       'newBalInitiator', 'recipient', 'oldBalRecipient', 'newBalRecipient',
       'isFraud', 'country', 'gender', 'timestamp', 'region', 'age_group'],
      dtype='object')

Amount column st

In [11]:
# Save cleaned dataset to CSV
csv_cleaned.to_csv('../data/mobile_money_transactions_cleaned.csv', index=False)
print('Cleaned dataset saved to ../data/mobile_money_transactions_cleaned.csv')

Cleaned dataset saved to ../data/mobile_money_transactions_cleaned.csv


In [12]:
# Introduce more variation into the cleaned dataset
import numpy as np

# 1. Add noise to transaction amounts (more spread)
np.random.seed(42)
amount_std = csv_cleaned['amount'].std() if csv_cleaned['amount'].std() > 0 else csv_cleaned['amount'].max() * 0.3
noise = np.random.normal(loc=0, scale=amount_std * 0.4, size=len(csv_cleaned))
csv_cleaned['amount'] = (csv_cleaned['amount'] + noise).clip(lower=1)

# 2. Make some regions clearly busier (e.g., Greater Accra, Ashanti)
high_activity_regions = ['Greater Accra', 'Ashanti']
mask = csv_cleaned['region'].isin(high_activity_regions)
# duplicate some rows from these regions to increase their counts
csv_extra = csv_cleaned[mask].sample(int(mask.sum() * 0.5), replace=True, random_state=42)
csv_cleaned = pd.concat([csv_cleaned, csv_extra], ignore_index=True)

# 3. Skew amounts by age group (younger smaller, middle larger, older medium)
def adjust_amount_by_age_group(row):
    if row['age_group'] in ['18-24', '25-34']:
        factor = np.random.uniform(0.6, 0.9)
    elif row['age_group'] in ['35-44', '45-54']:
        factor = np.random.uniform(1.1, 1.7)
    else:  # 55+
        factor = np.random.uniform(0.8, 1.3)
    return row['amount'] * factor

csv_cleaned['amount'] = csv_cleaned.apply(adjust_amount_by_age_group, axis=1)

# 4. Make gender slightly imbalanced to see differences
csv_cleaned['gender'] = np.where(
    np.random.rand(len(csv_cleaned)) < 0.6,
    'M',
    'F'
)

# 5. Save updated cleaned dataset
csv_cleaned.to_csv('../data/mobile_money_transactions_cleaned.csv', index=False)
print('Updated cleaned dataset saved with more variation.')

Updated cleaned dataset saved with more variation.
