# Import Important Libraries

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Load Data

In [None]:
# Location of the raw customer banking profile CSV, relative to the working directory
url = '../data/raw/customer_banking_profile.csv'
# Read the file into the working DataFrame used by the rest of the notebook
df2 = pd.read_csv(filepath_or_buffer=url)

### Explore Dataset

In [None]:
# Overview of the raw frame: schema and non-null counts, numeric summary, dimensions.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
df2.info()
print(df2.describe())
print(df2.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3698 non-null   object 
 8   level_of_education_clients  587 non-null    object 
dtypes: float64(2), object(7)
memory usage: 305.7+ KB
None
       longitude_gps  latitude_gps
count    4346.000000   4346.000000
mean        4.626189      7.251356
std         7.184832      3.055052
min      -118.247009    -33.868818
25%         3.354953      6.47061

In [None]:
# Number of distinct customer IDs (fewer than the row count means some IDs repeat)
df2.loc[:, 'customerid'].nunique()

4334

In [None]:
# Count rows whose customer ID is missing
df2['customerid'].isna().sum()

np.int64(0)

In [None]:
# Check duplicates
# The count must be printed explicitly: a notebook cell only displays its last
# expression, and this one is followed by further statements.
n_duplicate_ids = df2['customerid'].duplicated().sum()
print(f"Duplicate customerid rows: {n_duplicate_ids}")

# Drop duplicates, keeping the first row seen for each customerid
df2 = df2.drop_duplicates(subset=['customerid'])

# info() prints its own report and returns None, so it is not wrapped in print()
df2.info()
print(df2.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4334 non-null   object 
 1   birthdate                   4334 non-null   object 
 2   bank_account_type           4334 non-null   object 
 3   longitude_gps               4334 non-null   float64
 4   latitude_gps                4334 non-null   float64
 5   bank_name_clients           4334 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3686 non-null   object 
 8   level_of_education_clients  586 non-null    object 
dtypes: float64(2), object(7)
memory usage: 338.6+ KB
None
(4334, 9)


In [None]:
# Drop the two columns that are populated for only a small fraction of customers
df2 = df2.drop(columns=['bank_branch_clients', 'level_of_education_clients'])
# info() prints its own report and returns None; print(df2.info()) would add a stray "None"
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerid                 4334 non-null   object 
 1   birthdate                  4334 non-null   object 
 2   bank_account_type          4334 non-null   object 
 3   longitude_gps              4334 non-null   float64
 4   latitude_gps               4334 non-null   float64
 5   bank_name_clients          4334 non-null   object 
 6   employment_status_clients  3686 non-null   object 
dtypes: float64(2), object(5)
memory usage: 270.9+ KB
None


In [None]:
# Fill nulls in employment_status
# Assign the result back to the column. Calling fillna(..., inplace=True) on a
# selected column operates on an intermediate object: pandas emits a FutureWarning
# for it today, and from pandas 3.0 it no longer updates df2 at all.
df2['employment_status_clients'] = df2['employment_status_clients'].fillna('Unknown')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['employment_status_clients'].fillna('Unknown', inplace=True)


In [None]:
# Convert birthdate to age in completed years as of the day the cell is run.
# Age is derived from calendar fields rather than `elapsed_days // 365`: ignoring
# leap days lets the day count reach a multiple of 365 a few days before the real
# birthday, which overstates the age by one year for those customers.
# NOTE: the result depends on the run date (pd.Timestamp.now()).
birthdate = pd.to_datetime(df2['birthdate'])
today = pd.Timestamp.now().normalize()

# True where this year's birthday has already occurred (or is today)
birthday_passed = (birthdate.dt.month < today.month) | (
    (birthdate.dt.month == today.month) & (birthdate.dt.day <= today.day)
)

# Year difference, minus one for customers whose birthday is still ahead this year
df2['age'] = (
    today.year - birthdate.dt.year - (~birthday_passed).astype('int64')
).astype('int64')

# The raw birthdate is no longer needed once age is available
df2 = df2.drop(columns='birthdate')
# info() prints its own report and returns None, so it is not wrapped in print()
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerid                 4334 non-null   object 
 1   bank_account_type          4334 non-null   object 
 2   longitude_gps              4334 non-null   float64
 3   latitude_gps               4334 non-null   float64
 4   bank_name_clients          4334 non-null   object 
 5   employment_status_clients  4334 non-null   object 
 6   age                        4334 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 270.9+ KB
None


### Geo Mapping

In [None]:
# Load Nigeria States
# Read the Nigeria state boundaries shapefile (path relative to the working directory)
states = gpd.read_file('../data/geospatial/gadm41_NGA_1.shp')

In [None]:
# convert customer data to GeoDataFrame
# Build one point geometry per customer (x = longitude, y = latitude).
# points_from_xy creates the whole geometry array in a single vectorised call
# instead of constructing shapely Points one at a time in a Python loop.
geometry = gpd.points_from_xy(df2['longitude_gps'], df2['latitude_gps'])
# The GPS columns are declared as EPSG:4326 (WGS 84 longitude/latitude)
gdf2 = gpd.GeoDataFrame(df2, geometry=geometry, crs="EPSG:4326")

In [None]:
# assign state using spatial join
# Point-in-polygon join: attach the NAME_1 of the state polygon each customer
# point lies within. how='left' keeps every customer; points that fall inside
# no polygon get a null NAME_1.
gdf2 = gpd.sjoin(
    gdf2,
    states[['NAME_1', 'geometry']],
    how='left',
    predicate='within'
)
# sjoin also adds the matched row label from `states` as `index_right`. It is join
# bookkeeping, not a customer attribute, and would otherwise be written to the
# saved CSV. errors='ignore' tolerates geopandas versions that name it differently.
gdf2 = gdf2.drop(columns='index_right', errors='ignore')


In [None]:
# clean column
# Give the joined NAME_1 column a descriptive name
gdf2 = gdf2.rename(columns={'NAME_1': 'state'})

In [None]:
# save
# Write the joined table to CSV in the current working directory,
# leaving out the geometry column
cleaned_table = gdf2.drop(columns=['geometry'])
cleaned_table.to_csv('customer_banking_profile_cleaned.csv', index=False)

In [None]:
# Sanity check
# Five most frequent states; value_counts() leaves out customers with no matched state
gdf2['state'].value_counts().head(5)


state
Lagos                        1924
Federal Capital Territory     353
Oyo                           352
Ogun                          336
Rivers                        175
Name: count, dtype: int64

In [None]:
# State mapping to df2
# Attach the state found by the spatial join to the plain (non-geo) customer table
state_mapping = gdf2[['customerid', 'state']].copy()
# validate='one_to_one' makes the merge raise if customerid repeats on either side
# (for example a point matched to more than one state polygon) instead of silently
# multiplying customer rows in the final dataset.
df2 = df2.merge(state_mapping, on='customerid', how='left', validate='one_to_one')

In [None]:
# Inspect final dataframe
# Final check of the cleaned frame: schema, numeric summary, dimensions, sample rows.
# info() prints its own report and returns None, so it is called directly;
# print(df2.info()) would add a stray "None" line.
df2.info()
print(df2.describe())
print(df2.shape)
# NOTE(review): this prints row-level customer records (IDs, GPS coordinates);
# clear the output before sharing the notebook.
print(df2.head(50))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4334 entries, 0 to 4333
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerid                 4334 non-null   object 
 1   bank_account_type          4334 non-null   object 
 2   longitude_gps              4334 non-null   float64
 3   latitude_gps               4334 non-null   float64
 4   bank_name_clients          4334 non-null   object 
 5   employment_status_clients  4334 non-null   object 
 6   age                        4334 non-null   int64  
 7   state                      4293 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 271.0+ KB
None
       longitude_gps  latitude_gps          age
count    4334.000000   4334.000000  4334.000000
mean        4.624000      7.249975    40.952469
std         7.194031      3.056594     6.142477
min      -118.247009    -33.868818    29.000000
25%         3.354857     

In [None]:
# Save cleaned dataset
# Use forward slashes like the other paths in this notebook: they work on every OS,
# whereas backslashes are path separators only on Windows (elsewhere the whole
# string would be taken as a single file name in the working directory).
df2.to_csv("../data/interim/customer_banking_profile.csv", index=False)