# Import Important Libraries

In [12]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Load Data

In [4]:
url = "C:/Users/USER/my_workspace/synapse/loan-default-risk/data/raw/customer_banking_profile.csv"
df = pd.read_csv(url) 

### Explore Dataset

In [5]:
print(df.info())
print(df.describe())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3698 non-null   object 
 8   level_of_education_clients  587 non-null    object 
dtypes: float64(2), object(7)
memory usage: 305.7+ KB
None
       longitude_gps  latitude_gps
count    4346.000000   4346.000000
mean        4.626189      7.251356
std         7.184832      3.055052
min      -118.247009    -33.868818
25%         3.354953      6.47061

In [6]:
# Check uniqueness 
df['customerid'].nunique()

4334

In [7]:
# Check nulls
df['customerid'].isnull().sum()

np.int64(0)

In [8]:
# Check duplicates
df['customerid'].duplicated().sum()
# Drop duplicates
df = df.drop_duplicates(subset=['customerid'])
print(df.info())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4334 non-null   object 
 1   birthdate                   4334 non-null   object 
 2   bank_account_type           4334 non-null   object 
 3   longitude_gps               4334 non-null   float64
 4   latitude_gps                4334 non-null   float64
 5   bank_name_clients           4334 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3686 non-null   object 
 8   level_of_education_clients  586 non-null    object 
dtypes: float64(2), object(7)
memory usage: 338.6+ KB
None
(4334, 9)


In [9]:
# Drop columns
df = df.drop(['bank_branch_clients', 'level_of_education_clients'], axis=1)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerid                 4334 non-null   object 
 1   birthdate                  4334 non-null   object 
 2   bank_account_type          4334 non-null   object 
 3   longitude_gps              4334 non-null   float64
 4   latitude_gps               4334 non-null   float64
 5   bank_name_clients          4334 non-null   object 
 6   employment_status_clients  3686 non-null   object 
dtypes: float64(2), object(5)
memory usage: 270.9+ KB
None


In [10]:
# Fill nulls in employment_status
df['employment_status_clients'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employment_status_clients'].fillna('Unknown', inplace=True)


In [11]:
# Convert birthdate to age
df['birthdate'] = pd.to_datetime(df['birthdate'])
df['age'] = (pd.Timestamp.now() - df['birthdate']).dt.days // 365
df = df.drop('birthdate', axis=1)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerid                 4334 non-null   object 
 1   bank_account_type          4334 non-null   object 
 2   longitude_gps              4334 non-null   float64
 3   latitude_gps               4334 non-null   float64
 4   bank_name_clients          4334 non-null   object 
 5   employment_status_clients  4334 non-null   object 
 6   age                        4334 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 270.9+ KB
None


In [13]:
# Geospatial Enrichment 
states = gpd.read_file("nigeria_states.shp")

geometry = [Point(xy) for xy in zip(df.longitude_gps, df.latitude_gps)]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

gdf = gpd.sjoin(gdf, states, how="left", predicate="within")
df["state"] = gdf["STATE_NAME"].fillna("Unknown")


DataSourceError: nigeria_states.shp: No such file or directory