In [1]:
# Import relevant libraries
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import  StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

In [None]:
# specify the file_path
file_path="./Financial_inclusion_dataset.csv"

# create a dataframe
df = pd.read_csv(file_path,index_col=False)
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [3]:
# dropping unecessary columns
df= df.drop(columns={"uniqueid","year","country"})

In [4]:
# general information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   bank_account            23524 non-null  object
 1   location_type           23524 non-null  object
 2   cellphone_access        23524 non-null  object
 3   household_size          23524 non-null  int64 
 4   age_of_respondent       23524 non-null  int64 
 5   gender_of_respondent    23524 non-null  object
 6   relationship_with_head  23524 non-null  object
 7   marital_status          23524 non-null  object
 8   education_level         23524 non-null  object
 9   job_type                23524 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.8+ MB


In [5]:
# statistical analysis of the dataset
df.describe()

Unnamed: 0,household_size,age_of_respondent
count,23524.0,23524.0
mean,3.797483,38.80522
std,2.227613,16.520569
min,1.0,16.0
25%,2.0,26.0
50%,3.0,35.0
75%,5.0,49.0
max,21.0,100.0


In [6]:
# genrating a profile report
profile_report =ProfileReport(df,title="Financial_inclusion profile Report",explorative=True)
profile_path = "Financial_inclusion.html"
profile_report.to_file(profile_path)
print(f"\n Profile report generated{profile_path}")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


 Profile report generatedFinancial_inclusion.html


In [7]:
# checking for missing values
df.isnull().sum()

bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [8]:
# checking for duplicate values and removing them
df.duplicated().sum()

np.int64(5310)

In [9]:
df = df.drop_duplicates()

In [10]:
# Handling outliers
lower_quantile = df.quantile(0.25,numeric_only=True)
upper_quantile = df.quantile(0.75,numeric_only=True)
IRQ = upper_quantile - lower_quantile

lower_bound = lower_quantile - 1.5 * IRQ
upper_bound= upper_quantile + 1.5 * IRQ
df_filtered, lower_bound = df.align(lower_bound,axis=1,copy=False)
df_filtered, upper_bound = df.align(upper_bound,axis=1,copy=False)

df = df_filtered[~((df_filtered < lower_bound) | (df_filtered > upper_bound)).any(axis=1)]
df

Unnamed: 0,age_of_respondent,bank_account,cellphone_access,education_level,gender_of_respondent,household_size,job_type,location_type,marital_status,relationship_with_head
0,24,Yes,Yes,Secondary education,Female,3,Self employed,Rural,Married/Living together,Spouse
1,70,No,No,No formal education,Female,5,Government Dependent,Rural,Widowed,Head of Household
2,26,Yes,Yes,Vocational/Specialised training,Male,5,Self employed,Urban,Single/Never Married,Other relative
3,34,No,Yes,Primary education,Female,5,Formally employed Private,Rural,Married/Living together,Head of Household
4,26,No,No,Primary education,Male,8,Informally employed,Urban,Single/Never Married,Child
...,...,...,...,...,...,...,...,...,...,...
23518,20,No,Yes,Primary education,Female,9,No Income,Rural,Single/Never Married,Child
23519,48,No,Yes,No formal education,Female,4,Other Income,Rural,Divorced/Seperated,Head of Household
23520,27,No,Yes,Secondary education,Female,2,Other Income,Rural,Single/Never Married,Head of Household
23521,27,No,Yes,Primary education,Female,5,Other Income,Rural,Widowed,Parent


In [11]:
df.columns

Index(['age_of_respondent', 'bank_account', 'cellphone_access',
       'education_level', 'gender_of_respondent', 'household_size', 'job_type',
       'location_type', 'marital_status', 'relationship_with_head'],
      dtype='object')

In [12]:
# Initialize LabelEncoders
label_encoders = {}
# encoding categorical values
categorical_cols = ['cellphone_access',
       'education_level', 'gender_of_respondent', 'job_type',
       'location_type', 'marital_status', 'relationship_with_head']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert to numeric
    label_encoders[col] = le 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])  # Convert to numeric
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])  # Convert to numeric
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])  # Convert to numeric
A value is trying to be set on 

In [13]:
df['bank_account'] = df['bank_account'].map({"Yes": 1 ,"No" : 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bank_account'] = df['bank_account'].map({"Yes": 1 ,"No" : 0})


In [14]:
# check if the dataframe has encoded values
df.head()

Unnamed: 0,age_of_respondent,bank_account,cellphone_access,education_level,gender_of_respondent,household_size,job_type,location_type,marital_status,relationship_with_head
0,24,1,1,3,0,3,9,0,2,5
1,70,0,0,0,0,5,4,0,4,1
2,26,1,1,5,1,5,9,1,3,3
3,34,0,1,2,0,5,3,0,2,1
4,26,0,0,2,1,8,5,1,3,0


In [15]:
# training a model to target is has a bank account
# splitting features and target
X = df.drop(['bank_account'],axis=1)
y =df['bank_account']
# splitting the data into training and testing data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# standardize the features
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

# training our random forest classifier model
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [16]:
# prediction
y_pred = model.predict(X_test)

In [17]:
# testing model accuracy
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8184641445511011

In [18]:
# Measuring the accuracy of a model
cm = confusion_matrix(y_test,y_pred)
cm

array([[2710,  248],
       [ 395,  189]])

In [19]:
# cross validation
cv_scores = cross_val_score(model,X,y, cv=10)
cv_scores

array([0.76453981, 0.78035008, 0.77696217, 0.80688876, 0.81818182,
       0.81648786, 0.78599661, 0.81761717, 0.85536723, 0.83220339])

In [20]:
# so that we can use our model in the streamlit application
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
    

In [21]:
df.columns

Index(['age_of_respondent', 'bank_account', 'cellphone_access',
       'education_level', 'gender_of_respondent', 'household_size', 'job_type',
       'location_type', 'marital_status', 'relationship_with_head'],
      dtype='object')