In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv(r"C:\Users\PC\Documents\data science GMC\financial_dataset_model\Financial_inclusion_dataset.csv")
data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [4]:
data["country"].value_counts()

country
Rwanda      8735
Tanzania    6620
Kenya       6068
Uganda      2101
Name: count, dtype: int64

In [5]:
country_encoder = LabelEncoder()
data["country"] = country_encoder.fit_transform(data["country"])

In [6]:
data = data.drop(['uniqueid', 'year'], axis =1)

In [7]:
location_encoder = LabelEncoder()
data['location_type'] = location_encoder.fit_transform(data['location_type'])
cellphone_encoder = LabelEncoder()
data['cellphone_access'] = cellphone_encoder.fit_transform(data['cellphone_access'])
gender_encoder = LabelEncoder()
data['gender_of_respondent'] = gender_encoder.fit_transform(data['gender_of_respondent'])
relationship_encoder = LabelEncoder()
data["relationship_with_head"] = relationship_encoder.fit_transform(data["relationship_with_head"])
marital_status_encoder = LabelEncoder()
data["marital_status"] = marital_status_encoder.fit_transform(data["marital_status"])
education_level_encoder = LabelEncoder()
data["education_level"] = education_level_encoder.fit_transform(data["education_level"])
job_type_encoder = LabelEncoder()
data["job_type"] = job_type_encoder.fit_transform(data["job_type"])

In [8]:
data.head()

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,Yes,0,1,3,24,0,5,2,3,9
1,0,No,0,0,5,70,0,1,4,0,4
2,0,Yes,1,1,5,26,1,3,3,5,9
3,0,No,0,1,5,34,0,1,2,2,3
4,0,No,1,0,8,26,1,0,3,2,5


In [9]:
x = data.drop(columns='bank_account')
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(data['bank_account'])

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [11]:
model = RandomForestClassifier()

In [12]:
param_grid = {
    "criterion": ['gini', 'entropy', 'log-loss'],
    "max_depth": [2,5,6,7,9,10,None],
    "min_samples_split": [2,3,4,5],
}

In [13]:
grid_search_cv = GridSearchCV(estimator=model, param_grid=param_grid, cv=15, scoring="accuracy")

In [14]:
grid_search_cv.fit(x_train, y_train)

420 fits failed out of a total of 1260.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
420 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_co

In [15]:
best_estimator = grid_search_cv.best_estimator_
y_pred = best_estimator.predict(x_test)

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      6073
           1       0.74      0.31      0.44       985

    accuracy                           0.89      7058
   macro avg       0.82      0.65      0.69      7058
weighted avg       0.88      0.89      0.87      7058



In [17]:
print(accuracy_score(y_pred, y_test))

0.8883536412581468


In [24]:
with open('country_encoder.pkl', 'wb') as ce:
  pickle.dump(country_encoder, ce)
with open('location_encoder.pkl', 'wb') as le:
  pickle.dump(location_encoder, le)
with open('cellphone_encoder.pkl', 'wb') as ce:
  pickle.dump(cellphone_encoder, ce)
with open('gender_encoder.pkl', 'wb') as ge:
  pickle.dump(gender_encoder, ge)
with open('relationship_encoder.pkl', 'wb') as re:
  pickle.dump(relationship_encoder, re)
with open('marital_status_encoder.pkl', 'wb') as mse:
  pickle.dump(marital_status_encoder, mse)
with open('education_level_encoder.pkl', 'wb') as ele:
  pickle.dump(education_level_encoder, ele)
with open('job_type_encoder.pkl', 'wb') as jte:
  pickle.dump(job_type_encoder, jte)

with open('target_encoder.pkl', 'wb') as te:
  pickle.dump(target_encoder, te)

with open('model.pkl', 'wb') as mdl:
  pickle.dump(best_estimator, mdl)

In [22]:
import joblib

In [25]:

joblib.dump(country_encoder, 'country_encoder.joblib')
joblib.dump(best_estimator, 'model.joblib')
joblib.dump(location_encoder, 'location_encoder.joblib')
joblib.dump(cellphone_encoder, 'cellphone_encoder.joblib')
joblib.dump(gender_encoder, 'gender_encoder.joblib')
joblib.dump(relationship_encoder, 'relationship_encoder.joblib')
joblib.dump(marital_status_encoder, 'marital_status_encoder.joblib')
joblib.dump(education_level_encoder, 'education_level_encoder.joblib')
joblib.dump(job_type_encoder, 'job_type_encoder.joblib')
joblib.dump(target_encoder, 'target_encoder.joblib')

['target_encoder.joblib']