<a href="https://colab.research.google.com/github/Segn11/zindi-financial-inclusion/blob/main/financial_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [20]:
df = pd.read_csv('/content/Train (6).csv')
test = pd.read_csv('/content/Test (5).csv')

In [64]:
df.head()


Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [21]:

df_id = df['uniqueid'].copy()
test_id = test["uniqueid"].copy()

# Prepare text data
X_df_raw = df.drop(columns=["uniqueid", "bank_account"]).copy()
X_te_raw = test.drop(columns=["uniqueid"]).copy()

# Combine for vectorization
all_data = pd.concat([X_df_raw, X_te_raw], axis=0).reset_index(drop=True)


In [4]:
all_data.isna().sum()

Unnamed: 0,0
country,0
year,0
location_type,0
cellphone_access,0
household_size,0
age_of_respondent,0
gender_of_respondent,0
relationship_with_head,0
marital_status,0
education_level,0


In [42]:
all_data.head()

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [22]:
cat_cols = all_data.select_dtypes(include="object").columns
cat_cols = list(cat_cols)
print(cat_cols)

['country', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type']


In [23]:
num_cols = all_data.select_dtypes(include="number").columns
num_cols = list(num_cols)
print(num_cols)

['year', 'household_size', 'age_of_respondent']


In [24]:
from sklearn.preprocessing import LabelEncoder
float_array = all_data[["household_size", "age_of_respondent", "year"]].values.astype(float)

# List of categorical columns for One-Hot Encoding
cat_cols = [
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
    "country"
]

# Keep only columns that exist in all_data
cat_cols = [col for col in cat_cols if col in all_data.columns]

# One-Hot Encoding
all_data = pd.get_dummies(all_data, columns=cat_cols, prefix_sep="_")

# Label encoding for small/binary categorical columns
le = LabelEncoder()
for col in ["location_type", "cellphone_access", "gender_of_respondent"]:
    if col in all_data.columns:
        all_data[col] = le.fit_transform(all_data[col].astype(str))


In [25]:
# If train['bank_account'] contains 'Yes'/'No'
y = df['bank_account'].map({'No': 0, 'Yes': 1})


In [28]:
n_train = X_df_raw.shape[0]
X_train = all_data.iloc[:n_train, :].copy()
X_test  = all_data.iloc[n_train:, :].copy()

In [30]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)



In [31]:
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_absolute_error, roc_auc_score

model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_tr, y_tr)


[LightGBM] [Info] Number of positive: 2670, number of negative: 16149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 18819, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.141878 -> initscore=-1.799780
[LightGBM] [Info] Start training from score -1.799780


In [32]:
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]  # probability of class 1

# Since Zindi uses MAE
val_mae = mean_absolute_error(y_val, y_val_pred)
print("Validation MAE:", val_mae)

# Optional: ROC-AUC for additional insight
roc_auc = roc_auc_score(y_val, y_val_prob)
print("Validation ROC-AUC:", roc_auc)


Validation MAE: 0.11073326248671626
Validation ROC-AUC: 0.8660347961966628


In [34]:
model.fit(X_train, y)  # train on all training data
test_preds = model.predict(X_test)




[LightGBM] [Info] Number of positive: 3312, number of negative: 20212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 173
[LightGBM] [Info] Number of data points in the train set: 23524, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.140792 -> initscore=-1.808724
[LightGBM] [Info] Start training from score -1.808724


In [None]:
# Create submission DataFrame
submission = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"],
                           "bank_account": test.bank_account})

In [46]:
# test_id: Series of unique IDs
# test_country: Series of country names corresponding to test IDs
test_country = test["country"].copy()  # make sure you have this column

# Prepare submission
submission = pd.DataFrame({
    "unique_id": test_id.astype(str) + " x " + test_country.astype(str),
    "bank_account": test_preds
})

submission.to_csv("submission.csv", index=False)
print("✅ Submission saved!")


✅ Submission saved!


In [47]:
submission.head()

Unnamed: 0,unique_id,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0


In [48]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>