In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import imblearn
from scipy.stats import chi2_contingency
from matplotlib.ticker import PercentFormatter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from scipy import stats
import heapq

In [3]:
df = pd.read_csv('data/train.csv')

In [4]:
df.columns

Index(['country', 'year', 'uniqueid', 'bank_account', 'location_type',
       'cellphone_access', 'household_size', 'age_of_respondent',
       'gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


### Chi-Square Test

- If the p-value is less than or equal to 0.05 , it is typically considered statistically significant
- If the p-value is greater than 0.05, it is generally considered not statistically significant


In [7]:
# Select the categorical columns
categorical_columns = ['country', 'location_type', 'cellphone_access',
                       'gender_of_respondent', 'relationship_with_head',
                       'marital_status', 'education_level', 'job_type']

# Perform chi-square test for each categorical feature
results = []
for feature in categorical_columns:
    contingency_table = pd.crosstab(df[feature], df['bank_account'])
    _, p_value, _, _ = chi2_contingency(contingency_table)
    results.append((feature, p_value))

# Sort the results by p-value in ascending order
results.sort(key=lambda x: x[1])

# Print the features and their p-values
for feature, p_value in results:
    print(f"Feature: {feature}, p-value: {p_value}")

Feature: education_level, p-value: 0.0
Feature: job_type, p-value: 0.0
Feature: cellphone_access, p-value: 1.3592541989722424e-226
Feature: country, p-value: 2.5290257400600255e-181
Feature: gender_of_respondent, p-value: 3.884875437093037e-72
Feature: relationship_with_head, p-value: 9.9629019270655e-69
Feature: location_type, p-value: 9.218844615938627e-41
Feature: marital_status, p-value: 1.03957918709173e-40


### Information Gain/Mutual Information

In [17]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Select the categorical columns
categorical_columns = ['country', 'location_type', 'cellphone_access',
                       'gender_of_respondent', 'relationship_with_head',
                       'marital_status', 'education_level', 'job_type']

# Perform one-hot encoding on the categorical features
encoded_df = pd.get_dummies(df[categorical_columns])

# Compute the mutual information for each categorical feature
mutual_info = mutual_info_classif(encoded_df, df['bank_account'])

# Create a DataFrame with feature names and mutual information scores
feature_scores = pd.DataFrame({'Feature': encoded_df.columns, 'Mutual Information': mutual_info})

# Sort the features by mutual information scores in descending order
feature_scores = feature_scores.sort_values('Mutual Information', ascending=False)

# Print the feature scores
print(feature_scores)

                                            Feature  Mutual Information
7                              cellphone_access_Yes            0.033895
6                               cellphone_access_No            0.030739
30               job_type_Formally employed Private            0.020597
0                                     country_Kenya            0.018550
25               education_level_Tertiary education            0.018420
29            job_type_Formally employed Government            0.018389
26  education_level_Vocational/Specialised training            0.016744
23                education_level_Primary education            0.016193
21              education_level_No formal education            0.013662
32                     job_type_Informally employed            0.011679
9                         gender_of_respondent_Male            0.010986
11         relationship_with_head_Head of Household            0.010043
4                               location_type_Rural            0

### Feature Importance in Decision Trees

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Select the categorical columns
categorical_columns = ['country', 'location_type', 'cellphone_access',
                       'gender_of_respondent', 'relationship_with_head',
                       'marital_status', 'education_level', 'job_type']

# One-hot encode the categorical columns
encoded_df = pd.get_dummies(df[categorical_columns])

# Create the target variable
target = df['bank_account']

# Train a random forest classifier
rf = RandomForestClassifier()
rf.fit(encoded_df, target)

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame with feature names and importances
feature_scores = pd.DataFrame({'Feature': encoded_df.columns, 'Importance': feature_importances})

# Sort the features by importance in descending order
feature_scores = feature_scores.sort_values(by='Importance', ascending=False)

# Print the features and their importances
print(feature_scores)

                                            Feature  Importance
30               job_type_Formally employed Private    0.082109
25               education_level_Tertiary education    0.081191
29            job_type_Formally employed Government    0.066830
26  education_level_Vocational/Specialised training    0.061796
0                                     country_Kenya    0.049113
23                education_level_Primary education    0.044767
6                               cellphone_access_No    0.036793
11         relationship_with_head_Head of Household    0.034301
24              education_level_Secondary education    0.033562
2                                  country_Tanzania    0.032810
7                              cellphone_access_Yes    0.031684
36                           job_type_Self employed    0.030874
4                               location_type_Rural    0.030295
5                               location_type_Urban    0.029175
21              education_level_No forma

### LASSO Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Select the categorical columns
categorical_columns = ['country', 'location_type', 'cellphone_access',
                       'gender_of_respondent', 'relationship_with_head',
                       'marital_status', 'education_level', 'job_type']

# One-hot encode the categorical columns
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[categorical_columns])

# Create the target variable
target = df['bank_account']

# Fit LASSO logistic regression
lasso = LogisticRegression(penalty='l1', solver='liblinear')
lasso.fit(encoded_features, target)

# Get feature names and coefficients
feature_names = encoder.get_feature_names_out(categorical_columns)
feature_coefficients = lasso.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_scores = pd.DataFrame({'Feature': feature_names, 'Coefficient': feature_coefficients})

# Sort the features by absolute coefficient values in descending order
feature_scores['Absolute Coefficient'] = feature_scores['Coefficient'].abs()
feature_scores = feature_scores.sort_values(by='Absolute Coefficient', ascending=False)

# Print the features and their coefficients
print(feature_scores)



                                            Feature  Coefficient   
21              education_level_No formal education    -2.333983  \
6                               cellphone_access_No    -1.744520   
23                education_level_Primary education    -1.651080   
29            job_type_Formally employed Government     1.283055   
3                                    country_Uganda    -1.084932   
30               job_type_Formally employed Private     1.056519   
31                    job_type_Government Dependent     1.032054   
33                               job_type_No Income    -1.002302   
32                     job_type_Informally employed    -0.848613   
11         relationship_with_head_Head of Household     0.847375   
2                                  country_Tanzania    -0.767517   
24              education_level_Secondary education    -0.720220   
35                    job_type_Remittance Dependent    -0.654108   
15                    relationship_with_head_Spo