In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!pip install category_encoders lightautoml

In [None]:
!pip install scipy==1.10.1
!pip install --upgrade statsmodels

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [5]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task



In [6]:
def predict_model(automl, df_test):
  proba = automl.predict(df_test).data
  classes = automl.targets_order
  pred = [classes[i] for i in proba.argmax(1)]
  return pred

Generate additional features

In [7]:
def generate_features(df):
  df['dti'] = df['current_total_liability'] / (df['estimated_annual_income'] + 1e-6)
  df['save_rate'] = df['monthly_investments'] / (df['monthly_gig_income'] + 1e-6)
  df['cash_rate'] = df['end_of_month_balance'] / (df['monthly_gig_income'] + 1e-6)
  df['liability_to_balance'] = df['current_total_liability'] / (df['end_of_month_balance'] + 1e-6)
  df['investment_share'] = df['monthly_investments'] / (df['monthly_gig_income'] + 1e-6)
  df['liability_per_card'] = df['current_total_liability'] / (df['num_credit_cards'] + 1e-6)
  df['utilization_per_card'] = df['credit_utilization_rate'] / (df['num_credit_cards'] + 1e-6)
  return df

In [8]:
df_test = pd.read_csv('/gdrive/My Drive/financial_stress/test.csv', index_col='Unnamed: 0')
df_train = pd.read_csv('/gdrive/My Drive/financial_stress/train.csv', index_col='Unnamed: 0')

In [9]:
drop_cols = ['survey_month', 'worker_id']
cat_cols = ['job_sector', 'credit_age_months', 'min_payment_flag', 'spending_behavior']
num_cols = ['worker_age', 'estimated_annual_income', 'monthly_gig_income', 'num_savings_accounts',
            'num_credit_cards', 'avg_credit_interest', 'num_active_loans', 'avg_loan_delay_days',
            'missed_payment_events', 'recent_credit_checks', 'current_total_liability', 'credit_utilization_rate',
            'monthly_investments', 'end_of_month_balance',
            'dti', 'save_rate', 'cash_rate', 'liability_to_balance', 'investment_share',
            'liability_per_card', 'utilization_per_card']

In [10]:
df_train = generate_features(df_train)
df_test = generate_features(df_test)

Perform data preprocessing. We are working with time-based data (employees’ transaction records are available for multiple months). Everything before August is used for training, and August is reserved for testing. This simulates a real scenario where the model must make predictions on a “future period” it has not seen before.

In [11]:
train_data = df_train[df_train['survey_month'] != 'August']
test_data = df_train[df_train['survey_month'] == 'August']

train_data_agg = train_data[num_cols + ['worker_id']].groupby('worker_id').agg(['mean', 'median', 'sum', 'min', 'max'])
train_data_agg.columns = [str(i) for i in train_data_agg.columns]

train_data = train_data.merge(train_data_agg, on='worker_id', how='left')
test_data = test_data.merge(train_data_agg, on='worker_id', how='left')

df_test = df_test.merge(train_data_agg, on='worker_id', how='left')


train_data = train_data.drop(drop_cols, axis=1)
test_data = test_data.drop(drop_cols, axis=1)

In [12]:
train_data[num_cols] = train_data[num_cols].fillna('mean')
test_data[num_cols] = test_data[num_cols].fillna('mean')
df_test[num_cols] = df_test[num_cols].fillna('mean')


train_data[cat_cols] = train_data[cat_cols].fillna('Unknown')
test_data[cat_cols] = test_data[cat_cols].fillna('Unknown')
df_test[cat_cols] = df_test[cat_cols].fillna('Unknown')

In [13]:
roles = {
    'target': 'financial_stress_level',

}

In [14]:
task = Task('multiclass', metric='accuracy')
automl = TabularAutoML(task = task, timeout=400, general_params={'use_algos': ['lgb', 'cb', 'linear']})
oof = automl.fit_predict(train_data, roles=roles)

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: multiclass

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 400.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (48000, 131)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 366.66 secs
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 200 rounds
INFO3:lightautoml.ml_algo.boost_lgbm:[100]	valid's multi_error: 0.254375
INFO3:lightautoml.ml_algo.boost_lgbm:[200]	valid's multi_error: 0.221667
INFO3:lightautoml.ml_algo.boost_lgbm:[300]	valid's multi_error: 0.203854
INFO3:lightautoml.ml_algo.boost_lgbm:[400]	valid's multi_error: 0.197292

In [15]:
proba = automl.predict(test_data).data
classes = automl.targets_order
pred = [classes[i] for i in proba.argmax(1)]

In [16]:
accuracy_score(test_data['financial_stress_level'], pred)

0.7245

In [20]:
train_data['financial_stress_level'].value_counts()/len(train_data)

Unnamed: 0_level_0,count
financial_stress_level,Unnamed: 1_level_1
Moderate,0.528854
Low,0.293896
High,0.17725


In [23]:
test_data['financial_stress_level'].value_counts()/len(train_data)

Unnamed: 0_level_0,count
financial_stress_level,Unnamed: 1_level_1
Moderate,0.084729
Low,0.0485
High,0.033438


In [24]:
fi = automl.get_feature_scores()
print(fi.sort_values("Importance", ascending=False).head(20))

                                  Feature     Importance
0          ('avg_credit_interest', 'max')  100291.614071
1      ('current_total_liability', 'max')   84380.932564
2         ('recent_credit_checks', 'min')   29330.327651
3       ('avg_loan_delay_days', 'median')   21157.197780
4       ('missed_payment_events', 'mean')   17327.668470
5     ('missed_payment_events', 'median')   13851.425909
6         ('liability_to_balance', 'min')   13099.685820
7      ('num_savings_accounts', 'median')   11475.705418
8             ('investment_share', 'min')   11009.467229
9      ('credit_utilization_rate', 'max')   10862.593880
10                credit_utilization_rate   10858.893663
11            ('num_credit_cards', 'min')   10376.359413
12            ('investment_share', 'max')   10372.476540
13         ('monthly_investments', 'min')   10250.544048
14        ('utilization_per_card', 'max')   10115.311687
15     ('credit_utilization_rate', 'min')    9824.164423
16       ('missed_payment_event