In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from category_encoders import HashingEncoder

In [2]:
train_pd = pd.read_csv('train_data.csv')
test_pd = pd.read_csv('test_data.csv')

In [3]:
train_pd.isnull().sum()

ID                 0
Frequency          0
InstlmentMode      0
LoanStatus         0
PaymentMode        0
BranchID           0
Area               0
Tenure             0
AssetCost          0
AmountFinance      0
DisbursalAmount    0
EMI                0
AssetID            0
SupplierID         0
LTV                0
MonthlyIncome      0
City               0
State              0
Top-up Month       0
DisbursalYear      0
MaturityYear       0
AuthYear           0
Report-Counts      0
dtype: int64

In [4]:
X_train = train_pd.drop(['ID', 'Top-up Month', 'BranchID', 'SupplierID', 'AssetID'], axis=1)
y_train = train_pd['Top-up Month']
X_test = test_pd.drop(['ID', 'BranchID', 'SupplierID', 'AssetID'], axis=1)

In [5]:
cat_features = list(X_test.dtypes[X_test.dtypes == 'object'].index)

In [6]:
column_trans = make_column_transformer(
    (OrdinalEncoder(), ['Frequency']),
    (HashingEncoder(), cat_features),
    remainder='passthrough'
)
logreg = LogisticRegression(solver='lbfgs')
xgb_cls = XGBClassifier() 

In [7]:
pipe = make_pipeline(column_trans, xgb_cls)

In [8]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['Frequency']),
                                                 ('hashingencoder',
                                                  HashingEncoder(max_process=4),
                                                  ['Frequency', 'InstlmentMode',
                                                   'LoanStatus', 'PaymentMode',
                                                   'Area', 'City',
                                                   'State'])])),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_byl...
                               importance_type='gain',
                               interaction_constraints='

In [9]:
y_pred = pipe.predict(X_train)

In [10]:
print('Macro F1:', f1_score(y_train, y_pred, average='macro'))

Macro F1: 0.32160461239598803


In [22]:
submission = pd.read_csv('submission.csv')
submission['Top-up Month'] = pipe.predict(X_test)
submission.to_csv('submission.csv', index=False)

In [23]:
submission

Unnamed: 0,ID,Top-up Month
0,4,No Top-up Service
1,5,No Top-up Service
2,6,No Top-up Service
3,25,No Top-up Service
4,119,No Top-up Service
...,...,...
14740,143396,> 48 Months
14741,143397,No Top-up Service
14742,143398,No Top-up Service
14743,143399,No Top-up Service
