In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

In [2]:
train_pd = pd.read_csv('train_data.csv')
test_pd = pd.read_csv('test_data.csv')

In [3]:
test_pd.columns

Index(['ID', 'Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode',
       'BranchID', 'Area', 'Tenure', 'AssetCost', 'AmountFinance',
       'DisbursalAmount', 'EMI', 'AuthDate', 'AssetID', 'SupplierID', 'LTV',
       'MonthlyIncome', 'City', 'State', 'DisbursalYear', 'MaturityYear'],
      dtype='object')

In [4]:
X_train = train_pd.drop(['ID', 'Top-up Month', 'AuthDate', 'City', 'Area', 'BranchID', 'SupplierID', 'AssetID'], axis=1)
y_train = train_pd['Top-up Month']
X_test = test_pd.drop(['ID', 'AuthDate', 'City', 'Area', 'BranchID', 'SupplierID', 'AssetID'], axis=1)

In [5]:
cat_features = list(X_test.dtypes[X_test.dtypes == 'object'].index)

In [6]:
column_trans = make_column_transformer(
    (OrdinalEncoder(), ['Frequency']),
    (OneHotEncoder(), cat_features[0:]),
    remainder='passthrough'
)
logreg = LogisticRegression(solver='lbfgs')

In [7]:
pipe = make_pipeline(column_trans, logreg)

In [8]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['Frequency']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Frequency', 'InstlmentMode',
                                                   'LoanStatus', 'PaymentMode',
                                                   'State'])])),
                ('logisticregression', LogisticRegression())])

In [9]:
y_pred = pipe.predict(X_train)

In [10]:
print('Macro F1:', f1_score(y_train, y_pred, average='macro'))

Macro F1: 0.12951550514652857


In [11]:
X_train.dtypes

Frequency           object
InstlmentMode       object
LoanStatus          object
PaymentMode         object
Tenure               int64
AssetCost            int64
AmountFinance      float64
DisbursalAmount    float64
EMI                float64
LTV                float64
MonthlyIncome      float64
State               object
DisbursalYear        int64
MaturityYear       float64
dtype: object