This code is inspired by [Atif Hassan's](https://github.com/atif-hassan/Competition-code/blob/master/AnalyticsVidhya/Machine%20Learning%20for%20Banking/code/predict.ipynb) approach.

# Usual imports

In [12]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Pre-processing

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Keep the test identifiers for the submission file before the column is dropped.
loan_ids = df_test["Loan_ID"].values

# Categorical columns whose missing values become their own explicit 'NaN' category.
CATEGORICAL_COLS = ["Length_Employed", "Home_Owner", "Income_Verified", "Purpose_Of_Loan", "Gender"]


def _clean_loans(frame, income_fill):
    """Return a cleaned copy of a raw loans frame (train or test).

    Steps, applied identically to both splits:
      * drop the ``Loan_ID`` identifier column,
      * strip thousands separators from ``Loan_Amount_Requested`` and make it numeric,
      * fill missing categorical values with the literal string ``'NaN'``,
      * fill missing ``Annual_Income`` with ``income_fill``,
      * fill missing ``Months_Since_Deliquency`` with 0.

    Parameters
    ----------
    frame : pd.DataFrame
        Raw frame as read from CSV; it is not modified.
    income_fill : float
        Value used to impute missing ``Annual_Income``.

    Returns
    -------
    pd.DataFrame
        A new frame with the transformations applied.
    """
    cleaned = frame.drop(columns=["Loan_ID"])

    # Convert to numeric: amounts are stored as strings such as "24,725".
    cleaned["Loan_Amount_Requested"] = pd.to_numeric(
        cleaned["Loan_Amount_Requested"].str.replace(",", "", regex=False)
    )

    fill_values = {col: "NaN" for col in CATEGORICAL_COLS}
    fill_values["Annual_Income"] = income_fill
    # Assumption: if it is NaN, the user has no delinquency, so set it to 0.
    fill_values["Months_Since_Deliquency"] = 0

    # A single frame-level fillna returning a new frame replaces the former
    # `df[col].fillna(..., inplace=True)` calls, which operate on a temporary
    # Series and do not update the frame under pandas Copy-on-Write.
    return cleaned.fillna(fill_values)


# The imputation statistic is learned from the training data only and reused
# for the test set, so both splits get the same fill value.
train_income_mean = df_train["Annual_Income"].mean()

df_train = _clean_loans(df_train, train_income_mean)
df_test = _clean_loans(df_test, train_income_mean)

df_train.head()


Unnamed: 0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,0.0,9,14,Female,1
1,30000,4 years,Mortgage,73331.159434,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,0.0,12,16,Male,3
3,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,0.0,16,22,Male,3
4,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,0.0,19,30,Female,1


In [3]:
# Show the non-null count and dtype of every column in the cleaned training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_Amount_Requested    164309 non-null  int64  
 1   Length_Employed          164309 non-null  object 
 2   Home_Owner               164309 non-null  object 
 3   Annual_Income            164309 non-null  float64
 4   Income_Verified          164309 non-null  object 
 5   Purpose_Of_Loan          164309 non-null  object 
 6   Debt_To_Income           164309 non-null  float64
 7   Inquiries_Last_6Mo       164309 non-null  int64  
 8   Months_Since_Deliquency  164309 non-null  float64
 9   Number_Open_Accounts     164309 non-null  int64  
 10  Total_Accounts           164309 non-null  int64  
 11  Gender                   164309 non-null  object 
 12  Interest_Rate            164309 non-null  int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 16.3+ MB


# CatBoost Model

In [4]:
# Separate the predictors from the target and convert everything to NumPy arrays.
target_col = "Interest_Rate"
feature_cols = [col for col in df_train.columns if col != target_col]

X_train, Y_train = df_train[feature_cols].values, df_train[target_col].values

# Select the test columns by name, in training order. The arrays carry no column
# names and the categorical features are later given as positional indices, so
# both matrices must share the same column layout. A missing column raises
# KeyError here instead of silently misaligning features.
X_test = df_test[feature_cols].values

X_train.shape, Y_train.shape, X_test.shape

((164309, 12), (164309,), (109541, 12))

In [5]:
# 5-fold cross-validation of the CatBoost classifier, scored with weighted F1.
# Positional indices of the object-dtype columns in X_train: Length_Employed,
# Home_Owner, Income_Verified, Purpose_Of_Loan and Gender.
cat_feature_idx = [1, 2, 4, 5, 11]

kfold, scores = KFold(n_splits=5, shuffle=True, random_state=0), list()
for train, test in kfold.split(X_train):
    # `train` / `test` are row-index arrays for the current fold.
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y_train[train], Y_train[test]

    # A fresh model per fold so no state carries over between folds.
    model = CatBoostClassifier(random_state=27, max_depth=4, task_type="GPU", devices="0:1", n_estimators=1000, verbose=500)
    model.fit(x_train, y_train, cat_features=cat_feature_idx)
    preds = model.predict(x_test)
    score = f1_score(y_test, preds, average="weighted")
    scores.append(score)
    print(score)
print("Average: ", sum(scores)/len(scores))

<generator object _BaseKFold.split at 0x7f31c92e78d0>
Learning rate set to 0.175594
0:	learn: 1.0809270	total: 31.9ms	remaining: 31.8s
500:	learn: 0.9056365	total: 14.7s	remaining: 14.7s
999:	learn: 0.8935474	total: 28.8s	remaining: 0us
0.5356875530293171
<generator object _BaseKFold.split at 0x7f31c92e7cd0>
Learning rate set to 0.175594
0:	learn: 1.0808188	total: 30.5ms	remaining: 30.5s
500:	learn: 0.9044622	total: 14.2s	remaining: 14.2s
999:	learn: 0.8920800	total: 27.8s	remaining: 0us
0.5343847656372573
<generator object _BaseKFold.split at 0x7f31c92e7cd0>
Learning rate set to 0.175594
0:	learn: 1.0812283	total: 28.5ms	remaining: 28.4s
500:	learn: 0.9047396	total: 16.6s	remaining: 16.5s
999:	learn: 0.8927573	total: 32.3s	remaining: 0us
0.5327531394585924
<generator object _BaseKFold.split at 0x7f31c92e7cd0>
Learning rate set to 0.175594
0:	learn: 1.0808246	total: 67.1ms	remaining: 1m 6s
500:	learn: 0.9055686	total: 29.6s	remaining: 29.5s
999:	learn: 0.8928687	total: 57.9s	remaining:

In [6]:
# Refit on the complete training set with the same hyper-parameters used in
# cross-validation, then predict class probabilities for the test set.
cat_feature_idx = [1, 2, 4, 5, 11]  # positions of the categorical columns in X_train
final_params = dict(
    random_state=27,
    task_type="GPU",
    devices="0:1",
    n_estimators=1000,
    max_depth=4,
    verbose=500,
)

model = CatBoostClassifier(**final_params)
model.fit(X_train, Y_train, cat_features=cat_feature_idx)
preds1 = model.predict_proba(X_test)

Learning rate set to 0.183772
0:	learn: 1.0802058	total: 64ms	remaining: 1m 3s
500:	learn: 0.9064044	total: 33.3s	remaining: 33.2s
999:	learn: 0.8955923	total: 1m 2s	remaining: 0us


In [7]:
# Inspect the output of model.predict_proba(X_test).
preds1

array([[0.17328609, 0.40864082, 0.41807309],
       [0.52764357, 0.39569713, 0.0766593 ],
       [0.1592445 , 0.42389746, 0.41685804],
       ...,
       [0.24586958, 0.54846146, 0.20566897],
       [0.07056161, 0.32994468, 0.59949371],
       [0.18279974, 0.45401243, 0.36318782]])

In [8]:
# Convert the probability matrix into class labels by taking the most probable
# column per row. The +1 assumes the classes are the consecutive labels 1..K in
# column order - TODO confirm against model.classes_.
# The ndim guard keeps this cell idempotent: preds1 is overwritten, so re-running
# the cell on the already-converted 1-D labels would otherwise raise an axis error.
if np.ndim(preds1) == 2:
    preds1 = np.argmax(preds1, axis=1) + 1

In [9]:
# Inspect the predicted class labels after the argmax conversion.
preds1

array([3, 1, 2, ..., 2, 3, 2])

In [10]:
# Assemble the submission table: one predicted Interest_Rate label per test Loan_ID.
submission_columns = {"Loan_ID": loan_ids, "Interest_Rate": preds1}
pred14 = pd.DataFrame(submission_columns)

In [11]:
# Writing the submission file is intentionally left disabled; uncomment the
# line below to save pred14 (without the index) as CSV.
# pred14.to_csv('submission14.csv', index=False)