This code is inspired by [Atif Hassan's](https://github.com/atif-hassan/Competition-code/blob/master/AnalyticsVidhya/Machine%20Learning%20for%20Banking/code/predict.ipynb) approach.

# Usual imports

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Pre-processing

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Every step below assigns the result back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: in-place mutation through a column
# selection is deprecated and silently does nothing under pandas Copy-on-Write.
for frame in (df_train, df_test):
    # Convert to numeric: strip the comma separators before parsing.
    frame["Loan_Amount_Requested"] = pd.to_numeric(
        frame["Loan_Amount_Requested"].str.replace(",", "", regex=False)
    )

    # Missing categorical values become their own explicit 'NaN' category.
    for column in ("Length_Employed", "Home_Owner", "Purpose_Of_Loan", "Gender"):
        frame[column] = frame[column].fillna("NaN")

# Drop loan ids (kept aside for the test set so predictions can be matched back).
df_train = df_train.drop(["Loan_ID"], axis=1)
loan_ids = df_test["Loan_ID"].values
df_test = df_test.drop(["Loan_ID"], axis=1)

# Label Encode: the encoder is re-fitted on the training column each time and the
# same mapping is applied to the test column.
le = LabelEncoder()
for column in ("Length_Employed", "Home_Owner", "Income_Verified", "Purpose_Of_Loan", "Gender"):
    df_train[column] = le.fit_transform(df_train[column])
    df_test[column] = le.transform(df_test[column])

for frame in (df_train, df_test):
    # Fill NaN with mean (each frame uses its own mean, as before).
    frame["Annual_Income"] = frame["Annual_Income"].fillna(frame["Annual_Income"].mean())

    # Assumption: if it is NaN, the user has no delinquency, so set it to 0.
    frame["Months_Since_Deliquency"] = frame["Months_Since_Deliquency"].fillna(0)

df_train.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,7000,10,5,68000.0,2,0,18.37,0,0.0,9,14,0,1
1,30000,4,0,73331.159434,0,2,14.93,0,17.0,12,24,0,3
2,24725,7,0,75566.4,1,2,15.88,0,0.0,12,16,1,3
3,16000,10,1,56160.0,1,2,14.34,3,0.0,16,22,1,3
4,17000,8,4,96000.0,1,2,22.17,1,0.0,19,30,0,1


In [3]:
# Sanity check after preprocessing: column dtypes and non-null counts.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_Amount_Requested    164309 non-null  int64  
 1   Length_Employed          164309 non-null  int64  
 2   Home_Owner               164309 non-null  int64  
 3   Annual_Income            164309 non-null  float64
 4   Income_Verified          164309 non-null  int64  
 5   Purpose_Of_Loan          164309 non-null  int64  
 6   Debt_To_Income           164309 non-null  float64
 7   Inquiries_Last_6Mo       164309 non-null  int64  
 8   Months_Since_Deliquency  164309 non-null  float64
 9   Number_Open_Accounts     164309 non-null  int64  
 10  Total_Accounts           164309 non-null  int64  
 11  Gender                   164309 non-null  int64  
 12  Interest_Rate            164309 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 16.3 MB


# LightGBM Model

In [4]:
# Split the training frame into a feature matrix and a target vector, as plain
# NumPy arrays. Column order is preserved, which matters because the model cells
# refer to categorical features by position.
X_train = df_train.drop(columns=["Interest_Rate"]).values
Y_train = df_train["Interest_Rate"].values
X_test = df_test.values

X_train.shape, Y_train.shape, X_test.shape

((164309, 12), (164309,), (109541, 12))

In [5]:
# 5-fold cross-validation. SMOTE is fitted on the training part of each fold only,
# so every validation split is scored on original (non-synthetic) rows.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for fit_idx, val_idx in kfold.split(X_train):
    x_train, y_train = X_train[fit_idx], Y_train[fit_idx]
    x_test, y_test = X_train[val_idx], Y_train[val_idx]

    # Oversample every class: class 1 is doubled, classes 2 and 3 grow by 60%.
    class_counts = Counter(y_train)
    oversample_factor = {1: 2.0, 2: 1.6, 3: 1.6}
    target_sizes = {
        label: int(factor * class_counts[label])
        for label, factor in oversample_factor.items()
    }
    sm = SMOTE(random_state=27, sampling_strategy=target_sizes)
    x_train, y_train = sm.fit_resample(x_train, y_train)

    model = LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
    # Positions of the label-encoded columns in the feature matrix.
    model.fit(x_train, y_train, categorical_feature=[1, 2, 4, 5, 11])

    preds = model.predict(x_test)
    score = f1_score(y_test, preds, average="weighted")
    scores.append(score)
    print(score)
print("Average: ", sum(scores)/len(scores))

0.5368406292231035
0.5329528456116052
0.5354034675626155
0.5356220996921175
0.5340578715783949
Average:  0.5349753827335674


In [7]:
# Fit the final model on the full training set.
# SMOTE is applied to all classes (class 1 doubled, classes 2 and 3 grown by 60%),
# increasing the sample size of each class with the aim of a more general
# decision boundary.
# NOTE(review): SMOTE runs over every column, including the label-encoded ones
# that are then declared categorical to LightGBM; if synthetic rows end up with
# non-integer codes, SMOTENC may be a better fit. Confirm before changing.
class_counts = Counter(Y_train)
oversample_factor = {1: 2.0, 2: 1.6, 3: 1.6}
target_sizes = {
    label: int(factor * class_counts[label])
    for label, factor in oversample_factor.items()
}
sm = SMOTE(random_state=27, sampling_strategy=target_sizes)
X_train_, Y_ = sm.fit_resample(X_train, Y_train)

model = LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
# Positions of the label-encoded columns in the feature matrix.
model.fit(X_train_, Y_, categorical_feature=[1, 2, 4, 5, 11])

# Per-class probabilities for every test row.
preds2 = model.predict_proba(X_test)

In [8]:
# Inspect the predicted class probabilities (one row per test sample).
preds2

array([[0.1947634 , 0.32905743, 0.47617917],
       [0.47235884, 0.45782208, 0.06981908],
       [0.17364408, 0.43311972, 0.3932362 ],
       ...,
       [0.23256564, 0.56597307, 0.20146129],
       [0.03827737, 0.45957147, 0.50215116],
       [0.18933896, 0.45408846, 0.35657257]])

In [9]:
# Turn the probabilities into class labels. `model.predict` picks the most probable
# class per row and maps it back through the labels the model was fitted on, so
# this no longer hard-codes "column index + 1" as the label. It also does not read
# the previous value of `preds2`, so the cell can be re-run safely.
preds2 = model.predict(X_test)

In [11]:
# Inspect the final predicted class labels.
preds2

array([3, 1, 2, ..., 2, 3, 2])

In [10]:
# Assemble the submission table: one predicted Interest_Rate per test Loan_ID.
pred15 = pd.DataFrame({"Loan_ID": loan_ids, "Interest_Rate": preds2})

In [None]:
# Uncomment to write the submission file to disk (left disabled so that re-running
# the notebook does not overwrite an existing submission).
# pred15.to_csv('submission15.csv', index=False)