In [16]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# import classification libraries
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

# cross validation and metrics libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# supress warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
def load_dataset():
    train_df = pd.read_csv('train_fNxu4vz.csv')
    test_df = pd.read_csv('test_fjtUOL8.csv')
    return (train_df, test_df)

In [10]:
def treat_missing_values(df):
    df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].str.replace(',','').astype(float)
    df['Length_Employed'].fillna(df['Length_Employed'].mode()[0], inplace=True)
    df['Home_Owner'].fillna(df['Home_Owner'].mode()[0], inplace=True)
    df['Annual_Income'].fillna(df.groupby('Length_Employed')['Annual_Income'].transform('mean'), inplace=True)
    df['Months_Since_Deliquency'] = df['Months_Since_Deliquency'].map(lambda x: 0 if x>0 else 1)
    return df

In [11]:
def preprocess(df):
    df['Gender'] = df['Gender'].map(lambda x: 0 if x=='Female' else 1)
    df['Length_Employed'] = df['Length_Employed'].apply(
        lambda x: x.replace('< 1', '0')).str.extract('(\d+)').astype(int)
    categorical_cols = list(df.select_dtypes(object).columns)
    dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop(columns=categorical_cols, axis=1, inplace=True)
    
    return df

In [12]:
train_df, test_df = load_dataset()
train_df.shape, test_df.shape

((164309, 14), (109541, 13))

In [13]:
train_df, test_df = treat_missing_values(train_df), treat_missing_values(test_df)

In [14]:
train_df, test_df = preprocess(train_df), preprocess(test_df)

In [15]:
train_df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,...,Purpose_Of_Loan_home_improvement,Purpose_Of_Loan_house,Purpose_Of_Loan_major_purchase,Purpose_Of_Loan_medical,Purpose_Of_Loan_moving,Purpose_Of_Loan_other,Purpose_Of_Loan_renewable_energy,Purpose_Of_Loan_small_business,Purpose_Of_Loan_vacation,Purpose_Of_Loan_wedding
0,10000001,7000.0,0,68000.0,18.37,0,1,9,14,0,...,0,0,0,0,0,0,0,0,0,0
1,10000002,30000.0,4,71150.55752,14.93,0,0,12,24,0,...,0,0,0,0,0,0,0,0,0,0
2,10000003,24725.0,7,75566.4,15.88,0,1,12,16,1,...,0,0,0,0,0,0,0,0,0,0
3,10000004,16000.0,0,56160.0,14.34,3,1,16,22,1,...,0,0,0,0,0,0,0,0,0,0
4,10000005,17000.0,8,96000.0,22.17,1,1,19,30,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
scaler = MinMaxScaler()

columns = ['Loan_Amount_Requested', 'Length_Employed', 'Annual_Income', 'Debt_To_Income', 'Inquiries_Last_6Mo', 
           'Number_Open_Accounts', 'Total_Accounts']
train_df[columns] = scaler.fit_transform(train_df[columns])
test_df[columns] = scaler.transform(test_df[columns])

In [18]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['Interest_Rate', 'Loan_ID'], axis=1)
y = train_df['Interest_Rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [22]:
%%time
rf_clf = RandomForestClassifier()
scores = cross_val_score(rf_clf, X_train, y_train, cv=3, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for RandomForest : ', scores.mean())

F1_score for RandomForest :  0.5033808152060413
CPU times: user 56.3 ms, sys: 29.3 ms, total: 85.5 ms
Wall time: 22.4 s


In [23]:
%%time
from sklearn.svm import LinearSVC

lsvm_clf = LinearSVC()
scores = cross_val_score(lsvm_clf, X_train, y_train, cv=3, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for Linear SVC : ', scores.mean())

F1_score for Linear SVC :  0.4825872625905288
CPU times: user 52.3 ms, sys: 25.4 ms, total: 77.7 ms
Wall time: 20.3 s


In [25]:
%%time
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(objective='multi:softmax')
scores = cross_val_score(xgb_clf, X_train, y_train, cv=3, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for XGBoost : ', scores.mean())

F1_score for XGBoost :  0.5095226166165469
CPU times: user 57.9 ms, sys: 30.2 ms, total: 88.1 ms
Wall time: 53.5 s


In [29]:
%%time
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(multi_class='multinomial')
scores = cross_val_score(lr_clf, X_train, y_train, cv=5, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for LogReg : ', scores.mean())

F1_score for LogReg :  0.5014696144760789
CPU times: user 80 ms, sys: 41.4 ms, total: 121 ms
Wall time: 7.94 s


In [31]:
%%time
from catboost import CatBoostClassifier

catb_clf = CatBoostClassifier()
scores = cross_val_score(catb_clf, X_train, y_train, cv=5, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for CatBoost : ', scores.mean())

F1_score for CatBoost :  0.5303345625235878
CPU times: user 82.8 ms, sys: 50.3 ms, total: 133 ms
Wall time: 3min 6s


In [32]:
test_df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,...,Purpose_Of_Loan_home_improvement,Purpose_Of_Loan_house,Purpose_Of_Loan_major_purchase,Purpose_Of_Loan_medical,Purpose_Of_Loan_moving,Purpose_Of_Loan_other,Purpose_Of_Loan_renewable_energy,Purpose_Of_Loan_small_business,Purpose_Of_Loan_vacation,Purpose_Of_Loan_wedding
0,10164310,0.782609,1.0,0.016676,0.32183,0.0,0,0.131579,0.227273,1,...,0,0,0,0,0,0,0,0,0,0
1,10164311,0.73913,1.0,0.014141,0.284321,0.0,1,0.078947,0.136364,1,...,0,0,0,0,0,0,0,0,0,0
2,10164312,0.161594,0.0,0.009472,0.170793,0.25,1,0.065789,0.116883,1,...,0,0,0,0,0,0,0,0,0,0
3,10164313,0.333333,1.0,0.009205,0.194049,0.0,1,0.078947,0.038961,1,...,0,0,0,0,0,0,0,0,0,0
4,10164314,1.0,0.0,0.020277,0.24056,0.0,0,0.118421,0.123377,1,...,0,0,0,0,0,0,0,0,0,0


In [42]:
def submit(model, name):
    model.fit(X_train, y_train, verbose=False)
    results = model.predict(test_df.drop('Loan_ID', axis=1))
    data = { 'Loan_ID': test_df['Loan_ID'], 'Interest_Rate': results}
    submission = pd.DataFrame(data)
    submission.to_csv(name, index=False)
    return True

In [43]:
submit(catb_clf, 'catboost.csv')

Exception: Data must be 1-dimensional

In [44]:
catb_clf.fit(X_train, y_train, verbose=False)
results = catb_clf.predict(test_df.drop('Loan_ID', axis=1))

In [47]:
[i[0] for i in r]

array([[3],
       [1],
       [3],
       ...,
       [2],
       [3],
       [2]])

In [None]:
data = { 'Loan_ID': test_df['Loan_ID'], 'Interest_Rate': results}
submission = pd.DataFrame(data)
submission.to_csv(name, index=False)

In [None]:
test_loan_ids = test_df['Loan_ID']


train_df.drop('Loan_ID', axis=1, inplace=True)
test_df.drop('Loan_ID', axis=1, inplace=True)



train_df.describe()

test_df.describe()

train_df['Months_Since_Deliquency'].value_counts(normalize=True)

train_df['Interest_Rate'].value_counts(normalize=True).sum()



In [10]:
# evaluate a model
def evaluate_model(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring=make_scorer(f1_score, average='weighted'), cv=cv, n_jobs=-1)
    return scores

In [None]:
model = RandomForestClassifier()
model.fit()

In [None]:
# define models to test
def get_models():
    models, names = list(), list()
    models.append(KNeighborsClassifier())
    names.append('KNN')
    # Bagging
    models.append(BaggingClassifier(n_estimators=1000))
    names.append('BAG')
    # RF
    models.append(RandomForestClassifier(n_estimators=1000))
    names.append('RF')
    # ET
    models.append(ExtraTreesClassifier(n_estimators=1000))
    names.append('ET')
    return models, names

# define models
models, names = get_models()
results = list()

# evaluate each model
for i in range(len(models)):
    # evaluate the model and store results
    scores = evaluate_model(X_train, y_train, models[i])
    results.append(scores)
    # summarize performance
    print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))