In [19]:

from sklearn.base import TransformerMixin, BaseEstimator

from collections import Counter , defaultdict

import pandas as pd

pd.set_option('display.max_columns', None)


from pandas import Series as s , DataFrame as df
import numpy as np

import seaborn as sns
# import matplotlib.pyplot as plt

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt, rcParams as rc


%matplotlib inline
rc["figure.figsize"] = 10,6

import datetime
import datetime as dt

from sklearn.model_selection  import StratifiedKFold
from sklearn.decomposition import PCA

from iteration_utilities import duplicates, unique_everseen

import sys
from itertools import groupby
from operator import itemgetter
from timeit import timeit


## RandomOverSampler to handle imbalanced data
# from imblearn.over_sampling import RandomOverSampler # over sampling method 2

In [44]:

#Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.naive_bayes import MultinomialNB

from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

#preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler, normalize



from sklearn.model_selection import train_test_split

import lightgbm as lgb


#Metrics
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_recall_fscore_support

from sklearn.pipeline import Pipeline

# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [23]:
train = pd.read_csv("train_fNxu4vz.csv")

test = pd.read_csv("test_fjtUOL8.csv")
submission = pd.read_csv("sample_submission_HSqiq1Q.csv")

train.shape, test.shape, submission.shape

((164309, 14), (109541, 13), (109541, 2))

In [24]:
df = train.copy() 
df_test = test.copy() 

In [64]:
def get_percentage_miss_value(dataset):
    higher_miss_value_column = []
    total_miss_value_column = []
    miss_threshold_value = 50
    
    for i in dataset.columns:
        if dataset[i].isna().sum() > 1: 
            perectange_val = (dataset[i].isna().sum() / len(dataset)) * 100
            total_miss_value_column.append(i)
            print("Type ",dataset[i].dtype," Column-> " , i, ", missing value : ",dataset[i].isna().sum() , " & :         ", round(perectange_val,2) ," %")
                
            if(perectange_val > miss_threshold_value):
                higher_miss_value_column.append(i)
            
    print("\n\n\n")
        
    if higher_miss_value_column:
        print("Higher Missing values in Columns for Delete : ", higher_miss_value_column)
    else:
        print("There are no Higher Column Missing values in Dataset")
    
    return total_miss_value_column, higher_miss_value_column

In [65]:
get_percentage_miss_value(df)

Type  object  Column->  Length_Employed , missing value :  7371  & :          4.49  %
Type  object  Column->  Home_Owner , missing value :  25349  & :          15.43  %
Type  float64  Column->  Annual_Income , missing value :  25102  & :          15.28  %
Type  float64  Column->  Months_Since_Deliquency , missing value :  88379  & :          53.79  %




Higher Missing values in Columns for Delete :  ['Months_Since_Deliquency']


(['Length_Employed', 'Home_Owner', 'Annual_Income', 'Months_Since_Deliquency'],
 ['Months_Since_Deliquency'])

In [57]:
get_percentage_miss_value(df_test)

Column->  Length_Employed , total no of missing value :  4936  & :          4.51  %
Column->  Home_Owner , total no of missing value :  16711  & :          15.26  %
Column->  Annual_Income , total no of missing value :  16898  & :          15.43  %
Column->  Months_Since_Deliquency , total no of missing value :  58859  & :          53.73  %




Higher Missing values in Columns for Delete :  ['Months_Since_Deliquency']


(['Length_Employed', 'Home_Owner', 'Annual_Income', 'Months_Since_Deliquency'],
 ['Months_Since_Deliquency'])

In [28]:
def check_cloumn_details_type_numberical(dataset):
    for i in dataset.columns:
        if (dataset[i].dtype == "int"):
            print("Columns name :  ",i  )
            
            print(dict(Counter(dataset[i])))
            print("*"*100)
            print("\n")
            
            
def check_cloumn_details_type_float(dataset):
    for i in dataset.columns:
        if (dataset[i].dtype == "float"):
            print("Columns name :  ",i  )
            
            print(dict(Counter(dataset[i])))
            print("*"*100)
            print("\n")
            

def check_cloumn_details_type_categorical(dataset):
    for i in dataset.columns:
        if(dataset[i].dtype == "object"):
            print("Columns name :  ",i  )
            
            print(dict(Counter(dataset[i])))
            
            print("*"*100)
            print("\n")
            

In [29]:
# check_cloumn_details_type_categorical(df)

In [30]:
# check_cloumn_details_type_float(df)

In [31]:
def visualize_histogram(dataset):
    # plot histogram
    plt.figure(figsize=(25, 9))  # figure size in ratio 16:9
    features = dataset.columns  # list of columns name
    for i, j in enumerate(features):
        plt.subplot(3, 3, i + 1)  # create subplot for histogram
        plt.title("Histogram of {}".format(j), fontsize=15)  # title of histogram

        bins = len(dataset[j].unique())  # bins for histogram
        plt.hist(dataset[j], bins=bins, rwidth=0.8, edgecolor="y", linewidth=2, )  # plot histogram

    plt.subplots_adjust(hspace=0.5)  # space between horixontal axes (subplots)

In [32]:
# visualize_histogram(df)

In [33]:
# visualize_numberical_values(df)

In [34]:
#Label encoding
def convert_to_numerical_label_encoding(dataset):
    enc = LabelEncoder()
    for i in dataset.columns:
        if(dataset[i].dtype == "object"):
            dataset[i] = enc.fit_transform(dataset[i])
            
    return dataset

In [35]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    


In [36]:
def majority_imbalanced_dataset(dataset, target_col):
    feature_columns = dataset.columns.tolist()
    feature_columns = [c for c in feature_columns if c not in [target_col]]

    X2_new = dataset[feature_columns]
    Y2_new = dataset[target_col]

    os =  RandomOverSampler(random_state=35)
    X_feature_variables , y_output = os.fit_sample(X2_new, Y2_new)
    
    X_feature_variables[target_col] = y_output
    
    X_feature_variables = X_feature_variables.sample(frac = 1).reset_index(drop = True)
    
    return X_feature_variables

In [37]:
def standardscaler_preprocessing(dataset_train, dataset_test, num_col):
    scaler = StandardScaler()
   
    dataset_train[num_col] = scaler.fit_transform(dataset_train[num_col])

    dataset_test[num_col] = scaler.transform(dataset_test[num_col])
    
    return dataset_train, dataset_test

In [38]:
df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 14 columns):
Loan_ID                    164309 non-null int64
Loan_Amount_Requested      164309 non-null object
Length_Employed            156938 non-null object
Home_Owner                 138960 non-null object
Annual_Income              139207 non-null float64
Income_Verified            164309 non-null object
Purpose_Of_Loan            164309 non-null object
Debt_To_Income             164309 non-null float64
Inquiries_Last_6Mo         164309 non-null int64
Months_Since_Deliquency    75930 non-null float64
Number_Open_Accounts       164309 non-null int64
Total_Accounts             164309 non-null int64
Gender                     164309 non-null object
Interest_Rate              164309 non-null int64
dtypes: float64(3), int64(5), object(6)
memory usage: 17.6+ MB


In [40]:
df.columns

Index(['Loan_ID', 'Loan_Amount_Requested', 'Length_Employed', 'Home_Owner',
       'Annual_Income', 'Income_Verified', 'Purpose_Of_Loan', 'Debt_To_Income',
       'Inquiries_Last_6Mo', 'Months_Since_Deliquency', 'Number_Open_Accounts',
       'Total_Accounts', 'Gender', 'Interest_Rate'],
      dtype='object')

In [41]:
Counter(df.Interest_Rate)

Counter({1: 33806, 3: 59923, 2: 70580})

In [42]:
df.describe()

Unnamed: 0,Loan_ID,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Interest_Rate
count,164309.0,139207.0,164309.0,164309.0,75930.0,164309.0,164309.0,164309.0
mean,10082160.0,73331.16,17.207189,0.781698,34.229356,11.193818,25.067665,2.158951
std,47432.07,60377.5,7.845083,1.034747,21.76118,4.991813,11.583067,0.738364
min,10000000.0,4000.0,0.0,0.0,0.0,0.0,2.0,1.0
25%,10041080.0,45000.0,11.37,0.0,16.0,8.0,17.0,2.0
50%,10082160.0,63000.0,16.84,0.0,31.0,10.0,23.0,2.0
75%,10123230.0,88697.5,22.78,1.0,50.0,14.0,32.0,3.0
max,10164310.0,7500000.0,39.99,8.0,180.0,76.0,156.0,3.0


# <font color ='orange' > Step 1: Rakesh EDA

In [155]:
# get_percentage_miss_value(df)

In [214]:
print(df.shape)

df1 = df.drop(columns = ['Loan_ID', 'Months_Since_Deliquency'])

print(df1.shape)

print(df_test.shape)

df_test1 = df_test.drop(columns = ['Loan_ID', 'Months_Since_Deliquency'])

print(df_test1.shape)

(164309, 14)
(164309, 12)
(109541, 13)
(109541, 11)


In [215]:
df1['Loan_Amount_Requested'] = df1['Loan_Amount_Requested'].str.replace(',','').astype(np.float64)

df_test1['Loan_Amount_Requested'] = df_test1['Loan_Amount_Requested'].str.replace(',','').astype(np.float64)

In [216]:
#Missing value imputation for Train
df1.Annual_Income.fillna(0, inplace = True) # Another option is to impute mean to checck model performance
df1.Home_Owner.fillna('UNKNOWN', inplace = True)
df1.Length_Employed.fillna('UNKNOWN', inplace = True)


#Missing value imputation for test
df_test1.Annual_Income.fillna(0, inplace = True) # Another option is to impute mean to checck model performance
df_test1.Home_Owner.fillna('UNKNOWN', inplace = True)
df_test1.Length_Employed.fillna('UNKNOWN', inplace = True)

In [217]:
#outlier managegment
originalCount = len(df1)
df1 = df1[df1['Annual_Income'] <= 225000]
df1 = df1[df1['Number_Open_Accounts'] <= 30]
df1 = df1[df1['Total_Accounts'] <= 68]

finalCount  = len(df1)
print('Original Count -', originalCount , ', Final Count -', finalCount,  ', Dropped -', originalCount - finalCount, '(', round(finalCount/originalCount,4), '%)') 


Original Count - 164309 , Final Count - 161954 , Dropped - 2355 ( 0.9857 %)


In [218]:
scaling_columns = ["Loan_Amount_Requested", "Debt_To_Income" ,"Inquiries_Last_6Mo" , "Number_Open_Accounts", "Total_Accounts", 
                  ]
label_encoding_columns = ["Length_Employed", "Home_Owner" , "Income_Verified" , "Purpose_Of_Loan"]

In [219]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161954 entries, 0 to 164308
Data columns (total 12 columns):
Loan_Amount_Requested    161954 non-null float64
Length_Employed          161954 non-null object
Home_Owner               161954 non-null object
Annual_Income            161954 non-null float64
Income_Verified          161954 non-null object
Purpose_Of_Loan          161954 non-null object
Debt_To_Income           161954 non-null float64
Inquiries_Last_6Mo       161954 non-null int64
Number_Open_Accounts     161954 non-null int64
Total_Accounts           161954 non-null int64
Gender                   161954 non-null object
Interest_Rate            161954 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 16.1+ MB


In [220]:
scale_col_2 = ["Loan_Amount_Requested", "Annual_Income"]

In [221]:
# df1.Annual_Income   .min(), df1.Total_Accounts                   .max()

In [228]:
Counter(df1.Gender )

Counter({'Female': 46482, 'Male': 115462})

In [223]:
df20 = df1.copy()

In [201]:
Counter(df20.Home_Owner)

Counter({'Rent': 55586,
         'Mortgage': 68964,
         'UNKNOWN': 25025,
         'Own': 12320,
         'Other': 49,
         'None': 10})

In [229]:
df1 = df1[df1.Home_Owner != "None"]

In [230]:
df1.shape

(161944, 12)

In [231]:
df4 = convert_to_numerical_label_encoding(df1.copy())
df_test4 = convert_to_numerical_label_encoding(df_test1.copy())

In [188]:
# df5, df_test5  = standardscaler_preprocessing(df4.copy(), df_test4.copy(), scale_col_2)

In [232]:
X1 = df4.drop("Interest_Rate", axis = 1)

Y1 = df4.loc[:, ["Interest_Rate"]]

# <font color ='green' > Step 1: EDA

In [70]:
# df = train.copy() 
# df_test = test.copy() 

In [80]:
# print(df.shape)
# print(df_test.shape)

# del df['Loan_ID']
# del df_test['Loan_ID']

# print(df.shape)
# print(df_test.shape)

(164309, 14)
(109541, 13)
(164309, 13)
(109541, 12)


In [81]:
# df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].str.replace(',','').astype(np.float64)

# df_test['Loan_Amount_Requested'] = df_test['Loan_Amount_Requested'].str.replace(',','').astype(np.float64)

In [82]:
# Counter(df.Loan_Amount_Requested)

In [79]:
# #Missing value imputation for Train
# df.Months_Since_Deliquency.fillna(0, inplace = True)
# df.Annual_Income.fillna(0, inplace = True) # Another option is to impute mean to checck model performance
# df.Home_Owner.fillna('msg2drop', inplace = True)
# df.Length_Employed.fillna('msg2drop', inplace = True)


# #Missing value imputation for test
# df_test.Months_Since_Deliquency.fillna(0, inplace = True)
# df_test.Annual_Income.fillna(0, inplace = True) # Another option is to impute mean to checck model performance
# df_test.Home_Owner.fillna('msg2drop', inplace = True)
# df_test.Length_Employed.fillna('msg2drop', inplace = True)

In [84]:
# #outlier managegment
# originalCount = len(df)
# df = df[df['Annual_Income'] <= 225000]
# df = df[df['Months_Since_Deliquency'] <= 80]
# df = df[df['Number_Open_Accounts'] <= 30]
# df = df[df['Total_Accounts'] <= 68]

# finalCount  = len(df)
# print('Original Count -', originalCount , ', Final Count -', finalCount,  ', Dropped -', originalCount - finalCount, '(', round(finalCount/originalCount,4), '%)') 


Original Count - 164309 , Final Count - 161232 , Dropped - 3077 ( 0.9813 %)


In [131]:
df1.shape, df.shape

((161954, 12), (164309, 14))

In [136]:
df2 = df1.copy()
df_test2 = df_test1.copy()

In [137]:
dummy_col_list = ['Length_Employed', 'Home_Owner', 'Income_Verified', 'Purpose_Of_Loan', 'Gender']

df2 = pd.get_dummies(df2, columns=dummy_col_list)


df_test2 = pd.get_dummies(df_test2, columns=dummy_col_list)



In [138]:
df2.shape, df_test2.shape

((161954, 44), (109541, 43))

In [140]:
#dropping 1 column from each category dummy variable list

drop_col = ['Length_Employed_UNKNOWN','Home_Owner_UNKNOWN','Income_Verified_VERIFIED - income',
            'Purpose_Of_Loan_renewable_energy','Gender_Male']

df2 = df2.drop(drop_col, axis = 1)


df_test2 = df_test2.drop(drop_col, axis = 1)


In [141]:
df2.shape, df_test2.shape

((161954, 39), (109541, 38))

# <font color ='red' > Step 2: EDA

In [93]:
# df.head()

In [131]:
# df1["Type_of_Cab"] = df1["Type_of_Cab"].fillna("Unkown")
# df1["Confidence_Life_Style_Index"] = df1["Confidence_Life_Style_Index"].fillna("Unkown")


# df2 = DataFrameImputer().fit_transform(df.copy())
# df2_test = DataFrameImputer().fit_transform(df_test.copy())


# df3 = convert_to_numerical_label_encoding(df2.copy())
# df2.shape, df3.shape

# df3_test = convert_to_numerical_label_encoding(df2_test.copy())
# df2_test.shape, df3_test.shape

# df3 = majority_imbalanced_dataset(df3.copy(), "Interest_Rate")
# Counter(df4.Surge_Pricing_Type)


# scaling_col = ["Annual_Income", "Debt_To_Income","Inquiries_Last_6Mo", "Months_Since_Deliquency", 
#               "Number_Open_Accounts", "Total_Accounts", "Interest_Rate"]


# df4, df4_test = standardscaler_preprocessing(df3.copy(), df3_test.copy(), scaling_col)

In [117]:
# def convert_numerical_data_type(dataset):
#     col_list = list(dataset.select_dtypes(include =  ["int" , "float"] ).columns)
#     for numberical_colname in col_list:
#         dataset[numberical_colname] = pd.to_numeric(dataset[numberical_colname])  
    
#     return dataset

In [99]:
# df3 = convert_numerical_data_type(df.copy())
# df3_test = convert_numerical_data_type(df_test.copy())

In [152]:
# df3.columns

In [110]:
# df3 = df.copy()
# df3_test = df_test.copy()

In [148]:
# X = df3.loc[:,fetaaure_col]
X = df2.drop("Interest_Rate", axis = 1)

Y = df2.loc[:, ["Interest_Rate"]]

In [149]:
len(X) , len(Y), X.shape

(161954, 161954, (161954, 38))

In [237]:
model_xgb = XGBClassifier(max_depth=5, objective='multi:softmax',n_estimators=300, num_classes=3)
model_xgb.fit(X1, Y1)
# y_pred_final = model_xgb.predict(df3_test)


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=300, n_jobs=0, num_classes=3, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None)

In [233]:

%%time
model_lgb= LGBMClassifier(n_estimators= 400, objective='multi:softmax', num_classes=3)
model_lgb.fit(X1,Y1)
# y_pred_final = model_lgb.predict(df2_test)

Wall time: 7.84 s


In [115]:

# model_catg= CatBoostClassifier(depth= 3,l2_leaf_reg= 1e-25,learning_rate= 0.07,
#                                loss_function= 'MultiClass',n_estimators= 200)
# model_catg.fit(X,Y)
# y_pred_final = model_catg.predict(df3_test)

0:	learn: 1.0875212	total: 79.4ms	remaining: 15.8s
1:	learn: 1.0780889	total: 159ms	remaining: 15.7s
2:	learn: 1.0696121	total: 256ms	remaining: 16.8s
3:	learn: 1.0619977	total: 389ms	remaining: 19.1s
4:	learn: 1.0553271	total: 556ms	remaining: 21.7s
5:	learn: 1.0495591	total: 815ms	remaining: 26.4s
6:	learn: 1.0443790	total: 1.07s	remaining: 29.6s
7:	learn: 1.0395901	total: 1.31s	remaining: 31.6s
8:	learn: 1.0352500	total: 1.47s	remaining: 31.2s
9:	learn: 1.0313301	total: 1.61s	remaining: 30.6s
10:	learn: 1.0277080	total: 1.72s	remaining: 29.6s
11:	learn: 1.0244920	total: 1.85s	remaining: 29.1s
12:	learn: 1.0212182	total: 1.97s	remaining: 28.3s
13:	learn: 1.0185376	total: 2.05s	remaining: 27.2s
14:	learn: 1.0158645	total: 2.15s	remaining: 26.5s
15:	learn: 1.0134250	total: 2.27s	remaining: 26.1s
16:	learn: 1.0109586	total: 2.35s	remaining: 25.3s
17:	learn: 1.0087585	total: 2.45s	remaining: 24.8s
18:	learn: 1.0067336	total: 2.53s	remaining: 24.1s
19:	learn: 1.0049122	total: 2.64s	remain

161:	learn: 0.9411189	total: 16s	remaining: 3.76s
162:	learn: 0.9410394	total: 16.1s	remaining: 3.66s
163:	learn: 0.9409106	total: 16.2s	remaining: 3.55s
164:	learn: 0.9407986	total: 16.2s	remaining: 3.45s
165:	learn: 0.9406237	total: 16.3s	remaining: 3.34s
166:	learn: 0.9405445	total: 16.4s	remaining: 3.24s
167:	learn: 0.9403810	total: 16.5s	remaining: 3.13s
168:	learn: 0.9402552	total: 16.5s	remaining: 3.03s
169:	learn: 0.9401526	total: 16.6s	remaining: 2.93s
170:	learn: 0.9400744	total: 16.7s	remaining: 2.83s
171:	learn: 0.9399712	total: 16.7s	remaining: 2.73s
172:	learn: 0.9398818	total: 16.8s	remaining: 2.63s
173:	learn: 0.9397095	total: 16.9s	remaining: 2.52s
174:	learn: 0.9396185	total: 17s	remaining: 2.42s
175:	learn: 0.9395172	total: 17s	remaining: 2.32s
176:	learn: 0.9394023	total: 17.1s	remaining: 2.22s
177:	learn: 0.9392745	total: 17.2s	remaining: 2.13s
178:	learn: 0.9391649	total: 17.3s	remaining: 2.02s
179:	learn: 0.9390911	total: 17.4s	remaining: 1.93s
180:	learn: 0.9389

In [238]:
create_submission_file(model_xgb , "final_xgb_2", df_test4)

'File created successful'

In [116]:
submission_1 = submission.copy()

print(df_test.shape , submission.shape , len(y_pred_final))

submission_1["Interest_Rate"] = y_pred_final

print(submission_1["Interest_Rate"].values)


submission_1.to_csv('cgb_eda_1.csv', index=False)

df_submission_1 = pd.read_csv('cgb_eda_1.csv')

print(df_submission_1.shape)

df_submission_1.head(3)

(109541, 39) (109541, 2) 109541
[2. 1. 2. ... 2. 3. 2.]
(109541, 2)


Unnamed: 0,Loan_ID,Interest_Rate
0,10164310,2.0
1,10164311,1.0
2,10164312,2.0


In [67]:
def create_submission_file(model, file_name, test_data):
    
    y_pred_final = model.predict(test_data)

    submission_1 = submission.copy()
    submission_1["Interest_Rate"] = y_pred_final
    submission_1.to_csv(file_name+'.csv', index=False)
    
    return "File created successful"

In [55]:
Counter(df.Interest_Rate)

Counter({1: 33806, 3: 59923, 2: 70580})

In [68]:
df3[0:109541].shape  , df3[109541 : ].shape

((109541, 13), (54768, 13))

In [69]:
# X_feature_variables1 , y_output1 = majority_imbalanced_dataset(df3, "Interest_Rate")
# df10 = X_feature_variables1.copy()
# df10["Interest_Rate"] = y_output1
# df10.head()

In [175]:
def get_accuracy(y_train_val , y_pred_val , dataset_type = "Default"):
    
    print(" Dataset type is : ", dataset_type)
    
    print("\n Accuracy Score     : ",round(accuracy_score(y_train_val, y_pred_val), 4) * 100)
    
#     print("\n precision_accuracy : ",round(precision_score(y_train_val, y_pred_val), 4) * 100)
 
#     print("\n recall_accuracy    : ",round(recall_score(y_train_val, y_pred_val), 4) * 100)
    
#     print("\n roc_auc_accuracy   : ",round(roc_auc_score(y_train_val, y_pred_val), 4) * 100)
    
#     print("\n f1_score_accuracy  : ",round(f1_score(y_train_val, y_pred_val), 4) * 100)
    
#     print("\n explained_variance  : ",round(explained_variance_score(y_train_val, y_pred_val), 4) * 100)
    
#     tn, fp, fn, tp = confusion_matrix(y_train_val, y_pred_val).ravel()
    
#     print("\n Confusion Matrix TN : ", tn, " FP : ", fp, " FN : ", fn, " TP : ", tp)
    print(confusion_matrix(y_train_val, y_pred_val))

In [71]:
df3.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,1165,10,4,68000.0,2,0,18.37,0,34.229356,9,14,0,1
1,904,4,0,73331.159434,0,2,14.93,0,17.0,12,24,0,3
2,667,7,0,75566.4,1,2,15.88,0,34.229356,12,16,1,3
3,279,10,0,56160.0,1,2,14.34,3,34.229356,16,22,1,3
4,319,8,3,96000.0,1,2,22.17,1,34.229356,19,30,0,1


In [72]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_Amount_Requested    164309 non-null  int64  
 1   Length_Employed          164309 non-null  int64  
 2   Home_Owner               164309 non-null  int64  
 3   Annual_Income            164309 non-null  float64
 4   Income_Verified          164309 non-null  int64  
 5   Purpose_Of_Loan          164309 non-null  int64  
 6   Debt_To_Income           164309 non-null  float64
 7   Inquiries_Last_6Mo       164309 non-null  int64  
 8   Months_Since_Deliquency  164309 non-null  float64
 9   Number_Open_Accounts     164309 non-null  int64  
 10  Total_Accounts           164309 non-null  int64  
 11  Gender                   164309 non-null  int64  
 12  Interest_Rate            164309 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 16.3 MB


In [74]:
df_new = df3.copy()

In [78]:
df10.shape , df3.shape

((164309, 14), (164309, 13))

In [79]:
new_test = df_new[0:109541]

new_train = df_new[109541 : ] 

In [80]:
x1_train = new_train.iloc[:, :-1]
y1_train = new_train.iloc[:, -1]

In [81]:
x1_test = new_test.iloc[:, :-1]
y1_test = new_test.iloc[:, -1]

In [82]:
Counter(y1_test)

Counter({1: 22618, 3: 39917, 2: 47006})

In [83]:
model_final = XGBClassifier(max_depth=5, objective='multi:softmax', num_classes=3)

model_final.fit(x1_train, y1_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_classes=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [176]:
y_pred_final_new_test = model_final.predict(x1_test)
get_accuracy(y1_test , y_pred_final_new_test ,  "Test")

 Dataset type is :  Test

 Accuracy Score     :  53.26
[[ 5113 14086  3419]
 [ 3454 29854 13698]
 [  900 15640 23377]]


In [178]:
precision_score(y1_test, y_pred_final_new_test, pos_label='positive',average='micro')

0.532622488383345

In [179]:
precision_score(y1_test, y_pred_final_new_test, pos_label='negative',average='micro')

0.532622488383345

In [181]:
recall_score(y1_test, y_pred_final_new_test, pos_label='positive',average='micro')

0.532622488383345

In [182]:
recall_score(y1_test, y_pred_final_new_test, pos_label='negative',average='micro')

0.532622488383345

In [86]:
y_pred_final_new_train = model_final.predict(x1_train)
get_accuracy(y1_train , y_pred_final_new_train ,  "Train")

 Dataset type is :  Train

 Accuracy Score     :  56.93
[[ 2909  6685  1594]
 [ 1507 15892  6175]
 [  418  7209 12379]]


In [88]:
submission_new = submission.copy()

submission_new["Interest_Rate"] = y_pred_final_new_test


submission_new.to_csv('second_submission.csv', index=False)
