In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from verstack import NaNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline

In [4]:
df = pd.read_csv('loan_level_500k.csv')
df.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,METROPOLITAN_STATISTICAL_AREA,MORTGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORIGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,PROPERTY_TYPE,POSTAL_CODE,LOAN_SEQUENCE_NUMBER,LOAN_PURPOSE,ORIGINAL_LOAN_TERM,NUMBER_OF_BORROWERS,SELLER_NAME,SERVICER_NAME,PREPAID,DELINQUENT
0,669.0,200206,N,202901,,0.0,1.0,O,80.0,33.0,...,SF,26100.0,F199Q1000004,P,320,2.0,Other sellers,Other servicers,True,False
1,732.0,199904,N,202903,17140.0,0.0,1.0,O,25.0,10.0,...,SF,45200.0,F199Q1000005,N,360,1.0,Other sellers,Other servicers,True,False
2,679.0,200208,N,202902,15940.0,30.0,1.0,O,91.0,48.0,...,SF,44700.0,F199Q1000007,P,319,1.0,Other sellers,Other servicers,True,False
3,721.0,200209,N,202902,38060.0,0.0,1.0,O,39.0,13.0,...,SF,85200.0,F199Q1000013,N,318,2.0,Other sellers,Other servicers,True,False
4,618.0,200210,N,202902,10420.0,25.0,1.0,O,85.0,24.0,...,SF,44200.0,F199Q1000015,N,317,2.0,Other sellers,Other servicers,True,False


In [5]:
df.drop('LOAN_SEQUENCE_NUMBER', inplace=True, axis =1)

In [6]:
df.shape

(500137, 26)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500137 entries, 0 to 500136
Data columns (total 26 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   CREDIT_SCORE                      497426 non-null  float64
 1   FIRST_PAYMENT_DATE                500137 non-null  int64  
 2   FIRST_TIME_HOMEBUYER_FLAG         369578 non-null  object 
 3   MATURITY_DATE                     500137 non-null  int64  
 4   METROPOLITAN_STATISTICAL_AREA     429988 non-null  float64
 5   MORTGAGE_INSURANCE_PERCENTAGE     449089 non-null  float64
 6   NUMBER_OF_UNITS                   500134 non-null  float64
 7   OCCUPANCY_STATUS                  500137 non-null  object 
 8   ORIGINAL_COMBINED_LOAN_TO_VALUE   500124 non-null  float64
 9   ORIGINAL_DEBT_TO_INCOME_RATIO     485208 non-null  float64
 10  ORIGINAL_UPB                      500137 non-null  int64  
 11  ORIGINAL_LOAN_TO_VALUE            500128 non-null  f

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CREDIT_SCORE,497426.0,712.536212,54.791262,300.0,676.0,719.0,756.0,839.0
FIRST_PAYMENT_DATE,500137.0,200025.430952,109.815541,199901.0,199904.0,200005.0,200105.0,201103.0
MATURITY_DATE,500137.0,203023.195872,110.384189,202402.0,202903.0,203004.0,203104.0,204101.0
METROPOLITAN_STATISTICAL_AREA,429988.0,30777.824739,11333.401144,10180.0,19740.0,33340.0,40420.0,49740.0
MORTGAGE_INSURANCE_PERCENTAGE,449089.0,7.744532,12.046546,0.0,0.0,0.0,18.0,55.0
NUMBER_OF_UNITS,500134.0,1.02889,0.218391,1.0,1.0,1.0,1.0,4.0
ORIGINAL_COMBINED_LOAN_TO_VALUE,500124.0,76.053571,15.139986,6.0,70.0,80.0,88.0,180.0
ORIGINAL_DEBT_TO_INCOME_RATIO,485208.0,32.917541,11.1118,1.0,25.0,33.0,41.0,65.0
ORIGINAL_UPB,500137.0,136493.484785,60968.743066,8000.0,89000.0,126000.0,176000.0,578000.0
ORIGINAL_LOAN_TO_VALUE,500128.0,75.710714,14.937717,6.0,70.0,80.0,85.0,100.0


Dropping irrelevant columns (Not Available during prediction)

In [9]:
df.drop(["FIRST_PAYMENT_DATE", "MATURITY_DATE", "MORTGAGE_INSURANCE_PERCENTAGE", "ORIGINAL_UPB", "ORIGINAL_INTEREST_RATE", "PREPAYMENT_PENALTY_MORTGAGE_FLAG"], inplace=True, axis=1)

In [10]:
print(df.isnull().sum().sort_values())

PRODUCT_TYPE                            0
SERVICER_NAME                           0
SELLER_NAME                             0
ORIGINAL_LOAN_TERM                      0
LOAN_PURPOSE                            0
PROPERTY_STATE                          0
PREPAID                                 0
CHANNEL                                 0
DELINQUENT                              0
OCCUPANCY_STATUS                        0
NUMBER_OF_UNITS                         3
ORIGINAL_LOAN_TO_VALUE                  9
ORIGINAL_COMBINED_LOAN_TO_VALUE        13
POSTAL_CODE                            31
PROPERTY_TYPE                          95
NUMBER_OF_BORROWERS                   247
CREDIT_SCORE                         2711
ORIGINAL_DEBT_TO_INCOME_RATIO       14929
METROPOLITAN_STATISTICAL_AREA       70149
FIRST_TIME_HOMEBUYER_FLAG          130559
dtype: int64


In [11]:
def missing_percentage(df):
    missing = pd.DataFrame(columns=['Category', 'Percentage'])
    for col in df.columns:
        if df[col].isna().values.any():
            percentage = 100*df[col].isna().sum()/df.shape[0]
            missing = missing.append({'Category':col, 'Percentage':percentage}, ignore_index = True)
    return missing

In [12]:
missingdata = missing_percentage(df)
missingdata.sort_values('Percentage', ascending=False)

Unnamed: 0,Category,Percentage
1,FIRST_TIME_HOMEBUYER_FLAG,26.104647
2,METROPOLITAN_STATISTICAL_AREA,14.025957
5,ORIGINAL_DEBT_TO_INCOME_RATIO,2.984982
0,CREDIT_SCORE,0.542051
9,NUMBER_OF_BORROWERS,0.049386
7,PROPERTY_TYPE,0.018995
8,POSTAL_CODE,0.006198
4,ORIGINAL_COMBINED_LOAN_TO_VALUE,0.002599
6,ORIGINAL_LOAN_TO_VALUE,0.0018
3,NUMBER_OF_UNITS,0.0006


In [13]:
df.FIRST_TIME_HOMEBUYER_FLAG.value_counts()

N    320418
Y     49160
Name: FIRST_TIME_HOMEBUYER_FLAG, dtype: int64

In [14]:
categorical = list(df.select_dtypes(include=['object','bool']).columns[:-1])
numerical = df.select_dtypes(include=['int64','float64']).columns[:-1]

In [15]:
df.drop('FIRST_TIME_HOMEBUYER_FLAG',axis = 1,  inplace=True)

Seperate Data

In [16]:
X_n = df.drop('DELINQUENT', axis =1)
y_n = df['DELINQUENT']

In [17]:
imputer = NaNImputer()
X_n = imputer.impute(X_n)

NaNImputer(conservative = False, n_feats = 10,            
           fix_string_nans = True, verbose = True,                
           multiprocessing_load = 3, fill_nans_in_pure_text = True,                    
           drop_empty_cols = True, drop_nan_cols_with_constant = True                        
           feature_selection = correlation)

Dataset dimensions:
 - rows:         500137
 - columns:      18
 - mb in memory: 65.34
 - NaN cols num: 9
--------------------------

Deploy multiprocessing with 12 parallel proceses


NaNs imputation time: 0.8 minutes
--------------------------------------------------


In [18]:
X_n.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500137 entries, 0 to 500136
Data columns (total 18 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   CREDIT_SCORE                     497426 non-null  float64
 1   METROPOLITAN_STATISTICAL_AREA    429988 non-null  float64
 2   NUMBER_OF_UNITS                  500134 non-null  float64
 3   OCCUPANCY_STATUS                 500137 non-null  object 
 4   ORIGINAL_COMBINED_LOAN_TO_VALUE  500124 non-null  float64
 5   ORIGINAL_DEBT_TO_INCOME_RATIO    485208 non-null  float64
 6   ORIGINAL_LOAN_TO_VALUE           500137 non-null  float64
 7   CHANNEL                          500137 non-null  object 
 8   PRODUCT_TYPE                     500137 non-null  object 
 9   PROPERTY_STATE                   500137 non-null  object 
 10  PROPERTY_TYPE                    500137 non-null  object 
 11  POSTAL_CODE                      500137 non-null  float64
 12  LO

In [19]:
y_n.value_counts().to_frame().T

Unnamed: 0,False,True
DELINQUENT,482146,17991


Train test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_n, y_n, test_size=0.3, random_state=42)

In [21]:
X_train = pd.DataFrame(X_train, columns = X_n.columns)
X_test = pd.DataFrame(X_test, columns=X_n.columns)

Random Forest HyperParameters

In [22]:
# param_grid = {
#     'n_estimators':[200,500],
#     'max_features':['auto', 'sqrt', 'log2'],
#     'max_depth':[4,5,6,7,8],
#     'criterion':['gini', 'entropy']
# }

BaseEstimator and TransformerMixin Approach

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline as imbPipeline

In [33]:
class Impute(BaseEstimator, TransformerMixin):

    def fit(self,X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X, columns=X_n.columns)
        categorical = list(X.select_dtypes(include=['object','bool']).columns[:-1])
        numerical = X.select_dtypes(include=['int64','float64']).columns[:-1]
        for i in numerical:
            X[i] = X[i].fillna(X[i].median())
        for i in categorical:
            X[i] = X[i].fillna(X[i].mode())
        return X

In [34]:
class FeatureEncoder(BaseEstimator, TransformerMixin):

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        label_encoder = LabelEncoder()
        X['PREPAID']= label_encoder.fit_transform(X[['PREPAID']])
        X['POSTAL_CODE'] = label_encoder.fit_transform(X[['POSTAL_CODE']])
        return X

In [35]:
class Ordinal_Encoder(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self,X):
        enc = OrdinalEncoder()
        X = enc.fit_transform(X[['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE','PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']])
        return X

In [36]:
class Scaling(BaseEstimator, TransformerMixin):

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        return X_scaled

In [37]:
pipe = imbPipeline([
    ("Label Encoder", FeatureEncoder()),
    ("Ordinal Encoder", OrdinalEncoder()),
    ("Imputer", Impute()),
    ("Scaling", Scaling()),
    ("Over sample", SMOTE()),
    ("Classifier", RandomForestClassifier())
])

In [38]:
pipe.fit(X_train, y_train)

In [39]:
pipe.named_steps

{'Label Encoder': FeatureEncoder(),
 'Ordinal Encoder': OrdinalEncoder(),
 'Imputer': Impute(),
 'Scaling': Scaling(),
 'Over sample': SMOTE(),
 'Classifier': RandomForestClassifier()}

In [31]:
from sklearn import set_config
set_config(display='diagram')

In [32]:
y_pred = pipe.predict(X_test)

ValueError: Found unknown categories [333.0, 391.0, 416.0, 432.0, 448.0, 453.0, 464.0, 466.0, 480.0, 832.0, 835.0] in column 0 during transform

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

Pipeline using Column TransFormer

In [None]:
# from sklearn.base import TransformerMixin
#
#
# class MyStandardScaler(TransformerMixin):
#     def __init__(self, *args, **kwargs):
#         self.scaler = StandardScaler(*args,**kwargs)
#     def fit(self,x,y=0):
#         self.scaler.fit(x)
#         return self
#     def transform(self,x,y=0):
#         return self.scaler.transform(x)
#
#
#

In [None]:
# Ordinal_features = ['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE', 'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']

In [None]:
# trf1 = ColumnTransformer([
#     ('Ordinal Encoder', OrdinalEncoder(),Ordinal_features)
# ],remainder='passthrough')

In [None]:
# trf2 = ColumnTransformer([
#     ('NaN Imputer',NaNImputer()),
#     ('Scalar', MyStandardScaler()),
#     ('Random Over Sample', RandomOverSampler())
# ])

In [None]:
# trf3 = RandomForestClassifier()

In [None]:
# pipe = make_pipeline(trf1, trf2, trf3)

In [None]:
# pipe.fit(X_train, y_train)

Pipeline

In [None]:
# Label_Enc = Pipeline(steps=[
#     ('Label Imputer', LabelEncoder())
# ])
#
# Ord_Enc = Pipeline(steps=[
#     ('Ordinal Encoder', OrdinalEncoder())
# ])
# All_transformer = Pipeline(steps=[
#     ('NaN Imputer',NaNImputer()),
#     ('Scalar', StandardScaler()),
#     ('Smote', RandomOverSampler())
# ])

In [None]:
# Label_features = ['DELINQUENT','PREPAID','POSTAL_CODE']
# Ordinal_features = ['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE', 'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']
# All_features = X.select_dtypes(include=['int64', 'float64', 'object']).columns

In [None]:
# preprocessor = ColumnTransformer([
#         ('Label', Label_Enc, Label_features),
#         ('Ordinal', Ord_Enc, Ordinal_features),
#         ('Data Processing', All_transformer, All_features)
# ])

In [None]:
# trf1 = RandomForestClassifier()

In [None]:
# pipe = make_pipeline(preprocessor,trf1)

In [None]:
# pipe.fit(X_train, y_train)

In [None]:
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('Classifier', RandomForestClassifier())
# ])

In [None]:
# rf_model = pipeline.fit(X_train, y_train)
# print(rf_model)