In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from verstack import NaNImputer
from sklearn.preprocessing import LabelEncoder
from imblearn.pipeline import Pipeline as imbPipeline

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

In [4]:
class Pipeline_class():
    """Collection of the preprocessing and modelling steps used by this notebook.

    Each method performs one stage (load, column pruning, imputation, encoding,
    scaling, oversampling, training) and is called cell by cell further down.
    The class holds no fitted state: every encoder/scaler is created and fitted
    inside the method that uses it.
    """

    # Columns removed by ``dropper``.
    DROPPED_COLUMNS = [
        "FIRST_PAYMENT_DATE", "MATURITY_DATE", "MORTGAGE_INSURANCE_PERCENTAGE",
        "ORIGINAL_UPB", "ORIGINAL_INTEREST_RATE", "PREPAYMENT_PENALTY_MORTGAGE_FLAG",
        "NUMBER_OF_BORROWERS", "LOAN_SEQUENCE_NUMBER", "FIRST_TIME_HOMEBUYER_FLAG",
    ]

    # Columns label-encoded by ``feature_encoder``.
    LABEL_ENCODED_COLUMNS = ['PREPAID', 'POSTAL_CODE']

    # Columns expanded into indicator columns by ``onehot_encoder``.
    ONEHOT_COLUMNS = [
        'OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE',
        'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME',
    ]

    def Load(self, path='loan_level_500k.csv'):
        """Read the loan-level CSV at ``path`` and return it as a DataFrame."""
        return pd.read_csv(path)

    def dropper(self, df):
        """Drop the ``DROPPED_COLUMNS`` from ``df`` in place.

        Returns None; the caller's frame is modified directly. Raises KeyError
        if any of the columns is missing (e.g. when called twice on one frame).
        """
        df.drop(columns=self.DROPPED_COLUMNS, inplace=True)

    def explainData(self, df):
        """Print the shape, ``info()`` and transposed ``describe()`` of ``df``."""
        print("\n\nShape of Data:")
        print(df.shape)
        print("\n\nInformation of Data:\n")
        df.info()
        print("\n\nDescription of Data:\n")
        print(df.describe().T)

    def missing_percentage(self, df):
        """Return the percentage of missing values per column.

        Only columns containing at least one NaN are listed. The result has the
        columns ``Category`` (column name) and ``Percentage`` (0-100).
        """
        # Collect the rows first and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        rows = [
            {'Category': col, 'Percentage': 100 * df[col].isna().sum() / df.shape[0]}
            for col in df.columns
            if df[col].isna().any()
        ]
        return pd.DataFrame(rows, columns=['Category', 'Percentage'])

    def seperate_data(self, df):
        """Split ``df`` into features ``X`` and the ``DELINQUENT`` target ``y``.

        The target is multiplied by 1, which turns boolean values into 0/1.
        """
        X = df.drop('DELINQUENT', axis=1)
        y = df['DELINQUENT'] * 1
        return X, y

    def verstack_imputer(self, X):
        """Impute missing values with verstack's ``NaNImputer`` (default settings)."""
        imputer = NaNImputer()
        return imputer.impute(X)

    def train_test_sp(self, X, y):
        """Split into 70% train / 30% test with a fixed ``random_state`` of 42."""
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        return X_train, X_test, y_train, y_test

    def Impute(self, X):
        """Fill NaNs with the median (numeric) or the mode (object/bool columns).

        ``X`` is modified in place and also returned.
        """
        # NOTE(review): the [:-1] slices skip the LAST column of each dtype group.
        # Presumably this keeps a trailing target column untouched, but it also
        # leaves the last numeric column unimputed — confirm this is intended.
        categorical = list(X.select_dtypes(include=['object', 'bool']).columns[:-1])
        numerical = X.select_dtypes(include=['int64', 'float64']).columns[:-1]
        for col in numerical:
            X[col] = X[col].fillna(X[col].median())
        for col in categorical:
            modes = X[col].mode()
            # An all-NaN column has no mode; leave it unchanged instead of raising.
            if not modes.empty:
                X[col] = X[col].fillna(modes.iloc[0])
        return X

    def feature_encoder(self, X):
        """Label-encode ``PREPAID`` and ``POSTAL_CODE`` in place and return ``X``.

        The encoder is fitted on the frame passed in, so the integer codes are
        only meaningful relative to that frame.
        """
        label_encoder = LabelEncoder()
        for col in self.LABEL_ENCODED_COLUMNS:
            X[col] = label_encoder.fit_transform(X[col])
        return X

    def onehot_encoder(self, X):
        """One-hot encode the ``ONEHOT_COLUMNS`` and return a new frame.

        The first level of every encoded column is dropped (``drop_first=True``).
        """
        # The column list must be passed as ``columns=``: the second positional
        # argument of get_dummies is ``prefix``, which only worked while the frame
        # happened to contain exactly these eight object columns in this order.
        return pd.get_dummies(X, columns=self.ONEHOT_COLUMNS, drop_first=True)

    def scaling(self, X):
        """Fit a ``StandardScaler(with_mean=False)`` on ``X`` and return the scaled values."""
        scaler = StandardScaler(with_mean=False)
        return scaler.fit_transform(X)

    def OverSample_rand(self, X, y, random_state=42):
        """Randomly oversample the minority class.

        ``random_state`` seeds the sampler so the resampled set is reproducible;
        pass ``None`` for unseeded sampling.
        """
        sam = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
        X_resampled, y_resampled = sam.fit_resample(X, y)
        return X_resampled, y_resampled

    def classifier(self, X_train, y_train, X_test, y_test, random_state=42):
        """Train a class-weighted random forest, print its test report, return the model.

        ``random_state`` seeds the forest so the fitted model is reproducible;
        pass ``None` ` for an unseeded forest.
        """
        rfc2 = RandomForestClassifier(class_weight='balanced', random_state=random_state)
        rfc2.fit(X_train, y_train)
        y_pred_rfc2 = rfc2.predict(X_test)
        # classification_report expects (y_true, y_pred); the reversed order
        # swaps precision and recall in the printed table.
        print(classification_report(y_test, y_pred_rfc2))
        return rfc2

In [5]:
# Create the pipeline helper and read the loan-level CSV into `df`.
p1 = Pipeline_class()
df = p1.Load()

In [6]:
# Removes the unused columns from `df` in place (dropper returns None, so there
# is nothing to assign). Re-running this cell on the same `df` raises KeyError
# because the columns are already gone — reload `df` first.
p1.dropper(df)

In [7]:
# Print the shape, info() and describe() summaries of the trimmed frame.
p1.explainData(df)



Shape of Data:
(500137, 18)


Information of Data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500137 entries, 0 to 500136
Data columns (total 18 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   CREDIT_SCORE                     497426 non-null  float64
 1   METROPOLITAN_STATISTICAL_AREA    429988 non-null  float64
 2   NUMBER_OF_UNITS                  500134 non-null  float64
 3   OCCUPANCY_STATUS                 500137 non-null  object 
 4   ORIGINAL_COMBINED_LOAN_TO_VALUE  500124 non-null  float64
 5   ORIGINAL_DEBT_TO_INCOME_RATIO    485208 non-null  float64
 6   ORIGINAL_LOAN_TO_VALUE           500128 non-null  float64
 7   CHANNEL                          500137 non-null  object 
 8   PRODUCT_TYPE                     500137 non-null  object 
 9   PROPERTY_STATE                   500137 non-null  object 
 10  PROPERTY_TYPE                    500042 non-null  object 
 11  POSTAL_CODE

In [8]:
# Impute missing values with verstack's NaNImputer; `df` is rebound to the result.
df = p1.verstack_imputer(df)

NaNImputer(conservative = False, n_feats = 10,            
           fix_string_nans = True, verbose = True,                
           multiprocessing_load = 3, fill_nans_in_pure_text = True,                    
           drop_empty_cols = True, drop_nan_cols_with_constant = True                        
           feature_selection = correlation)

Dataset dimensions:
 - rows:         500137
 - columns:      18
 - mb in memory: 62.01
 - NaN cols num: 8
--------------------------

Deploy multiprocessing with 12 parallel proceses


NaNs imputation time: 0.78 minutes
--------------------------------------------------


In [19]:
# Median/mode fill applied to the frame returned by the verstack imputer.
# NOTE(review): the execution count jumps from 8 to 19 here, so cells were run
# out of order or deleted — confirm the notebook survives Restart & Run All.
# NOTE(review): presumably few or no NaNs remain after the previous step; verify
# whether this second imputation is still needed.
df = p1.Impute(df)

In [20]:
# Split off the DELINQUENT column as the target `y`; everything else becomes `X`.
X,y = p1.seperate_data(df)

In [21]:
# 70/30 train/test split (test_size and random_state are fixed inside train_test_sp).
X_train, X_test, y_train, y_test = p1.train_test_sp(X,y)

In [22]:
# Label-encode PREPAID and POSTAL_CODE in each split.
# NOTE(review): feature_encoder fits a fresh LabelEncoder on whatever frame it
# receives, so the train and test calls can assign different integers to the
# same POSTAL_CODE value. Encoding before the split (or reusing one fitted
# encoder) would keep the codes consistent.
X_train = p1.feature_encoder(X_train)
X_test = p1.feature_encoder(X_test)

In [23]:
# One-hot encode both splits, then align the test frame to the training columns.
# Each split is encoded independently, so a category that is absent from one of
# them would otherwise give the two matrices different widths or column orders
# and break (or silently misalign) prediction. Indicator columns missing from
# the test split are filled with 0; columns unseen during training are dropped.
X_train_new = p1.onehot_encoder(X_train)
X_test_new = p1.onehot_encoder(X_test).reindex(columns=X_train_new.columns, fill_value=0)

In [24]:
# Fit the scaler on the training features only and reuse it for the test set.
# Calling p1.scaling on each split refits a new scaler every time, which scales
# the test data with its own statistics instead of the ones the model is
# trained on. with_mean=False matches the setting used by p1.scaling.
scaler = StandardScaler(with_mean=False).fit(X_train_new)
X_train = scaler.transform(X_train_new)
X_test = scaler.transform(X_test_new)

In [25]:
# Oversample the minority class in the training data only; the test set is left untouched.
X_re, y_re = p1.OverSample_rand(X_train, y_train)

In [26]:
# Wrap the resampled matrix in a DataFrame again, restoring the one-hot column
# names (the scaling step returns the values without column labels).
X_re = pd.DataFrame(X_re, columns=X_train_new.columns)

In [None]:
# Train the random forest on the oversampled training data and print the
# classification report for the held-out test set; the fitted model is kept in `model`.
model = p1.classifier(X_re, y_re, X_test, y_test )