In [1]:
#function for data reading

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def read_data(fileid): 
    stringvariables=["LOAN_ID","CHANNEL", "FIRST_TIME_HOME_BUYER_IND",
                  "LOAN_PURPOSE", "PROPERTY_TYPE", "OCCUPANCY_STATUS",
                  "PROPERTY_STATE", "ZIP_CODE_SHORT"]
    floatvariables=["ORIGINAL_INTEREST_RATE", "CURRENT_INTEREST_RATE"]
    datevariables=["MONTHLY_REPORTING_PERIOD",
                   "ORIGINATION_DATE",
                   "FIRST_PAYMENT_DATE",
                   "MATURITY_DATE"]
    intvariables=["ORIGINAL_UPB", "ORIGINAL_LOAN_TERM", 
                  "REM_MONTHS_MATURITY", "REM_MONTHS_LEGAL_MATURITY", 
                  "LTV", "CLTV", "NUMBER_OF_BORROWERS","DTI", "B_CREDIT_SCORE_O",
                  "NUMBER_OF_UNITS", "MSA", "LOAN_AGE", "FORECLOSURE", "NMONTHS",
                  "CB_CREDIT_SCORE_O", "MORTGAGE_INSURANCE_PERCENTAGE"]
    objectvariables=stringvariables + datevariables
    dtypes={}
    for s in objectvariables:
        dtypes[s]="object"
    for s in floatvariables:
        dtypes[s]="float"
    na_strings=["".join([" " for j in range(m)]) for m in range(10)]
    df=pd.read_csv(fileid,dtype=dtypes,na_values=na_strings)
    for datevar in datevariables:
        df[datevar]=pd.to_datetime(df[datevar],format="%m%Y")
    return(df)


In [2]:
#data reading

data = pd.DataFrame(read_data('trainQFXD.csv'))
data_test = pd.DataFrame(read_data('test_predsQFXD.csv'))

In [3]:
#data cleaning

#delete two columns that contain too much missing data
del data['CB_CREDIT_SCORE_O']
del data['MORTGAGE_INSURANCE_PERCENTAGE']
del data_test['CB_CREDIT_SCORE_O']
del data_test['MORTGAGE_INSURANCE_PERCENTAGE']

#delete columns that provide no information for the prediction
del data['ZIP_CODE_SHORT']

del data['MONTHLY_REPORTING_PERIOD']
del data['ORIGINATION_DATE']
del data['FIRST_PAYMENT_DATE']
del data['MATURITY_DATE']

del data_test['ZIP_CODE_SHORT']

del data_test['MONTHLY_REPORTING_PERIOD']
del data_test['ORIGINATION_DATE']
del data_test['FIRST_PAYMENT_DATE']
del data_test['MATURITY_DATE']

#fill null values by mean value or char value
data['CURRENT_INTEREST_RATE'] = data['CURRENT_INTEREST_RATE'].fillna(data['CURRENT_INTEREST_RATE'].mean())
data['LOAN_AGE'] = data['LOAN_AGE'].fillna(data['LOAN_AGE'].mean())
data['REM_MONTHS_LEGAL_MATURITY'] = data['REM_MONTHS_LEGAL_MATURITY'].fillna(data['REM_MONTHS_LEGAL_MATURITY'].mean())
data['REM_MONTHS_MATURITY'] = data['REM_MONTHS_MATURITY'].fillna(data['REM_MONTHS_MATURITY'].mean())
data['CLTV'] = data['CLTV'].fillna(data['CLTV'].mean())
data['NUMBER_OF_BORROWERS'] = data['NUMBER_OF_BORROWERS'].fillna(data['NUMBER_OF_BORROWERS'].mean())
data['DTI'] = data['DTI'].fillna(data['DTI'].mean())
data['LTV'] = data['LTV'].fillna(data['LTV'].mean())
data['B_CREDIT_SCORE_O'] = data['B_CREDIT_SCORE_O'].fillna(data['B_CREDIT_SCORE_O'].mean())
data['FIRST_TIME_HOME_BUYER_IND'] = data['FIRST_TIME_HOME_BUYER_IND'].fillna('N')

data_test['CURRENT_INTEREST_RATE'] = data_test['CURRENT_INTEREST_RATE'].fillna(data_test['CURRENT_INTEREST_RATE'].mean())
data_test['LOAN_AGE'] = data_test['LOAN_AGE'].fillna(data_test['LOAN_AGE'].mean())
data_test['REM_MONTHS_LEGAL_MATURITY'] = data_test['REM_MONTHS_LEGAL_MATURITY'].fillna(data_test['REM_MONTHS_LEGAL_MATURITY'].mean())
data_test['REM_MONTHS_MATURITY'] = data_test['REM_MONTHS_MATURITY'].fillna(data_test['REM_MONTHS_MATURITY'].mean())
data_test['CLTV'] = data_test['CLTV'].fillna(data_test['CLTV'].mean())
data_test['NUMBER_OF_BORROWERS'] = data_test['NUMBER_OF_BORROWERS'].fillna(data_test['NUMBER_OF_BORROWERS'].mean())
data_test['DTI'] = data_test['DTI'].fillna(data_test['DTI'].mean())
data_test['LTV'] = data_test['LTV'].fillna(data_test['LTV'].mean())
data_test['B_CREDIT_SCORE_O'] = data_test['B_CREDIT_SCORE_O'].fillna(data_test['B_CREDIT_SCORE_O'].mean())
data_test['FIRST_TIME_HOME_BUYER_IND'] = data_test['FIRST_TIME_HOME_BUYER_IND'].fillna('N')

#fixing 0 value data by replacing 0 with mean value
data['MSA']=data['MSA'].replace(0,data['MSA'].mean())
data_test['MSA']=data_test['MSA'].replace(0,data_test['MSA'].mean())

data['LOAN_AGE']=data['LOAN_AGE'].replace(-1,data['LOAN_AGE'].mean())
data_test['LOAN_AGE']=data_test['LOAN_AGE'].replace(-1,data_test['LOAN_AGE'].mean())

In [4]:
#feature engineering

#feature extraction
numeric_features = data.dtypes[data.dtypes == 'float64'].index
object_features = data.dtypes[data.dtypes == 'object'].index
numeric_features_test = data_test.dtypes[data_test.dtypes == 'float64'].index
object_features_test = data_test.dtypes[data_test.dtypes == 'object'].index

#handling categorical/object data - label encoding
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
categorical_features = ['CHANNEL',
 'FIRST_TIME_HOME_BUYER_IND',
 'LOAN_PURPOSE',
 'PROPERTY_TYPE',
 'OCCUPANCY_STATUS',
 'PROPERTY_STATE']
categorical_features_test = ['CHANNEL',
 'FIRST_TIME_HOME_BUYER_IND',
 'LOAN_PURPOSE',
 'PROPERTY_TYPE',
 'OCCUPANCY_STATUS',
 'PROPERTY_STATE']
for col in categorical_features:
    data[col] = LE.fit_transform(data[col])
for col in categorical_features_test:
    data_test[col] = LE.fit_transform(data_test[col])
    
#handling numeric data - normalization
#data[numeric_features] = data[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
#data_test[numeric_features_test] = data_test[numeric_features_test].apply(lambda x: (x - x.mean()) / (x.std()))

In [5]:
#set splitting - train dataset NMONTHS prediction

from sklearn.model_selection import train_test_split
X = data.drop(columns = ['FORECLOSURE', 'NMONTHS', 'LOAN_ID'], axis = 1).values
y = data['NMONTHS'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

In [6]:
#model creation: linear regression - NMONTHS

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
LR = LinearRegression()
LR.fit(X_train, y_train)
y_prediction = LR.predict(X_test)

In [7]:
#model creation: xgboost regression - NMONTHS

import xgboost as xgb
XGB = xgb.XGBRegressor()
XGB.fit(X_train, y_train)
y_prediction_XGB = XGB.predict(X_test)

In [8]:
#model evaluation: linear regression and xgboost for NMONTHS

from sklearn.metrics import mean_squared_error

def mae(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(np.abs(y_true - predictions))

def root_mean_squared_error(actual, predictions):
    return np.sqrt(mean_squared_error(actual, predictions))

def adjusted_r2_score(actual, predictions, num_pred, num_samples):
    n = num_samples
    k = num_pred
    r2 = r2_score(actual, predictions)
    adjusted_r2 = 1 - ((1-r2) * ((n-1)/(n-k-1)))
    return adjusted_r2

print("NMONTHS linear regression model's r2_score:", LR.score(X_test, y_test))
print("NMONTHS linear regression model's MAE(Mean Absolute Error):", mae(y_test, y_prediction))
print("NMONTHS linear regression model's MSE(Mean Square Error):",mean_squared_error(y_test, y_prediction))
print("NMONTHS linear regression model's RMSE(Root Mean Square Error):",root_mean_squared_error(y_test, y_prediction))

print("NMONTHS xgboost regression model's r2_score:", XGB.score(X_test, y_test))
print("NMONTHS xgboost regression model's MAE(Mean Absolute Error):", mae(y_test, y_prediction_XGB))
print("NMONTHS xgboost regression model's MSE(Mean Square Error):",mean_squared_error(y_test, y_prediction_XGB))
print("NMONTHS xgboost regression model's RMSE(Root Mean Square Error):",root_mean_squared_error(y_test, y_prediction_XGB))

NMONTHS linear regression model's r2_score: 0.15945008647805037
NMONTHS linear regression model's MAE(Mean Absolute Error): 36.447790677018936
NMONTHS linear regression model's MSE(Mean Square Error): 2297.193562369628
NMONTHS linear regression model's RMSE(Root Mean Square Error): 47.92904716734548
NMONTHS xgboost regression model's r2_score: 0.20701248717965381
NMONTHS xgboost regression model's MAE(Mean Absolute Error): 34.26425335583007
NMONTHS xgboost regression model's MSE(Mean Square Error): 2167.2071820906
NMONTHS xgboost regression model's RMSE(Root Mean Square Error): 46.55327251752126


In [9]:
#set splitting - train dataset FORECLOSURE prediction

logistic_features = ['ORIGINAL_INTEREST_RATE', 'CURRENT_INTEREST_RATE', 'ORIGINAL_LOAN_TERM', 'REM_MONTHS_LEGAL_MATURITY','REM_MONTHS_MATURITY','LOAN_AGE', 'NUMBER_OF_BORROWERS', 'DTI', 'LTV', 'CLTV', 'B_CREDIT_SCORE_O']
X_F = data[logistic_features].values
y_F = data['FORECLOSURE'].values
X_train_F, X_test_F, y_train_F, y_test_F = train_test_split(X_F, y_F, test_size = 0.2, random_state = 10)

In [10]:
#model creation: logistic regression - FORECLOSURE

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=20)
logreg.fit(X_train_F, y_train_F)
y_prediction_F_prob = logreg.predict_proba(X_test_F)
threshold=data['FORECLOSURE'].sum()/len(data)
y_prediction_F = pd.Series(y_prediction_F_prob[:,1] > threshold * 2.85)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
#model evaluation: logistic regression for FORECLOSURE

print("FORECLOSURE logistic regression model's r2_score:", logreg.score(X_test_F, y_test_F))
print("FORECLOSURE logistic regression model's MAE(Mean Absolute Error):", mae(y_test_F, logreg.predict(X_test_F)))
print("FORECLOSURE logistic regression model's MSE(Mean Square Error):",mean_squared_error(y_test_F, logreg.predict(X_test_F)))
print("FORECLOSURE logistic regression model's RMSE(Root Mean Square Error):",root_mean_squared_error(y_test_F, logreg.predict(X_test_F)))

FORECLOSURE logistic regression model's r2_score: 0.97716
FORECLOSURE logistic regression model's MAE(Mean Absolute Error): 0.02284
FORECLOSURE logistic regression model's MSE(Mean Square Error): 0.02284
FORECLOSURE logistic regression model's RMSE(Root Mean Square Error): 0.1511290838985005


In [12]:
#save data to local machine as .csv file

save_y_test = pd.Series(XGB.predict(data_test.drop(columns = ['LOAN_ID'], axis = 1)), name='NMONTHS').tolist()
save_NMONTHS = []
for item in save_y_test:
    save_NMONTHS.append(round(item, 3))
save_ID = pd.Series(data_test['LOAN_ID'] ,name='ID').tolist()
save_data = pd.DataFrame(
    {'LOAN_ID': save_ID,
     'NMONTHS': save_NMONTHS
    })
save_data.to_csv('/Users/jiangxingnandecomputer/Desktop/NMONTHS_pred.csv', index=False)

y_prediction_F_prob = logreg.predict_proba(data_test[logistic_features].values)
y_prediction_F = pd.Series(y_prediction_F_prob[:,1] > 0.11644).tolist()
y_prediction_prob_F = pd.Series(y_prediction_F_prob[:,1]).tolist()
answer_list = []
for i in y_prediction_F:
    answer_list.append(int(i))
save_data_F = pd.DataFrame(
    {'LOAN_ID': save_ID,
     'FORECLOSURE': answer_list,
     'NMONTHS': save_NMONTHS
    })
save_data_F.to_csv('/Users/jiangxingnandecomputer/Desktop/FORECLOSURE_pred.csv', index=False)