# Credit Risk Mortgage Loans
Use machine learning to assess credit risk on home loans and the owner's ability to repay the loan.

For each SK_ID_CURR in the test set, the model predicts a probability for the TARGET variable. The final prediction file should contain a header and have the following format:
SK_ID_CURR,TARGET <br />
100001,0.1 <br />
100005,0.9 <br />
100013,0.2 <br />

## View Data
Dataset csv file can be found here: https://www.kaggle.com/c/home-credit-default-risk  <br> 
The training set has 307511 rows and 122 columns. The columns and the first five rows are printed below.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from statistics import mean
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import make_classification
from sklearn import ensemble
#sample=r'/kaggle/input/home-credit-default-risk/sample_submission.csv'
#bureau=r'/kaggle/input/home-credit-default-risk/bureau_balance.csv'
#cash=r'/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv'
#info='/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv'
#app=r'/kaggle/input/home-credit-default-risk/previous_application.csv'
#cc=r'/kaggle/input/home-credit-default-risk/credit_card_balance.csv'
#install=r'/kaggle/input/home-credit-default-risk/installments_payments.csv'
# Locations of the Kaggle "Home Credit Default Risk" CSV files used below.
train = r'/kaggle/input/home-credit-default-risk/application_train.csv'
buraeu = r'/kaggle/input/home-credit-default-risk/bureau.csv'

# Training applications (includes the TARGET column); shape (307511, 122).
data = pd.read_csv(train)

# Kaggle test-set applications; `test` ends up bound to the loaded DataFrame.
test = pd.read_csv(r'/kaggle/input/home-credit-default-risk/application_test.csv')

# Preview the first five training rows.
print(data.head())

## Find missing values
A column with too many missing values will be removed. Since there are 60 numeric columns with missing data, we need to examine the Bureau data to find feature importance and decide which columns are most worth keeping.

In [3]:
# Names of the numeric dtypes used to pick the numeric columns of `data`.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Keep only the columns whose dtype is listed above; all other columns are left out.
df = data.select_dtypes(include=numerics) # (307511, 106)

#search for columns with missing values:
def findNA(frame=None, top=60):
    """Print the percentage of missing values per column, largest first.

    The percentage is computed against the number of rows of the frame being
    inspected, so it stays correct after rows have been dropped from it, and
    it is scaled by 100 so the values match the "percent" heading.

    Args:
        frame: DataFrame to inspect. Defaults to the module-level ``df``.
        top: How many columns to print. Defaults to 60.
    """
    if frame is None:
        frame = df
    # isnull().mean() is the per-column fraction of missing cells.
    missing_pct = frame.isnull().mean().mul(100).sort_values(ascending=False)
    print("Missing data by column as a percent:")
    print(missing_pct.head(top))
#findNA() 

## Remove columns and Fill Missing Values

In [4]:
# Keep only the ROWS that have at most 20 missing values: sum(axis=1) counts
# the missing cells of each row. NOTE(review): the section heading talks about
# removing columns, but no column is dropped here; confirm which was intended.
df = df[df.isnull().sum(axis=1) <= 20] # drops rows with more than 20 missing values
# Fill every remaining missing value with the mean of its column.
df= df.fillna(df.mean())

## Heat Map Correlations and Multicollinearity
There is no major multicollinearity. In fact, there are not many correlated variables. The following heatmap is set for correlations above .05 because there are so few variables that are highly correlated.

In [5]:
def printHeat():
    """Draw a correlation heatmap of the columns most related to TARGET.

    Works on the module-level ``df``. Columns whose absolute correlation with
    TARGET is above 0.05 are shown in an annotated 10x10-inch heatmap, then
    the ten largest (signed) correlations with TARGET are printed.
    """
    target_col = 'TARGET'
    corr_matrix = df.corr()
    target_corr = corr_matrix[target_col]

    # Select the columns whose |correlation| with the target exceeds 0.05.
    selected = corr_matrix.index[target_corr.abs() > 0.05]

    plt.figure(figsize=(10, 10))
    heat = sns.heatmap(
        df[selected].corr(),
        annot=True,
        cmap="RdYlGn",
    )

    # Ranked by signed value, highest first.
    strongest = target_corr.sort_values(ascending=False).head(10)

    print(heat)
    print("Top 10 Correlations:\n", strongest) # top ten correlations
#printHeat()

## View Bureau Data for Feature Importance
The Bureau data has [1716428 rows x 17 columns]

# Load the bureau table; [1716428 rows x 17 columns].
bData = pd.read_csv(r'/kaggle/input/home-credit-default-risk/bureau.csv')

# Restrict the bureau table to its numeric columns and preview five rows.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
bDF = bData.select_dtypes(include=numerics)
print(bDF.head())

### Split Data
Split the data set into training data and test data. TARGET is always y because it is the dependent variable the model predicts; every other column is a feature in X.

In [6]:
# The label is the TARGET column; the features are every other column of df.
y = df['TARGET']
X = df.drop(columns='TARGET')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

## Gradient Booster and Feature Importance

In [7]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor

# Hyper-parameters passed to the gradient-boosting regressor.
params = {
    "n_estimators": 5,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
}

reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Evaluate on the held-out split. The built-in round() is used because
# r2_score may return a plain Python float, which has no .round() method.
y_pred = reg.predict(X_test)
gbr_r2 = round(r2_score(y_test, y_pred), 4)
print("Gradient boosting regression r2: ", gbr_r2)

# Reuse y_pred instead of predicting a second time.
mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

#FEATURE IMPORTANCE:
num = 10  # How many features to plot
cols = X.columns
# Rank over ALL features. Slicing feature_importances_ to the first `num`
# entries before sorting would only ever show the first `num` columns of X,
# regardless of how important they are.
feature_importance = reg.feature_importances_
# argsort is ascending, so the last `num` indices are the most important
# features; barh then draws the largest bar at the top.
sorted_idx = np.argsort(feature_importance)[-num:]
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(cols)[sorted_idx])
plt.title("Feature Importance (MDI)")

## Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, r2_score

lrModel = LinearRegression()
lrModel.fit(X_train, y_train)

#Generate Predictions:
predictions = lrModel.predict(X_test)

# Distribution of the residuals on the held-out split.
plt.hist(y_test - predictions)

#Performance measurement:
# These two metrics used to be computed and thrown away; keep and report them.
lr_mae = metrics.mean_absolute_error(y_test, predictions)
lr_rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print("Linear regression MAE: ", round(lr_mae, 4))
print("Linear regression RMSE: ", round(lr_rmse, 4))

# Raw linear-regression outputs, used as the score for TARGET == 1. They are
# not calibrated probabilities and are not constrained to the [0, 1] range.
# Same values as `predictions`, so the model is not run a second time.
y_pred_proba = predictions
# Built-in round() works whether r2_score returns a NumPy scalar or a float.
r2 = round(r2_score(y_test, y_pred_proba), 4)
print("Linear regression r2 score: ", r2)

#CROSS VALIDATE TEST RESULTS:
# LinearRegression.score() returns R^2, not classification accuracy, so the
# label says so.
lr_score = round(lrModel.score(X_test, y_test), 4)  # train test
print("R2 (test split): ", lr_score)
lr_cv = cross_validate(lrModel, X, y, cv = 5, scoring= 'r2')
lr_cvMean=lr_cv['test_score'].mean().round(4)
print(lr_cvMean, " linear regression cross validate mean")

#RIDGE REGRESSION:
ridge = Ridge(alpha = .5)  # baseline regularisation strength
ridge.fit(X_train, y_train)
ridge_cv = cross_validate(ridge, X, y, cv = 5, scoring = 'r2')
ridge_cvMean=ridge_cv['test_score'].mean().round(4)
print ("Ridge Regression R2: ", ridge_cvMean)

#LASSO REGRESSION:
# Lasso's `normalize` argument was removed in scikit-learn 1.2 and now raises
# TypeError. Features are standardised in a pipeline instead, which also keeps
# the scaling inside each cross-validation fold. NOTE: this is the recommended
# replacement but is not numerically identical to the legacy normalize=True.
lasso = make_pipeline(StandardScaler(), Lasso(alpha = .1))
lasso.fit(X_train, y_train)
lasso_cv = cross_validate(lasso, X, y, cv = 5, scoring = 'r2')
lasso_cvMean=lasso_cv['test_score'].mean().round(4)
print ("Lasso Regression R2: ", lasso_cvMean)