# **Team A Submission**:

### Importing Libraries :

### Loading Data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # for visualization
import plotly.express as px # for visualization
import matplotlib.pyplot as plt # for visualization
%matplotlib inline

# To display all the columns of dataframe
pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Bondora_preprocessed.csv')

## Feature Engineering:
<ul>
<li><a href="#intro">Handling Null values</a></li>
<li><a href="#wrangling">Handling outliers</a></li>
<li><a href="#wrangling">Feature Selection</a></li>
<li><a href="#eda">Categorical Features Encoding</a></li>
<li><a href="#conclusions">Feature scaling</a></li>
<li><a href="#conclusions">Feature extraction and Dimensionality reduction using PCA</a></li>
<li><a href="#conclusions">Spiliting Data into train and test sets</a></li>
</ul>

### 2. Handling outliers:

In [3]:
# Let's compute IQR for each numerical feature
df_IQR = df[df.select_dtypes([float, int]).columns].quantile(.75) - df[df.select_dtypes([float, int]).columns].quantile(.25)

In [4]:
# Let's compute maximum and minimum limits
df_Max =  df[df.select_dtypes([float, int]).columns].quantile(.75) + (1.5*df_IQR)
df_Min =  df[df.select_dtypes([float, int]).columns].quantile(.25) - (1.5*df_IQR)

###### We can Handling the outliers of each numerical feature using "Winsorizing" method:

In [5]:
# Loop for replacing outliers above upper bound with the upper bound value:
for column in df.select_dtypes([float, int]).columns :
   
    col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
    col_Max =  df[column].quantile(.75) + (1.5*col_IQR)
    df[column][df[column] > col_Max] =  col_Max

In [6]:
# Loop for replacing outliers under lower bound with the lower bound value:
for column in df.select_dtypes([float, int]).columns :
    col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
    col_Min =  df[column].quantile(.25) - (1.5*col_IQR)
    df[column][df[column] < col_Min] =  col_Min

### 3. Feature Selection

In [7]:
# A function to select highly correlated features.
def Correlation(dataset, threshold): 
    correltated_features = set() # as a container of highly correlated features
    correlation_matrix = dataset.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                column_name = correlation_matrix.columns[i]
                correltated_features.add(column_name)
    return correltated_features

In [8]:
# let's selected features with a correlation factor > 0.8
Correlation(df, 0.8)

{'Amount', 'AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan'}

In [9]:
# Now we can drop these features from our dataset
df.drop(columns= ['AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan'], inplace = True )

### 4. Feature Encoding

###### Let's divide our features to "Target" feature and "Independnt features" :

In [10]:
Target_feature = df.LoanStatus
Ind_features   = df.drop(columns = ['LoanStatus'])

In [11]:
# Target_feature Encoding:
Target_feature = np.where(Target_feature=='NoDefault', 1, 0)

In [12]:
# Ind_features Encoding:
Ind_features = pd.get_dummies(Ind_features)

In [13]:
Ind_features.shape

(77394, 137)

### 5. Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler 

Scalar = StandardScaler()

Ind_features = Scalar.fit_transform(Ind_features)

### 6. Feature Extraction and Dimensionality-reduction using (PCA) 

In [15]:
# importing PCA class
from sklearn.decomposition import PCA

# Create a PCA object with number of component = 25
pca = PCA(n_components = 110) 

# Let's fit our data using PCA
Ind_features_pca = pca.fit_transform(Ind_features)

# Percentage of information we have after apllying 2-d PCA
sum(pca.explained_variance_ratio_) * 100

99.7288382417102

### 7. Spiliting Data into training and testing sets

In [16]:
X = Ind_features_pca
y = Target_feature

In [17]:
# Let's use Train Test Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                    train_size = .75, stratify=y)

In [18]:
X_train.shape, y_train.shape

((58045, 110), (58045,))

In [19]:
X_test.shape, y_test.shape

((19349, 110), (19349,))

## Modeling (Classification)
Using: **Logistic Regression** and **Random Forest Classifier**

### I. Logistic Regression:

In [23]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(f"Logistic Regression Reort: \n{'-'*30}\n")
print("Accuracy score:\n", accuracy_score(y_test, y_pred))
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('*'*40)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Default', 'Non-deafault']))

Logistic Regression Reort: 
------------------------------

Accuracy score:
 0.8842834255000258
****************************************

Confusion Matrix:
 [[8963 1736]
 [ 503 8147]]
****************************************

Classification Report:
               precision    recall  f1-score   support

     Default       0.95      0.84      0.89     10699
Non-deafault       0.82      0.94      0.88      8650

    accuracy                           0.88     19349
   macro avg       0.89      0.89      0.88     19349
weighted avg       0.89      0.88      0.88     19349



In [29]:
from sklearn.metrics import roc_auc_score

y_pred_prob = logreg.predict_proba(X_test)[:,1]

print("roc_auc_score = ", roc_auc_score(y_test, y_pred_prob))

roc_auc_score =  0.9425745369752562


In [26]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(logreg, X_train, y_train, cv=5, scoring='roc_auc')

print(cv_scores)

[0.94577493 0.94432052 0.94601035 0.94482064 0.94382633]


### II. Random Forest Classifier:

In [27]:
# Defining base-model with default parameters
from sklearn.ensemble import RandomForestClassifier

rf_base = RandomForestClassifier(random_state=0)
rf_base.fit(X_train, y_train)
rf_y_pred_base = rf_base.predict(X_test)

In [28]:
print(f"Random Forest Classifier Reort: \n{'-'*30}\n")
print("Accuracy score:\n", accuracy_score(y_test, rf_y_pred_base))
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred_base))
print('*'*40)
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred_base,
                                        target_names=['Default', 'Non-deafault']))

Random Forest Classifier Reort: 
------------------------------

Accuracy score:
 0.8200940617086154
****************************************

Confusion Matrix:
 [[9160 1539]
 [1942 6708]]
****************************************

Classification Report:
               precision    recall  f1-score   support

     Default       0.83      0.86      0.84     10699
Non-deafault       0.81      0.78      0.79      8650

    accuracy                           0.82     19349
   macro avg       0.82      0.82      0.82     19349
weighted avg       0.82      0.82      0.82     19349



In [30]:
# Defining a Random Forest Classifier using Hyperparameter tunnimg
from sklearn.model_selection import RandomizedSearchCV

clf_rf = RandomForestClassifier()

param_dist = {"max_depth": [5, 10, 15],
              "n_estimators": list(range(10,200)),
              "max_features": list(range(10, X_test.shape[1]+1)),
              "min_samples_split": list(range(2,11)),
              "min_samples_leaf": list(range(1,11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)

random_search.fit(X_train, y_train)

rf_preds = random_search.best_estimator_.predict(X_test)

In [32]:
print(f"Random Forest Classifier Report: \n{'-'*30}")
print("Best estimators' Hyperparameters are: \n", random_search.best_estimator_)
print('*'*40)
print("\nAccuracy score:\n", accuracy_score(y_test, rf_preds))
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_preds))
print('*'*40)
print("\nClassification Report:\n", classification_report(y_test, rf_preds,
                                        target_names=['Default', 'Non-deafault']))

Random Forest Classifier Report: 
------------------------------
Best estimators' Hyperparameters are: 
 RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=15,
                       max_features=38, min_samples_split=4, n_estimators=29)
****************************************

Accuracy score:
 0.8284665874205386
****************************************

Confusion Matrix:
 [[8902 1797]
 [1522 7128]]
****************************************

Classification Report:
               precision    recall  f1-score   support

     Default       0.85      0.83      0.84     10699
Non-deafault       0.80      0.82      0.81      8650

    accuracy                           0.83     19349
   macro avg       0.83      0.83      0.83     19349
weighted avg       0.83      0.83      0.83     19349



In [33]:
rf_pred_prob = random_search.best_estimator_.predict_proba(X_test)[:,1]

print("roc_auc_score = ", roc_auc_score(y_test, rf_pred_prob))

roc_auc_score =  0.9119358786164987


**Conclusion:**

Random Forest Classifier, with parameters of:
> * max_depth=15,
> * max_features=22,
> * min_samples_leaf=4,
> * n_estimators=115

Achieved the highest performance in classifying the Defaulted and Non-defaulted Loans

# Modeling (Regression) **Current Task**

## Preprocessing (Creating **Target Variables**)

In [None]:
loan_data = df.copy()

In [None]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77394 entries, 0 to 77393
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   BidsPortfolioManager                    77394 non-null  int64  
 1   BidsApi                                 77394 non-null  int64  
 2   BidsManual                              77394 non-null  float64
 3   NewCreditCustomer                       77394 non-null  bool   
 4   VerificationType                        77394 non-null  object 
 5   LanguageCode                            77394 non-null  object 
 6   Age                                     77394 non-null  int64  
 7   Gender                                  77394 non-null  object 
 8   Country                                 77394 non-null  object 
 9   AppliedAmount                           77394 non-null  float64
 10  Amount                                  77394 non-null  fl

### The **EMI is calculated based on the following mathematical formula: DONE**

* **EMI = P × r × (1 + r) ^ n / ((1 + r) ^ n – 1)**

> Where,
> 
> **P** = Loan amount. **"Amount"**
>
> **r** = Rate of interest, which is calculated on a monthly basis. **Interest**
>
> **n** = Loan tenure (in months). **LoanDuration**


* **Loan tenure:** is the amount of time you are given to repay your loan

* **Amount** is not evident in this dataset (after FeatureEngineering), so I'll upload it from the original daaset.

In [None]:
main_data = pd.read_csv('/content/drive/MyDrive/Technocolabs_Team/Bondora_preprocessed.csv')
print(main_data.shape)
main_data['Amount'].head()

(77394, 39)


0    115.0408
1    140.6057
2    319.5409
3     57.5205
4    319.5436
Name: Amount, dtype: float64

In [None]:
main_data_amnt = main_data.loc[loan_data.index, 'Amount']
main_data.head()

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,NewCreditCustomer,VerificationType,LanguageCode,Age,Gender,Country,AppliedAmount,Amount,Interest,LoanDuration,MonthlyPayment,UseOfLoan,Education,MaritalStatus,EmploymentStatus,EmploymentDurationCurrentEmployer,OccupationArea,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,RefinanceLiabilities,DebtToIncome,FreeCash,Rating,Restructured,CreditScoreEsMicroL,PrincipalPaymentsMade,InterestAndPenaltyPaymentsMade,PrincipalBalance,InterestAndPenaltyBalance,NoOfPreviousLoansBeforeLoan,AmountOfPreviousLoansBeforeLoan,PreviousRepaymentsBeforeLoan,PreviousEarlyRepaymentsCountBeforeLoan,LoanStatus
0,0,0,115.041,True,"Income unverified,cross-referenced by phone",Estonian,61,Female,EE,319.5582,115.0408,30.0,12,130.393314,Other,Vocational education,Married,Fully employed,UpTo3Years,Retail and wholesale,Not specified,10500.0,0,0.0,0,0.0,0.0,F,False,M,115.0408,20.4222,0.0,0.0,1,83.0852,0.0,0,NoDefault
1,0,0,140.6057,False,"Income unverified,cross-referenced by phone",Estonian,48,Female,EE,191.7349,140.6057,25.0,1,130.393314,Other,Higher education,Divorced,Fully employed,MoreThan5Years,Education,Not specified,10800.0,0,0.0,0,0.0,0.0,F,False,M,140.6057,2.0227,0.0,0.0,2,255.6467,258.6256,0,NoDefault
2,0,0,319.558,True,"Income unverified,cross-referenced by phone",Estonian,58,Female,EE,319.5582,319.5409,25.0,20,130.393314,Home improvement,Secondary education,Married,Fully employed,UpTo4Years,Hospitality and catering,Not specified,7000.0,0,0.0,0,0.0,0.0,F,True,M,203.1909,59.7626,116.35,414.07,0,0.0,0.0,0,Default
3,0,0,57.5205,True,"Income unverified,cross-referenced by phone",Estonian,23,Female,EE,127.8233,57.5205,45.0,15,130.393314,Loan consolidation,Basic education,Single,Not present,UpTo2Years,Other,Not specified,11600.0,0,0.0,0,0.0,0.0,F,False,M,57.5205,18.7323,0.0,0.0,1,134.2144,0.0,0,NoDefault
4,0,0,319.5582,True,"Income unverified,cross-referenced by phone",Estonian,25,Female,EE,319.5582,319.5436,30.0,12,130.393314,Vehicle,Secondary education,Cohabitant,Fully employed,UpTo2Years,Retail and wholesale,Not specified,6800.0,0,0.0,0,0.0,0.0,F,False,M,319.5436,220.42,0.0,0.0,1,146.9966,0.0,0,Default


In [None]:
loan_data['Amount'] = main_data_amnt.values

In [None]:
loan_data_temp = loan_data[['LoanDuration', 'Interest', 'Amount']]
loan_data_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77394 entries, 0 to 77393
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LoanDuration  77394 non-null  int64  
 1   Interest      77394 non-null  float64
 2   Amount        77394 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.8 MB


In [None]:
loan_data_temp.isnull().sum()

LoanDuration    0
Interest        0
Amount          0
dtype: int64

In [None]:
def cal_EMI(P, r, n):
  P = P.values
  r = r.values
  n = n.values
  #print(P.shape[0])
  result_1 = np.empty(0)
  result_2 = np.empty(0)
  result = np.empty(0)
  for i in range(P.shape[0]):
    #print(P[i])
    #print(r[i])
    #print(n[i])
    # EMI = P × r × (1 + r) ^ n / ((1 + r) ^ n – 1)
    #print(P[i] * (1 + r[i]))
    result_1 = np.append(result_1, P[i] * r[i] * np.power((1 + r[i]),n[i]))
    result_2 = np.append(result_2, np.power((1 + r[i]),n[i]) - 1)
    result = np.append(result, (result_1[i] / result_2[i]))

  return result

In [None]:
loan_data_temp['EMI'] = cal_EMI(loan_data_temp['Amount'], loan_data_temp['Interest'], loan_data_temp['LoanDuration'])

In [None]:
loan_data['EMI'] = loan_data_temp['EMI']

In [None]:
loan_data['EMI'].head()

0    3451.2240
1    3655.7482
2    7988.5225
3    2588.4225
4    9586.3080
Name: EMI, dtype: float64

### **Eligible Loan Amount DONE**

**ELA** = Assets (Income) - Liabilities of the borrower

* Assets:
> **FreeCash** = ELA
>
> **TotalIncome** - **LiabilitiesTotal** = ELA
>
> Let's Check both of them out in the data.

**Under Concsideration**

Eligible Loan Amount means, with respect to a **Mortgage Loan** that is an Eligible Loan, **the lesser of**:

(i) the Principal Balance of such Eligible Loan, **AppliedAmount**

(ii) the Market Value of such Eligible Loan **PurchasePrice** | **BidPrinciple**

**My Approach**

1. Calculate AppliedAmount + AppliedAmount*Interest = Total Liabilities Amount
2. Divide on the loan tenure (months)
3. If the result is less than (TotalIncome- LiabilitiesTotal)*30/100
>Then allow the Applied Amount, If not allow only the result of the previous calculation.

In [None]:
loan_data_temp = loan_data[['AppliedAmount', 'Interest', 'IncomeTotal', 'LiabilitiesTotal', 'LoanDuration']]
loan_data_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77394 entries, 0 to 77393
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AppliedAmount     77394 non-null  float64
 1   Interest          77394 non-null  float64
 2   IncomeTotal       77394 non-null  float64
 3   LiabilitiesTotal  77394 non-null  float64
 4   LoanDuration      77394 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 3.0 MB


In [None]:
loan_data_temp[loan_data_temp['IncomeTotal']==3665].shape

(0, 5)

In [None]:
# Step 1
loan_data_temp['Ava_Inc'] = ((loan_data_temp['IncomeTotal']-loan_data_temp['LiabilitiesTotal'])*0.3)
loan_data_temp['Total_Loan_Amnt'] = np.round((df['AppliedAmount'] + (df['AppliedAmount'] * df['Interest']) /100)*df['LoanDuration'])
loan_data_temp.head()

Unnamed: 0,AppliedAmount,Interest,IncomeTotal,LiabilitiesTotal,LoanDuration,Ava_Inc,Total_Loan_Amnt
0,319.5582,30.0,10500.0,0.0,12,3150.0,4985.0
1,191.7349,25.0,10800.0,0.0,1,3240.0,240.0
2,319.5582,25.0,7000.0,0.0,20,2100.0,7989.0
3,127.8233,45.0,11600.0,0.0,15,3480.0,2780.0
4,319.5582,30.0,6800.0,0.0,12,2040.0,4985.0


In [None]:
# Step 2
def eligible_loan_amnt(df):
  Ava_Inc = df['Ava_Inc'].values
  Total_Loan_Amnt = df['Total_Loan_Amnt'].values
  ELA = np.empty(0)
  for i in range(len(Ava_Inc)):
    if Total_Loan_Amnt[i] <= Ava_Inc[i]:
      ELA = np.append(ELA, Total_Loan_Amnt[i])
    else:
      ELA = np.append(ELA, Ava_Inc[i])
  return ELA

In [None]:
loan_data_temp['ELA'] = eligible_loan_amnt(loan_data_temp)

In [None]:
loan_data_temp.head()

Unnamed: 0,AppliedAmount,Interest,IncomeTotal,LiabilitiesTotal,LoanDuration,Ava_Inc,Total_Loan_Amnt,ELA
0,319.5582,30.0,10500.0,0.0,12,3150.0,4985.0,3150.0
1,191.7349,25.0,10800.0,0.0,1,3240.0,240.0,240.0
2,319.5582,25.0,7000.0,0.0,20,2100.0,7989.0,2100.0
3,127.8233,45.0,11600.0,0.0,15,3480.0,2780.0,2780.0
4,319.5582,30.0,6800.0,0.0,12,2040.0,4985.0,2040.0


In [None]:
loan_data['ELA'] = loan_data_temp['ELA']
loan_data.columns

Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer',
       'VerificationType', 'LanguageCode', 'Age', 'Gender', 'Country',
       'AppliedAmount', 'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment',
       'UseOfLoan', 'Education', 'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'LiabilitiesTotal', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash',
       'Rating', 'Restructured', 'CreditScoreEsMicroL',
       'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade',
       'PrincipalBalance', 'InterestAndPenaltyBalance',
       'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan',
       'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus', 'EMI', 'ELA'],
      dtype='object')

### **Preferred ROI**

* We weren't able to determine the procedure of handling Risk related to loan in order to determine **Preferred ROI**.

* In order to complete the task in hand and complete it, we'll calculate **ROI** instead.
>**ROI** = Investment Gain / Investment Base
>
> **ROI** = Amount lended * interest/100

* **InterestAndPenaltyDebtServicingCost**	Service cost related to the recovery of the debt based on the interest and penalties of the investment

* **InterestAndPenaltyWriteOffs**	Interest that was written off on the investment

* **PrincipalDebtServicingCost**	Service cost related to the recovery of the debt based on the principal of the investment

* **PrincipalWriteOffs**	Principal that was written off on the investment

* **PurchasePrice**	Investment amount or secondary market purchase price

In [None]:
loan_data_temp = loan_data[['Amount', 'Interest']]
loan_data_temp.head()

Unnamed: 0,Amount,Interest
0,115.0408,30.0
1,140.6057,25.0
2,319.5409,25.0
3,57.5205,45.0
4,319.5436,30.0


In [None]:
loan_data_temp['InterestAmount'] = (loan_data_temp['Amount']*(loan_data_temp['Interest']/100))
loan_data_temp['TotalAmount'] = (loan_data_temp['InterestAmount'] + loan_data_temp['Amount'])
loan_data_temp['ROI'] = (loan_data_temp['InterestAmount'] / loan_data_temp['TotalAmount'])*100
loan_data['ROI'] = loan_data_temp['ROI']

In [None]:
loan_data_temp.head()

Unnamed: 0,Amount,Interest,InterestAmount,TotalAmount,ROI
0,115.0408,30.0,34.51224,149.55304,23.076923
1,140.6057,25.0,35.151425,175.757125,20.0
2,319.5409,25.0,79.885225,399.426125,20.0
3,57.5205,45.0,25.884225,83.404725,31.034483
4,319.5436,30.0,95.86308,415.40668,23.076923


### I. Handling outliers:

In [None]:
# Let's compute IQR for each numerical feature
df_IQR = loan_data[loan_data.select_dtypes([float, int]).columns].quantile(.75) - loan_data[loan_data.select_dtypes([float, int]).columns].quantile(.25)

In [None]:
# Let's compute maximum and minimum limits
df_Max =  loan_data[loan_data.select_dtypes([float, int]).columns].quantile(.75) + (1.5*df_IQR)
df_Min =  loan_data[loan_data.select_dtypes([float, int]).columns].quantile(.25) - (1.5*df_IQR)

In [None]:
loan_data.select_dtypes([float, int]).columns

Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'Age', 'AppliedAmount',
       'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment', 'IncomeTotal',
       'ExistingLiabilities', 'LiabilitiesTotal', 'RefinanceLiabilities',
       'DebtToIncome', 'FreeCash', 'PrincipalPaymentsMade',
       'InterestAndPenaltyPaymentsMade', 'PrincipalBalance',
       'InterestAndPenaltyBalance', 'NoOfPreviousLoansBeforeLoan',
       'AmountOfPreviousLoansBeforeLoan', 'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'EMI', 'ELA', 'ROI'],
      dtype='object')

In [None]:
col_IQR = loan_data['Age'].quantile(.75) - loan_data['Age'].quantile(.25)
col_Max =  loan_data['Age'].quantile(.75) + (1.5*col_IQR)

Now we will replace outliers of each column with Lower and Upper bounds of each column:

In [None]:
# Loop for replacing outliers above upper bound with the upper bound value:
for column in loan_data.select_dtypes([float, int]).columns :
   
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Max =  loan_data[column].quantile(.75) + (1.5*col_IQR)
    loan_data[column][loan_data[column] > col_Max] =  col_Max

In [None]:
# Loop for replacing outliers under lower bound with the lower bound value:
for column in loan_data.select_dtypes([float, int]).columns :
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Min =  loan_data[column].quantile(.25) - (1.5*col_IQR)
    loan_data[column][loan_data[column] < col_Min] =  col_Min

### II. Feature Engineering for Regression Modeling

* Splitting the data into Independent and Dependent variables
* Creating Dummy variables for Categorical data
* Standard Scaling

In [None]:
# Splitting Independent and Dependent Variables
X = loan_data.drop(['EMI', 'ELA', 'ROI'], axis=1)
y = loan_data[['EMI', 'ELA', 'ROI']]

**Creating Dummy variables for Categorical Data**

In [None]:
# Let's perform categorical features encoding:
X = pd.get_dummies(X)

**Standard Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler 

scalar = StandardScaler()

X = scalar.fit_transform(X)

In [None]:
y = scalar.fit_transform(y)

## **Regression Modeling:**

**Splitting the Data**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)

### **LinearRegression**

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

y_pred_lin = lin_reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# model evaluation
print('Model Evaluation:')
print('mean_squared_error : ', mean_squared_error(y_test, y_pred_lin))
print('mean_squared_percentage_error : ', mean_absolute_percentage_error(y_test, y_pred_lin, multioutput=[0.3, 0.5, 0.7]))
print('R2_score : ', r2_score(y_test, y_pred_lin))

Model Evaluation:
mean_squared_error :  2.9078032169352886e+22
mean_squared_percentage_error :  23311701480.844982
R2_score :  -2.938897503917032e+22


### **RidgeRegression L2 Norm**

**Base Model**

In [None]:
from sklearn.linear_model import Ridge

rid_reg = Ridge()

rid_reg.fit(X_train, y_train)

y_pred_base = rid_reg.predict(X_test)

In [None]:
# model evaluation
print('Ridge Regression - Base Model:')
print('mean_squared_error : ', mean_squared_error(y_test, y_pred_base))
print('mean_squared_percentage_error : ', mean_absolute_percentage_error(y_test, y_pred_base, multioutput=[0.3, 0.5, 0.7]))
print('R2_score : ', r2_score(y_test, y_pred_base))

Ridge Regression - Base Model:
mean_squared_error :  0.0553599180861661
mean_squared_percentage_error :  0.447943532253813
R2_score :  0.9442172956442175


**Hyperparameter Tunning**

In [None]:
# Defining a Random Forest Classifier using Hyperparameter tunnimg
from sklearn.model_selection import RandomizedSearchCV

rid_reg = Ridge()

param_dist = {"alpha": list(np.array(range(1,9))*0.25),
              "normalize": [True, False]}

random_search = RandomizedSearchCV(rid_reg, param_distributions=param_dist)

random_search.fit(X_train, y_train)

rid_preds = random_search.best_estimator_.predict(X_test)

In [None]:
# model evaluation
print('Ridge Regression - Best estimator:')
print('Best Estimator: \n', random_search.best_estimator_)
print('mean_squared_error : ', mean_squared_error(y_test, rid_preds))
print('mean_squared_percentage_error : ', mean_absolute_percentage_error(y_test, rid_preds, multioutput=[0.3,0.5, 0.7]))
print('R2_score : ', r2_score(y_test, rid_preds))

Ridge Regression - Best estimator:
Best Estimator: 
 Ridge(alpha=1.75, normalize=False)
mean_squared_error :  0.055360012302617105
mean_squared_percentage_error :  0.447960855158721
R2_score :  0.9442172007769433
