# Importing some packages

In [5]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import r_regression, SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

from xgboost import XGBRegressor

from lightgbm import LGBMRegressor

from scipy import stats

from statsmodels.stats.outliers_influence import variance_inflation_factor

from boruta import BorutaPy

from BorutaShap import BorutaShap

from collections import Counter

import shap

import os
from pathlib import Path

from bisect import bisect

import re

import warnings 
warnings.filterwarnings('ignore')

# Loading data

In [2]:
# Directory that holds the train/test CSV files (machine-specific absolute path).
data_path = Path(r"C:/\Users/\vchar/\OneDrive/\Desktop/\ML Projects/\portfolio/\VehicleLoanDefault/\data")

# Read both splits with default parsing options; every column keeps whatever
# dtype pandas infers, so date columns are still plain strings at this point.
train_df, test_df = (
    pd.read_csv(data_path / csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Feature engineering

## Creating some features

In [6]:
# Upper-bound lookup table: each (month, day, sign) entry marks the LAST
# calendar day that still belongs to `sign`. The list must stay sorted by
# (month, day) because zodiac_sign() performs a binary search over it.
# "Cap" appears twice because it wraps around the end of the year.
signs = [
    (1, 20, "Cap"),
    (2, 18, "Aqu"),
    (3, 20, "Pis"),
    (4, 20, "Ari"),
    (5, 21, "Tau"),
    (6, 21, "Gem"),
    (7, 22, "Can"),
    (8, 23, "Leo"),
    (9, 23, "Vir"),
    (10, 23, "Lib"),
    (11, 22, "Sco"),
    (12, 22, "Sag"),
    (12, 31, "Cap"),
]


def zodiac_sign(m, d):
    """Return the three-letter zodiac abbreviation for month ``m``, day ``d``.

    The 2-tuple ``(m, d)`` sorts immediately before the 3-tuple in ``signs``
    that shares the same month and day, so a boundary date is assigned to the
    entry it terminates (e.g. ``(1, 20)`` -> ``"Cap"``, ``(1, 21)`` -> ``"Aqu"``).

    Raises:
        IndexError: if ``(m, d)`` sorts after the final ``(12, 31)`` entry.
    """
    slot = bisect(signs, (m, d))
    _, _, abbreviation = signs[slot]
    return abbreviation

# Parse both date columns to datetime64. No explicit `format` is passed, so
# pandas infers the layout from the data.
# NOTE(review): an earlier commented-out hint suggested "%Y-%m-%d" for the
# training file -- confirm against the raw CSV before relying on inference.
for date_col in ('DisbursalDate', 'Date.of.Birth'):
    train_df[date_col] = pd.to_datetime(train_df[date_col])

# Borrower age at disbursal, in years with one decimal (365-day years,
# leap days are ignored).
age_in_days = (train_df['DisbursalDate'] - train_df['Date.of.Birth']).dt.days
train_df['age'] = (age_in_days / 365).round(1)

# Calendar month of the disbursal and zodiac sign of the birth date.
train_df['disb_month'] = train_df['DisbursalDate'].dt.month
train_df['zodiac_sign'] = train_df['Date.of.Birth'].apply(
    lambda dob: zodiac_sign(dob.month, dob.day)
)

def convert2numbers(x):
    """Convert a duration string such as ``"1yrs 11mon"`` to fractional years.

    Only the first ``"<digits>yrs <digits>mon"`` occurrence in ``x`` is used.
    The result is ``years + months / 12`` rounded to two decimals.

    Raises:
        IndexError: if ``x`` contains no ``"<digits>yrs <digits>mon"`` pattern.
        TypeError: if ``x`` is not a string (e.g. a missing value).
    """
    matches = re.findall(r'(\d+)yrs (\d+)mon', x)
    years, months = matches[0]
    return round(int(years) + int(months) / 12, 2)

# Turn the "<N>yrs <M>mon" duration strings into fractional years.
train_df['avg_loan_tenure'] = train_df['AVERAGE.ACCT.AGE'].apply(convert2numbers)
train_df['hist_length'] = train_df['CREDIT.HISTORY.LENGTH'].apply(convert2numbers)

# Integer-encode the bureau score description. Codes run 1..K in order of
# first appearance in the training data, so they are arbitrary labels and do
# not express a risk ranking.
score_desc_list = list(train_df['PERFORM_CNS.SCORE.DESCRIPTION'].unique())
score_desc_map_dict = {
    desc: code for code, desc in enumerate(score_desc_list, start=1)
}

train_df['score_desc'] = train_df['PERFORM_CNS.SCORE.DESCRIPTION'].map(score_desc_map_dict)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: with pandas Copy-on-Write the selection is a temporary
# object, so the in-place form would leave train_df unchanged.
train_df['Employment.Type'] = train_df['Employment.Type'].fillna('Unknown')

# Same first-appearance integer encoding for the employment type
# (missing values were labelled 'Unknown' just above).
employment_type_list = list(train_df['Employment.Type'].unique())
employment_type_map_dict = {
    emp_type: code for code, emp_type in enumerate(employment_type_list, start=1)
}

train_df['Employment.Type'] = train_df['Employment.Type'].map(employment_type_map_dict)

# Drop the raw columns that have been replaced by the derived features.
train_df.drop(
    columns=[
        'DisbursalDate', 'Date.of.Birth',
        'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH',
        'PERFORM_CNS.SCORE.DESCRIPTION'
    ],
    inplace=True
)

score_desc_map_dict

{'No Bureau History Available': 1,
 'I-Medium Risk': 2,
 'L-Very High Risk': 3,
 'A-Very Low Risk': 4,
 'Not Scored: Not Enough Info available on the customer': 5,
 'D-Very Low Risk': 6,
 'M-Very High Risk': 7,
 'B-Very Low Risk': 8,
 'C-Very Low Risk': 9,
 'E-Low Risk': 10,
 'H-Medium Risk': 11,
 'F-Low Risk': 12,
 'K-High Risk': 13,
 'Not Scored: No Activity seen on the customer (Inactive)': 14,
 'Not Scored: Sufficient History Not Available': 15,
 'Not Scored: No Updates available in last 36 months': 16,
 'G-Low Risk': 17,
 'J-High Risk': 18,
 'Not Scored: Only a Guarantor': 19,
 'Not Scored: More than 50 active Accounts found': 20}

In [8]:
employment_type_map_dict

{'Salaried': 1, 'Self employed': 2, 'Unknown': 3}

In [7]:
# Apply to the test split the same feature engineering used for training.

# Parse the date columns; no explicit `format` is passed, so pandas infers it.
# NOTE(review): an earlier commented-out hint suggested "%d-%m-%Y" for the
# test file -- if the file really is day-first, confirm that inference does
# not swap day and month on ambiguous dates.
test_df['DisbursalDate'] = pd.to_datetime(test_df['DisbursalDate'])
test_df['Date.of.Birth'] = pd.to_datetime(test_df['Date.of.Birth'])

# Borrower age at disbursal in years, one decimal (365-day years).
test_df['age'] = round((test_df['DisbursalDate'] - test_df['Date.of.Birth']).dt.days / 365, 1)

test_df['disb_month'] = test_df['DisbursalDate'].dt.month
test_df['zodiac_sign'] = test_df['Date.of.Birth'].apply(lambda x: zodiac_sign(x.month, x.day))

# "<N>yrs <M>mon" duration strings -> fractional years.
test_df['avg_loan_tenure'] = test_df['AVERAGE.ACCT.AGE'].apply(convert2numbers)
test_df['hist_length'] = test_df['CREDIT.HISTORY.LENGTH'].apply(convert2numbers)

# Reuse the mapping built from the training data. Any description that is not
# a key of score_desc_map_dict is mapped to NaN by Series.map.
test_df['score_desc'] = test_df['PERFORM_CNS.SCORE.DESCRIPTION'].map(score_desc_map_dict)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: with pandas Copy-on-Write the selection is a temporary
# object, so the in-place form would leave test_df unchanged.
test_df['Employment.Type'] = test_df['Employment.Type'].fillna('Unknown')

# Same train-derived encoding; employment types unseen in training become NaN.
test_df['Employment.Type'] = test_df['Employment.Type'].map(employment_type_map_dict)

# Drop the raw columns that have been replaced by the derived features.
test_df.drop(
    columns=[
        'DisbursalDate', 'Date.of.Birth',
        'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH',
        'PERFORM_CNS.SCORE.DESCRIPTION'
    ],
    inplace=True
)