# [Give me some credit competition](https://www.kaggle.com/c/GiveMeSomeCredit/overview) - EDA & Data Cleaning
---

### Competition Intro : 
*Banks play a crucial role in market economies. They decide who can get finance and on what terms and can make or break investment decisions. For markets and society to function, individuals and companies need access to credit.* 

*Credit scoring algorithms, which make a guess at the probability of default, are the method banks use to determine whether or not a loan should be granted. This competition requires participants to improve on the state of the art in credit scoring, by predicting the probability that somebody will experience financial distress in the next two years.*

*The goal of this competition is to build a model that borrowers can use to help make the best financial decisions.*

*Historical data are provided on 250,000 borrowers.*

---
### Data Dictionary

`SeriousDlqin2yrs` : Person experienced 90 days past due delinquency or worse.

`RevolvingUtilizationOfUnsecuredLines` : Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits.

`age` : Age of borrower in years.

`NumberOfTime30-59DaysPastDueNotWorse` : Number of times borrower has been 30-59 days past due but no worse in the last 2 years.

`DebtRatio` : Monthly debt payments, alimony, living costs divided by monthly gross income.

`MonthlyIncome` : Monthly income.

`NumberOfOpenCreditLinesAndLoans` : Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards).

`NumberOfTimes90DaysLate` : Number of times borrower has been 90 days or more past due.

`NumberRealEstateLoansOrLines` : Number of mortgage and real estate loans including home equity lines of credit.

`NumberOfTime60-89DaysPastDueNotWorse` : Number of times borrower has been 60-89 days past due but no worse in the last 2 years.

`NumberOfDependents` : Number of dependents in family excluding themselves (spouse, children etc.).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif

%matplotlib inline

In [None]:
def plot_formatting():
    '''
    Apply the notebook-wide matplotlib defaults.

    Each (group, settings) pair below is forwarded to ``plt.rc`` in order, so
    every figure created afterwards picks up these figure, axes, spine, grid,
    tick and font settings.
    '''

    rc_groups = [
        ('figure', dict(figsize=(12,6), titleweight='bold', titlesize=25)),
        ('axes', dict(
            labelweight='ultralight',
            titleweight='ultralight',
            titlelocation='left',
            titlecolor='k',
            titley=1.03,
            titlesize=16,
            grid=True,
        )),
        # Only the bottom spine is left at its current setting.
        ('axes.spines', dict(right=False, left=False, top=False)),
        ('grid', dict(color='k', linestyle=(0,15,2,0), alpha=0.5)),
        ('axes.grid', dict(axis='y')),
        ('ytick.major', dict(width=0)),
        ('font', dict(family='monospace')),
    ]

    for group, settings in rc_groups:
        plt.rc(group, **settings)

plot_formatting() # Setting our default settings

In [None]:
def key_figures(train, test):
    '''
    Display key figures of datasets

    Draws a two-row text panel. The top row summarises the training set
    (number of rows, number of features, share of rows with target=1, share of
    missing cells); the bottom row summarises the test set (number of rows,
    share of missing cells).

    Parameters
    ----------
    train : DataFrame that contains the 'SeriousDlqin2yrs' target column.
    test : DataFrame; its 'SeriousDlqin2yrs' column, when present, is left out
        of the computed figures. The caller's frame is not modified.

    Percentages are truncated to whole numbers with int() before display.
    '''

    fig, axes = plt.subplots(2,1, figsize=(5, 4))
    fig.subplots_adjust(hspace=1.2)
    # errors='ignore' lets the function accept a test frame that no longer has
    # the target column instead of raising KeyError.
    test = test.drop('SeriousDlqin2yrs', axis=1, errors='ignore')

    # Calculate key figures
    train_n = train.shape[0]
    test_n = test.shape[0]
    m = train.shape[1]-1  # every training column except the target
    train_perc_nan = train.isnull().sum().sum() * 100 / np.prod(train.shape)
    test_perc_nan = test.isnull().sum().sum() * 100 / np.prod(test.shape)
    perc_target = train['SeriousDlqin2yrs'].mean() * 100

    def draw_figure(ax, x, value, caption, color, caption_y=0.01):
        # One key figure: the bold coloured value with its grey caption below.
        ax.text(x, 0.6, value, fontsize=18, fontweight="bold", color=color, ha='center')
        ax.text(x, caption_y, caption, fontsize=15, fontweight="bold", c='grey', ha='center')

    train_ax, test_ax = axes[0], axes[1]

    # Hide axis
    train_ax.axis('off')
    test_ax.axis('off')

    train_ax.set_title('- Training set key figures', x=-0.7)
    test_ax.set_title('- Test set key figures', x=-0.7)

    draw_figure(train_ax, 0, train_n, 'borrowers in the dataset \n(training examples)', 'seagreen')
    draw_figure(test_ax, 0, test_n, 'borrowers in the dataset \n(test examples)', 'tan')
    draw_figure(train_ax, 0.75, m, 'features', 'seagreen', caption_y=0.1)
    draw_figure(train_ax, 1.5, str(int(perc_target)) + '%',
                'of borrowers had serious \ndelinquency (target=1)', 'seagreen')
    draw_figure(train_ax, 2.25, str(int(train_perc_nan)) + '%', 'of missing values', 'seagreen',
                caption_y=0.1)
    draw_figure(test_ax, 0.75, str(int(test_perc_nan)) + '%', 'of missing values', 'tan')

In [None]:
# Load the raw training and test CSV files from ../input/GiveMeSomeCredit.
train = pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
test = pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Remove the 'Unnamed: 0' column from the training frame.
train = train.drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Remove the 'Unnamed: 0' column from the test frame.
test = test.drop(columns='Unnamed: 0')

## Key Figures

In [None]:
# Render the key-figure panel for both datasets.
key_figures(train, test)
# NOTE: the "6%" in this title is hard-coded, not derived from the data.
plt.suptitle('Unbalanced dataset (6% of rows with target=1)', x=0.45, y=1.07);

## dtype distribution

In [None]:
# One bar per dtype, counting how many training columns have that dtype.
dtype_names = train.dtypes.astype(str)
countplt = sns.countplot(
    x=dtype_names,
    palette=['darkgreen', 'mediumseagreen'],
    linewidth=1,
    edgecolor='black',
)

# Write each dtype name inside its bar; the x tick labels are hidden below.
tick_labels = plt.gca().get_xticklabels()
for i, bar in enumerate(countplt.patches):
    label_text = '{}'.format(tick_labels[i].get_text())
    label_xy = (bar.get_x()+0.4, bar.get_height()-1)
    countplt.annotate(label_text, label_xy,
                      ha='center', va='bottom', color= 'white', fontsize=16)

plt.tick_params(bottom=False, labelbottom=False)
plt.suptitle('All Features are numericals', x=0.34, y=1.03)
plt.title('dtype distribution for the training set');

In [None]:
# Min, quartiles and max of every int64 column, one row per column.
train.select_dtypes('int64').describe().transpose()[['min', '25%', '50%', '75%', 'max']]

- All integer features are **count features** as the only binary column is the target `SeriousDlqin2yrs`
- `NumberOfTime30-59DaysPastDueNotWorse`, `NumberOfTime60-89DaysPastDueNotWorse` and `NumberOfTimes90DaysLate` seem to be slightly sparse so in general people do not tend to have past due.
- `age` has at least one outlier: the minimum entry is 0, which is not a plausible borrower age.

In [None]:
# Min, quartiles and max of every float64 column, one row per column.
train.select_dtypes('float64').describe().transpose()[['min', '25%', '50%', '75%', 'max']]

- `NumberOfDependents` is stored as a float feature but is normally an integer. There should be some outliers or missing values among its entries
- `DebtRatio` is a ratio, thus should be between 0 and 1, but the max value of entries for this column is 329664. Therefore, there are outliers present.

## Missing values per feature

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

## Distribution of each feature

In [None]:
def plot_distributions_discrete(feature):
    '''
    Plot a discrete feature: train vs test histogram, and split by target.

    Reads the notebook-level `train` and `test` frames.
    Left panel: density histograms (with KDE) of `feature` for both frames.
    Right panel: box plot of the training values per 'SeriousDlqin2yrs' class,
    titled with the mutual-information score between `feature` and the target,
    computed on the training rows where `feature` is not missing.
    '''

    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12,4))

    histogram_specs = (
        (train, 'Train', 'steelblue', 0.6),
        (test, 'Test', 'gold', 0.25),
    )
    for frame, label, color, alpha in histogram_specs:
        sns.histplot(frame[feature], kde=True, label=label, stat='density',
                     discrete=True, color=color, alpha=alpha, ax=hist_ax)
    hist_ax.legend()
    hist_ax.set_title('Distr Train set vs Distr Test set')

    sns.boxplot(x='SeriousDlqin2yrs', y=feature, data=train, ax=box_ax, palette=['seagreen', 'tan'])

    # Score only the rows where the feature is known, against their targets.
    known = train[[feature]].dropna()
    target = train.loc[known.index, 'SeriousDlqin2yrs']
    score = mutual_info_classif(known, target, discrete_features=True,
                                random_state=0)[0]
    box_ax.set_title('Distribution depending on the SeriousDlqin2yrs\n-> MI Score : ' + str(round(score, 7)))

    plt.suptitle('"{}" distributions'.format(feature), y=1.15);


In [None]:
def plot_distributions_continuous(feature):
    '''
    Plot a continuous feature: train vs test density, and split by target.

    Reads the notebook-level `train` and `test` frames.
    Left panel: filled KDE curves of log1p(`feature`) for both frames.
    Right panel: box plot of the raw training values per 'SeriousDlqin2yrs'
    class, titled with the mutual-information score between `feature` and the
    target, computed on the training rows where `feature` is not missing.
    '''

    _, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(12,4))

    # The test curve is labelled 'Test' so the legend tells the curves apart.
    # `fill=True` replaces the deprecated `shade=True` spelling.
    kde_specs = (
        (train, 'Train', 'steelblue', 0.5),
        (test, 'Test', 'gold', 0.3),
    )
    for frame, label, color, alpha in kde_specs:
        sns.kdeplot(frame[feature].apply(np.log1p), label=label,
                    color=color, alpha=alpha, fill=True, edgecolor='k', ax=kde_ax)

    kde_ax.legend()
    kde_ax.set_title('Distr Train set vs Distr Test set')
    kde_ax.set_xlabel('log-{}'.format(feature))

    sns.boxplot(x='SeriousDlqin2yrs', y=train[feature],
                data=train, ax=box_ax, palette=['seagreen', 'tan'])

    # Score only the rows where the feature is known, against their targets.
    known = train[[feature]].dropna()
    target = train.loc[known.index, 'SeriousDlqin2yrs']
    score = mutual_info_classif(known, target, random_state=0)[0]
    box_ax.set_title('Distribution depending on the SeriousDlqin2yrs\n-> MI Score : ' + str(round(score, 7)))

    plt.suptitle('"{}" distributions'.format(feature), y=1.15);


In [None]:
# Inspect `age` with the discrete-feature plotting helper.
plot_distributions_discrete('age')

- Train and test sets have pretty much the same distributions for the feature `age` (The training histogram is blue and the test one is yellow, so when they overlap it appears green).
- Borrowers that experienced a Serious Delinquency seem to be a bit younger in general

In [None]:
# Replace ages below 18 in the training set with the integer median age.
# The median is computed once, and only when such rows exist.
under_age = train['age'] < 18
if under_age.any():
    train.loc[under_age, 'age'] = int(train['age'].median())

In [None]:
# Inspect `NumberOfOpenCreditLinesAndLoans` with the discrete-feature plotting helper.
plot_distributions_discrete('NumberOfOpenCreditLinesAndLoans')

In [None]:
# Inspect `NumberRealEstateLoansOrLines` with the discrete-feature plotting helper.
plot_distributions_discrete('NumberRealEstateLoansOrLines')
# Limit the y-range of the current axes to 0-5
# (presumably the box plot, the last subplot drawn — TODO confirm).
plt.gca().set_ylim(0, 5);

In [None]:
# Inspect `NumberOfDependents` with the discrete-feature plotting helper.
plot_distributions_discrete('NumberOfDependents')

In [None]:
# Fill missing dependents with the training median, then store as integers.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# which does not update the frame under Copy-on-Write.
train['NumberOfDependents'] = (
    train['NumberOfDependents']
    .fillna(train['NumberOfDependents'].median())
    .astype('int64')
)

In [None]:
# Fill missing dependents with the test-set median, then store as integers.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# which does not update the frame under Copy-on-Write.
test['NumberOfDependents'] = (
    test['NumberOfDependents']
    .fillna(test['NumberOfDependents'].median())
    .astype('int64')
)

In [None]:
# 3x3 grid, one column per past-due counter:
#   row 0 - density histograms of log1p(values) for train and test,
#   row 1 - the same histograms with the y-axis limited to 0-0.02 ("Zoom-In"),
#   row 2 - box plot of the raw training values per target class.
past_due_features = ['NumberOfTime30-59DaysPastDueNotWorse',
                     'NumberOfTime60-89DaysPastDueNotWorse',
                     'NumberOfTimes90DaysLate']

fig, axes = plt.subplots(3, 3, figsize=(20,10))
fig.subplots_adjust(hspace=0.55)

for i, feature in enumerate(past_due_features):
    full_ax, zoom_ax, box_ax = axes[0][i], axes[1][i], axes[2][i]

    # Same pair of histograms on the full-scale and the zoomed axes.
    for hist_ax in (full_ax, zoom_ax):
        sns.histplot(train[feature].apply(np.log1p), label='Train',stat='density',
                     color='steelblue', alpha=0.5, edgecolor='k', ax=hist_ax)
        sns.histplot(test[feature].apply(np.log1p), label='Test',stat='density',
                     color='gold', alpha=0.3, edgecolor='k', ax=hist_ax)

    sns.boxplot(x='SeriousDlqin2yrs', y=train[feature],
                data=train, ax=box_ax, palette=['seagreen', 'tan'])

    MI = mutual_info_classif(train[[feature]],
                             train['SeriousDlqin2yrs'], random_state=0, discrete_features=True)

    full_ax.set_title('"{} - {}"'.format(i+1,feature))
    zoom_ax.set_title('Zoom-In')
    box_ax.set_title('-> MI Score : ' + str(round(MI[0], 7)))

    zoom_ax.set_ylim(0, 0.02)
    box_ax.set_ylim(0, 6)

    full_ax.legend()
    box_ax.legend()


plt.suptitle('The three "DaysPast" count features');

In each of the three zoomed-in histograms (those labeled "Zoom-In"), the bar after x=4 appears to have the same height and to sit far from the other bars. The points in these areas might be outliers. Because a logarithm has been applied to the data on the x-axes to make the plots readable, we must look up the raw values of these potential outliers manually.

In [None]:
# Frequency of each value, listed from the largest value down.
train['NumberOfTime30-59DaysPastDueNotWorse'].value_counts().sort_index(ascending=False)

In [None]:
# Frequency of each value, listed from the largest value down.
train['NumberOfTime60-89DaysPastDueNotWorse'].value_counts().sort_index(ascending=False)

In [None]:
# Frequency of each value, listed from the largest value down.
train['NumberOfTimes90DaysLate'].value_counts().sort_index(ascending=False)

These points seem to be **96** and **98**. They show up 269 times in total in each "NumberPastDue" feature

In [None]:
# Training rows whose 90-days-late count is 96 or more, showing the three
# past-due counters side by side.
train[train['NumberOfTimes90DaysLate'] >= 96][['NumberOfTimes90DaysLate',
                                               'NumberOfTime30-59DaysPastDueNotWorse',
                                               'NumberOfTime60-89DaysPastDueNotWorse']]

In [None]:
# Distinct combinations of the three past-due counters among training rows
# whose 90-days-late count is 96 or more.
train[train['NumberOfTimes90DaysLate'] >= 96][['NumberOfTimes90DaysLate',
                                               'NumberOfTime30-59DaysPastDueNotWorse',
                                               'NumberOfTime60-89DaysPastDueNotWorse']].drop_duplicates()

They are definitely outliers. Once one of the three features has one of these two values, the other features get the same outlier value.

In [None]:
# Open credit lines/loans among training rows with a 90-days-late count >= 96.
train[train['NumberOfTimes90DaysLate'] >= 96]['NumberOfOpenCreditLinesAndLoans'].value_counts()

In [None]:
# Real-estate loans/lines among training rows with a 90-days-late count >= 96.
train[train['NumberOfTimes90DaysLate'] >= 96]['NumberRealEstateLoansOrLines'].value_counts()

### Cleaning of 'PastDue' features 

In [None]:
# Reference rows: borrowers with no open credit line/loan and no real-estate loan.
mask = (train['NumberOfOpenCreditLinesAndLoans'] == 0) & (train['NumberRealEstateLoansOrLines'] == 0)

# Replace the outlier codes (>= 96) with the integer median of the reference
# rows. The median is computed once, and only when outliers exist, instead of
# being rebuilt inside a per-element lambda.
outlier_rows = train['NumberOfTimes90DaysLate'] >= 96
if outlier_rows.any():
    train.loc[outlier_rows, 'NumberOfTimes90DaysLate'] = int(
        train.loc[mask, 'NumberOfTimes90DaysLate'].median()
    )

In [None]:
# Distinct combinations of the three past-due counters among test rows whose
# 90-days-late count is 96 or more.
test[test['NumberOfTimes90DaysLate'] >= 96][['NumberOfTimes90DaysLate',
                                             'NumberOfTime30-59DaysPastDueNotWorse',
                                             'NumberOfTime60-89DaysPastDueNotWorse']].drop_duplicates()

In [None]:
# Reference rows of the test set: no open credit line/loan and no real-estate loan.
masktest = (test['NumberOfOpenCreditLinesAndLoans'] == 0) & (test['NumberRealEstateLoansOrLines'] == 0)


def _replace_outlier_counts(frame, reference_rows, column, threshold=96):
    '''
    Overwrite, in place, the entries of `column` that are >= `threshold` with
    the integer median of `column` over `reference_rows` of the same frame.

    The median is computed once per column, and only when such entries exist.
    '''
    outlier_rows = frame[column] >= threshold
    if outlier_rows.any():
        frame.loc[outlier_rows, column] = int(frame.loc[reference_rows, column].median())


# `mask` is the training reference-row selector built in the previous cleaning cell.
_replace_outlier_counts(test, masktest, 'NumberOfTimes90DaysLate')

for column in ('NumberOfTime30-59DaysPastDueNotWorse',
               'NumberOfTime60-89DaysPastDueNotWorse'):
    _replace_outlier_counts(train, mask, column)
    _replace_outlier_counts(test, masktest, column)

In [None]:
# MonthlyIncome: train vs test density of log1p(income) on the left, box plot
# of the raw training income per target class on the right.
_, axes = plt.subplots(1, 2, figsize=(12,4))

# The test curve is labelled 'Test' so the legend tells the curves apart.
# `fill=True` replaces the deprecated `shade=True` spelling.
sns.kdeplot(train['MonthlyIncome'].apply(np.log1p), label='Train',
            color='steelblue', alpha=0.5, fill=True, edgecolor='k', ax=axes[0])
sns.kdeplot(test['MonthlyIncome'].apply(np.log1p), label='Test',
            color='gold', alpha=0.3, fill=True, edgecolor='k', ax=axes[0])

axes[0].legend()
axes[0].set_title('Distr Train set vs Distr Test set')
axes[0].set_xlabel('log-{}'.format('MonthlyIncome'))

sns.boxplot(x='SeriousDlqin2yrs', y=train['MonthlyIncome'],
            data=train, ax=axes[1], palette=['seagreen', 'tan'])
axes[1].set_ylim(0, 20000)

# Mutual information on the rows where the income is known; the income is
# passed as a discrete feature here.
X = train[['MonthlyIncome']].dropna()
MI = mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], random_state=0, discrete_features=True)
axes[1].set_title('Distribution depending on the SeriousDlqin2yrs\n-> MI Score : ' + str(round(MI[0], 7)))

plt.suptitle('"{}" distributions'.format('MonthlyIncome'), y=1.15);

In [None]:
# Fraction of missing MonthlyIncome values in the training set.
train[['MonthlyIncome']].isnull().mean()

About 20% of entries in `MonthlyIncome` are missing. We will impute them later.

In [None]:
# Inspect `RevolvingUtilizationOfUnsecuredLines` with the continuous-feature plotting helper.
plot_distributions_continuous('RevolvingUtilizationOfUnsecuredLines')
# Limit the y-range of the current axes to 0-2
# (presumably the box plot, the last subplot drawn — TODO confirm).
plt.gca().set_ylim(0, 2);

In [None]:
# Inspect `DebtRatio` with the continuous-feature plotting helper.
plot_distributions_continuous('DebtRatio')
# Limit the y-range of the current axes to 0-2
# (presumably the box plot, the last subplot drawn — TODO confirm).
plt.gca().set_ylim(0, 2);

In [None]:
# Summary statistics of DebtRatio in the training set.
train['DebtRatio'].describe()

`DebtRatio` should be between 0 and 1, as it is a ratio. However, we can see with the descriptive statistics that the maximum entry of this feature is 329664. Let's see the amount of entries that have a DebtRatio greater than 1.

In [None]:
# Share of training rows with DebtRatio above 1.
(train['DebtRatio'] > 1).mean()

**23%** of entries in `DebtRatio` have a value greater than 1.

In [None]:
# Share of test rows with DebtRatio above 1.
(test['DebtRatio'] > 1).mean()

Same for the test dataset

Let's see how many monthly incomes are missing for this kind of DebtRatio

In [None]:
# Relate missing MonthlyIncome to DebtRatio > 1 in the training set.
high_ratio_income_missing = train.loc[train['DebtRatio'] > 1, 'MonthlyIncome'].isnull()
income_missing_total = train['MonthlyIncome'].isnull().sum()

print('Training set----------\n')
# Share of the DebtRatio > 1 rows that have no income.
print('Missing Income for debt > 1: ', high_ratio_income_missing.mean())
# Count (not share) of rows with no income.
print('Missing Income in the entire dataset: ', income_missing_total)
# Printed as a fraction between 0 and 1.
print('Percentage of Missing Income that have a debt > 1 : ',
      high_ratio_income_missing.sum() / income_missing_total)

So 93% of rows with missing `MonthlyIncome` have a `DebtRatio` greater than 1.

In [None]:
# DebtRatio summary for training rows with DebtRatio > 1 and missing MonthlyIncome.
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'].isnull())]['DebtRatio'].describe()

In [None]:
# Relate missing MonthlyIncome to DebtRatio > 1 in the test set.
high_ratio_income_missing = test.loc[test['DebtRatio'] > 1, 'MonthlyIncome'].isnull()
income_missing_total = test['MonthlyIncome'].isnull().sum()

print('Test set----------\n')
# Share of the DebtRatio > 1 rows that have no income.
print('Missing Income for debt > 1: ', high_ratio_income_missing.mean())
# Count (not share) of rows with no income.
print('Missing Income in the entire dataset: ', income_missing_total)
# Printed as a fraction between 0 and 1.
print('Percentage of Missing Income that have a debt > 1 : ',
      high_ratio_income_missing.sum() / income_missing_total)

It seems that when a borrower's Income is not known, the DebtRatio cannot be obtained (as it is the raw Debt divided by the Income). The DebtRatio values in these rows therefore seem to be the raw Debt of the borrowers.

What about the DebtRatio of borrowers with missing Income but whose DebtRatio is <1?

In [None]:
# Frequency of each DebtRatio value among training rows that have a missing
# MonthlyIncome and a DebtRatio below 1.
train[(train['MonthlyIncome'].isnull()) & (train['DebtRatio'] < 1)]['DebtRatio'].value_counts()

In [None]:
# Frequency of each DebtRatio value among test rows that have a missing
# MonthlyIncome and a DebtRatio below 1.
test[(test['MonthlyIncome'].isnull()) & (test['DebtRatio'] < 1)]['DebtRatio'].value_counts()

All rows with missing `MonthlyIncome` have incorrect `DebtRatio`, either greater than 1 or equal to 0.

From these last findings, a DebtRatio > 1 might actually be the Debt value. As the income is missing, the DebtRatio cell keeps the Debt value as entry in this case. Rows with a DebtRatio = 0 and a missing Income might have these issues because there were neither a Debt value nor an Income value present.

However, within the rows with DebtRatio > 1, there were some with non-null Income. 

In [None]:
# DebtRatio summary for training rows with DebtRatio > 1 and a known MonthlyIncome.
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'].notnull())]['DebtRatio'].describe()

In [None]:
# MonthlyIncome summary for training rows with DebtRatio > 1 and a known MonthlyIncome.
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'].notnull())]['MonthlyIncome'].describe()

In [None]:
# The DebtRatio / MonthlyIncome pairs of training rows with DebtRatio > 1 and a known MonthlyIncome.
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'].notnull())][['DebtRatio', 'MonthlyIncome']]

There seem to be rows with 0 or 1 as Income, and those also have a DebtRatio that behaves like the raw Debt instead.
Where the monthly Income is "normal", the DebtRatio is very close to 1 (but still an outlier).

The following lines of code show this:

In [None]:
# Heading, then the DebtRatio summary for training rows with DebtRatio > 1 and MonthlyIncome <= 1.
print('Incorrect MonthlyIncome and DebtRatio > 1\n-----------')
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'] <= 1)]['DebtRatio'].describe()

In [None]:
# Heading, then the DebtRatio summary for training rows with DebtRatio > 1 and MonthlyIncome > 1.
print('Correct MonthlyIncome and DebtRatio > 1\n-----------')
train[(train['DebtRatio'] > 1) & (train['MonthlyIncome'] > 1)]['DebtRatio'].describe()

### Observations on `DebtRatio` and `MonthlyIncome` columns:
----
- When the `MonthlyIncome` is missing, `DebtRatio` acquires an abnormal value, **which is either =0 or > 1**

    - Where the DebtRatio is greater than 1, **it might represent the raw Debt value**. As the Income is unknown, the DebtRatio can't be obtained from the Debt and the cell keeps the original Debt value. If we impute those missing Incomes with proper values, we would be able to subsequently obtain proper DebtRatio values.
    - Where the DebtRatio is 0, it might be because **even the raw Debt value was unknown**. We will change those entries and impute them with the median of Debt Values.
    
- Nevertheless, some rows with incorrect `DebtRatio` values have Income information

    - There are cases where the Income is either 0 or 1. In this case, the DebtRatio still behaves like the Debt. We'll put NaN to these Income values instead and join them with the previously discovered missing incomes.
    - When the MonthlyIncome is "normal", the DebtRatio is very close to 1.0. Nothing can be interpreted from this, so we will just change those DebtRatio entries and bring them all to a 0-1 scale.

### Cleaning of `DebtRatio` and `MonthlyIncome`

#### Baseline Mutual Information score of `DebtRatio` with `SeriousDlqin2yrs`

In [None]:
# X is a full copy of train; only its DebtRatio column is scored against the target.
X = train.copy()

# Mutual information of the uncleaned DebtRatio with the target (baseline).
mutual_info_classif(X[['DebtRatio']], train['SeriousDlqin2yrs'], random_state=0)[0]

In [None]:
# Impute DebtRatio that are >1 but have correct Income entries
#
# Rows with DebtRatio > 1 and MonthlyIncome > 1 get the frame's own DebtRatio
# median. The median is taken once per frame on the values as they are before
# the replacement, instead of being recomputed inside a row-wise apply.
for frame in (train, test):
    implausible_ratio = (frame['DebtRatio'] > 1) & (frame['MonthlyIncome'] > 1)
    frame.loc[implausible_ratio, 'DebtRatio'] = frame['DebtRatio'].median()

# Mutual information of DebtRatio with the target after this imputation.
mutual_info_classif(train[['DebtRatio']], train['SeriousDlqin2yrs'], random_state=0)[0]

In [None]:
# Number of training rows whose DebtRatio is exactly 0.
(train['DebtRatio'] == 0).sum()

In [None]:
# Put NaN to zero DebtRatio entries when the Income of the row is missing
#
# Done with a boolean row selection instead of a row-wise apply.
for frame in (train, test):
    no_debt_no_income = (frame['DebtRatio'] == 0) & frame['MonthlyIncome'].isna()
    frame.loc[no_debt_no_income, 'DebtRatio'] = np.nan

In [None]:
# Mutual information of DebtRatio with the target, on the rows where DebtRatio is not NaN.
X = train[['DebtRatio']].dropna().copy()
mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], random_state=0)[0]

In [None]:
# Baseline MI score of the MonthlyIncome  with SeriousDlqin2yrs

X = train.copy()

# Keep only the rows where MonthlyIncome is known.
X = X[['MonthlyIncome']].dropna()

# MonthlyIncome is passed as a discrete feature here.
mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], discrete_features=True, random_state=0)[0]

In [None]:
# Impute NaN to Income with values equal to 0 or 1
#
# Series.where keeps incomes greater than 1 and turns every other entry into
# NaN. It is applied to the column itself, not to the whole frame.
for frame in (train, test):
    frame['MonthlyIncome'] = frame['MonthlyIncome'].where(frame['MonthlyIncome'] > 1)

In [None]:
# Let's see how this has increased the score
# (same computation as the baseline: rows with a known MonthlyIncome, treated as discrete).
X = train[['MonthlyIncome']].dropna().copy()
mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], discrete_features=True, random_state=0)[0]

In [None]:
# Build the Debt column: DebtRatio * MonthlyIncome where the income is known,
# otherwise the DebtRatio value itself. Computed column-wise instead of with a
# row-wise apply.
for frame in (train, test):
    income = frame['MonthlyIncome']
    frame['Debt'] = (frame['DebtRatio'] * income).where(income.notna(), frame['DebtRatio'])

In [None]:
# Display the new Debt column of the training set.
train['Debt']

In [None]:
# Number of training rows where Debt is NaN.
train['Debt'].isnull().sum()

Null entries here come from the `DebtRatio` cells we impute NaN to. We will impute these NaNs in Debt with particular values, then we will impute NaNs in `MonthlyIncome` and finally divide Debt by `MonthlyIncome` to get a new correct `DebtRatio` column.

In [None]:
# Baseline Score
# Mutual information of Debt with the target, on the rows where Debt is not NaN.
X = train[['Debt']].dropna().copy()

mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], random_state=0)[0]

In [None]:
# Mean Debt per (real-estate loans, open credit lines/loans) pair. 'Debt' is
# selected before aggregating so that only that column is averaged.
grouping = (train.groupby(['NumberRealEstateLoansOrLines', 'NumberOfOpenCreditLinesAndLoans'])['Debt']
            .mean()
           )

# Keep the groups with zero real-estate loans: .loc[0] selects the first index
# level by label, leaving {open credit lines/loans count: mean Debt}.
dict_debt = grouping.loc[0].to_dict()

In [None]:
# Fill missing Debt with the mean Debt looked up in dict_debt by the row's
# number of open credit lines/loans. Series.map performs the lookup for the
# whole column at once. A count that is absent from dict_debt leaves the Debt
# as NaN, whereas a direct dict_debt[...] lookup would raise KeyError.
for frame in (train, test):
    fallback_debt = frame['NumberOfOpenCreditLinesAndLoans'].map(dict_debt)
    frame['Debt'] = frame['Debt'].fillna(fallback_debt)

In [None]:
# Mutual information of the imputed Debt column with the target.
mutual_info_classif(train[['Debt']], train['SeriousDlqin2yrs'], random_state=0)[0]

In [None]:
# Mutual information of MonthlyIncome with the target, on the rows where the
# income is known, treated as a discrete feature.
X = train.copy()

X = X[['MonthlyIncome']].dropna()

mutual_info_classif(X, train.loc[X.index, 'SeriousDlqin2yrs'], discrete_features=True, random_state=0)[0]

In [None]:
# Stack train and test so the Debt quantile bins are computed over both sets.
# NOTE(review): no ignore_index is passed, so X keeps each frame's own row labels.
X = pd.concat([train, test], axis=0).copy()

# Cut Debt into up to 40000 quantile bins; duplicates='drop' drops non-unique bin edges.
X['Quantile_Debt'] = pd.qcut(X.Debt, q=40000, duplicates='drop')

X[['Quantile_Debt']]

In [None]:
# Mean MonthlyIncome per Debt quantile bin, as a one-column frame.
# - The column is selected before aggregating, so only it is averaged.
# - observed=False keeps every bin category in the result.
# - .ffill() replaces the deprecated fillna(method='ffill'): a bin with no
#   known income takes the mean of the nearest preceding bin that has one.
grouping = (X.groupby('Quantile_Debt', observed=False)[['MonthlyIncome']]
            .mean()
            .ffill()
           )
grouping

In [None]:
# Turn the per-bin mean income into a plain {Quantile_Debt bin: mean income} dict.
grouping = grouping['MonthlyIncome'].to_dict()

In [None]:
# Attach each row's Debt quantile bin to working copies of train and test.
# X holds the train rows first and the test rows after them, so the bins are
# sliced by position.
n_train = len(train)
n_test = len(test)
train_bins = X['Quantile_Debt'].iloc[:n_train]
test_bins = X['Quantile_Debt'].iloc[n_train:n_train + n_test]

copy_train = train.copy().join(train_bins)
copy_test = test.copy().join(test_bins)


def _income_or_bin_mean(row):
    '''Return the row's MonthlyIncome, or its bin's mean income when missing.'''
    income = row['MonthlyIncome']
    if pd.isna(income):
        return grouping[row['Quantile_Debt']]
    return income


# Impute row by row, then store the incomes as integers.
train['MonthlyIncome'] = copy_train.apply(_income_or_bin_mean, axis=1).astype('int64')
test['MonthlyIncome'] = copy_test.apply(_income_or_bin_mean, axis=1).astype('int64')

In [None]:
# Mutual information of the imputed MonthlyIncome with the target (treated as discrete).
mutual_info_classif(train[['MonthlyIncome']], train['SeriousDlqin2yrs'], discrete_features=True, random_state=0)[0]

In [None]:
# Rebuild DebtRatio as Debt / MonthlyIncome, then set every ratio above 1 to
# 0.99. Both steps are column-wise operations instead of per-row lambdas.
for frame in (train, test):
    ratio = frame['Debt'] / frame['MonthlyIncome']
    frame['DebtRatio'] = ratio.mask(ratio > 1, 0.99)

In [None]:
# Mutual information of the rebuilt DebtRatio with the target.
mutual_info_classif(train[['DebtRatio']], train['SeriousDlqin2yrs'], random_state=0)[0]

### MI Final Scores & Correlation Matrix

In [None]:
def make_mi_scores(X, y):
    '''
    Return the mutual-information score of every column of X with y.

    Integer-dtype columns are passed to mutual_info_classif as discrete
    features, all other columns as continuous ones. The result is a Series
    named "MI Scores", indexed by column name and sorted from the highest
    score to the lowest.
    '''
    features = X.copy()

    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_classif(features, y, discrete_features=is_discrete, random_state=0)

    scores = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return scores.sort_values(ascending=False)

In [None]:
# Score every training column except the target against the target.
mi_scores = make_mi_scores(train.drop('SeriousDlqin2yrs', axis=1), train['SeriousDlqin2yrs'])

In [None]:
# Sort the scores from highest to lowest before plotting.
mi_scores = mi_scores.sort_values(ascending=False)
# Horizontal bar chart: one bar per feature, bar length = MI score.
sns.barplot(x=mi_scores, y=mi_scores.index, orient='h')
plt.suptitle("Mutual Information Scores", x=0.25);

In [None]:
# Heatmap of the pairwise correlations between training columns, annotated with two decimals.
sns.heatmap(train.corr(), annot=True, fmt='.2f')
plt.suptitle('Correlation matrix', x=0.2);

In [None]:
# train.to_csv('Cleaned_train.csv', index=False)
# test.to_csv('Cleaned_test.csv', index=False)