In [100]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda x: '%.2f' % x
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

#Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#Preprocesing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Algorithms
import xgboost as xgb
import lightgbm as lm
import catboost as cbt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Model Evaluation
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, f1_score, recall_score 

import warnings
warnings.filterwarnings('ignore')

#### Load and inspect data

In [None]:
data = pd.read_excel('CreditScoreData.xlsx')
data.head()

In [None]:
data.drop(['ID', 'Customer_ID', 'Month', 'Name', 'SSN'], inplace=True, axis=1)
data.head()

In [None]:
data.shape

In [None]:
data.describe(include='all')

In [None]:
missing = data.isnull().sum()
count = data.isnull().count()
percentage = missing/count * 100
percentage.sort_values(ascending=True)

In [None]:
df1 = data.drop('Type_of_Loan', axis=1)
df1.info()

In [None]:
df1.nunique()

#### Dealing with outliers

In [None]:
columns = ['Age','Annual_Income','Monthly_Inhand_Salary','Changed_Credit_Limit',
           'Delay_from_due_date','Amount_invested_monthly','Outstanding_Debt',
           'Credit_Utilization_Ratio','Num_of_Delayed_Payment','Credit_History_Age_Months',
           'Total_EMI_per_month','Amount_invested_monthly','Monthly_Balance']

plt.figure(figsize = (18, 40))
index = 1
for index, column in enumerate(columns):
    plt.subplot(7, 2, index+1)
    sns.boxplot(df1[column])

plt.subplots_adjust(hspace = 0.4)
plt.show()

In [None]:
df2 = df1.copy()
df2.shape

In [64]:
df2.shape

(23929, 22)

In [95]:
def outlier_remover(df,columns):
    for column in columns:
        q1 = np.percentile(df[column], 25, interpolation = 'midpoint')
        q3 = np.percentile(df[column], 75, interpolation = 'midpoint')
        IQR = q3 - q1
        upper_key_val = (q3+1.5*IQR)
        lower_key_val = (q1-1.5*IQR)
        df.loc[(df[column] <= upper_key_val)&(df[column] >= lower_key_val), column] = None
        print(df.shape)
#     df.dropna(axis=0)
    return df.isna()
    

In [None]:
outlier_remover(df2,['Annual_Income','Monthly_Inhand_Salary','Changed_Credit_Limit'])

(23929, 22)
(23929, 22)
(23929, 22)


In [85]:
outlier_remover(df2, 'Monthly_Inhand_Salary')

(23428, 22)

In [None]:
'Annual_Income','Monthly_Inhand_Salary','Changed_Credit_Limit',
           'Delay_from_due_date','Amount_invested_monthly','Outstanding_Debt',
           'Credit_Utilization_Ratio','Num_of_Delayed_Payment','Credit_History_Age_Months',
           'Total_EMI_per_month','Amount_invested_monthly','Monthly_Balance'

### print(outlier_remover(df2, 'Age'))