# Importing necessary libraries and packages

In [1]:
import pandas as pd
import numpy as np

### Creating a small sample dataset to demonstrate the data cleaning procedures

In [2]:
# Build a small hand-written loan dataset that contains a few missing values.
# Missing cells are written directly as np.nan instead of the string 'NAN'
# followed by df.replace('NAN', np.NaN):
#   * the np.NaN alias was removed in NumPy 2.0 (np.nan is the supported name);
#   * relying on replace() to turn the resulting object columns back into
#     float columns depends on a downcasting behavior pandas has deprecated.
# With np.nan in place from the start, the numeric columns are float64 on
# every pandas/NumPy version.
df = pd.DataFrame({
    'Default': [1, 0, 0, 0, 0, 0, 1],
    'Income': [250000, 400000, np.nan, 440000, 500000, 700000, 800000],
    'Age': [20, 30, 40, 35, 25, 40, np.nan],
    'Name': ['Allen', 'Sara', 'Lily', 'Rock', 'David', 'Rose', 'Mat'],
    'Gender': ['M', 'F', 'F', 'M', 'M', 'F', 'M'],
    'Type of job': ['Skilled', 'Unskilled', 'Super skilled', 'Super skilled', np.nan, 'Skilled', 'Skilled'],
    'Amt of credit': [np.nan, 30000, 50000, 80000, 40000, 100000, 300000],
    'Years employed': [1, 10, 12, 6, 4, 13, 12]})


In [3]:
# Display the whole frame (it has only 7 rows) so the missing cells are visible.
df

Unnamed: 0,Default,Income,Age,Name,Gender,Type of job,Amt of credit,Years employed
0,1,250000.0,20.0,Allen,M,Skilled,,1
1,0,400000.0,30.0,Sara,F,Unskilled,30000.0,10
2,0,,40.0,Lily,F,Super skilled,50000.0,12
3,0,440000.0,35.0,Rock,M,Super skilled,80000.0,6
4,0,500000.0,25.0,David,M,,40000.0,4
5,0,700000.0,40.0,Rose,F,Skilled,100000.0,13
6,1,800000.0,,Mat,M,Skilled,300000.0,12


### Separating features and target, and importing the train-test split helper

In [4]:
# Import the splitter here; this cell itself only separates features from target.
from sklearn.model_selection import train_test_split

# Target vector: the 'Default' flag.
y = df['Default']
# Feature matrix: every column except the target.
X = df.drop(columns='Default')


### 1.1) Handling missing values

 Let us use the imputer on both numeric and non-numeric columns. 
 Our dataset contains both types of values. We want to replace the missing numeric values with, say, the median, and the missing non-numeric values with the most frequent value.

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Original order of columns, needed to undo the reordering done by the transformer.
cols = X.columns

# Column labels split by dtype: numeric columns are imputed with the median,
# all remaining columns with the most frequent value.
X_numeric = X.select_dtypes(include=['int', 'float']).columns
X_non_numeric = X.select_dtypes(exclude=['int', 'float']).columns
# Column order of the transformer output: numeric block first, then the rest.
new = X_numeric.tolist() + X_non_numeric.tolist()

t = [('num', SimpleImputer(strategy='median'), X_numeric),
     ('cat', SimpleImputer(strategy='most_frequent'), X_non_numeric)]
transformer = ColumnTransformer(transformers=t, remainder='passthrough')

# fit_transform returns a bare NumPy array; because numbers and strings are
# stacked together it is an object array and carries no index or column labels.
X_imputed = transformer.fit_transform(X)

# Rebuild the DataFrame with the original row index and column order, then let
# pandas re-infer dtypes so the numeric columns are numeric again instead of
# staying object-typed.
X = (
    pd.DataFrame(X_imputed, columns=new, index=X.index)
    .reindex(cols, axis=1)
    .infer_objects()
)

### 1.2) Handling non-numeric data

#### Handling categorical data

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Dummy-encode 'Gender' with pandas; drop_first=True discards the first level
# so only the remaining indicator column(s) are kept.
onehotencoder = pd.get_dummies(X['Gender'], drop_first=True)
# Replace the raw 'Gender' column in X with its encoded indicator column(s).
X = X.drop(columns='Gender').join(onehotencoder)

#### Handling ordinal data

In [7]:
# Ordinal encoding: each job label is replaced by its integer rank.
job_rank = {'Unskilled': 1, 'Skilled': 2, 'Super skilled': 3}
X['Type of job'] = X['Type of job'].map(job_rank)


#### Handling string data

In [8]:
# 'Name' is a plain string column; remove it from the feature matrix.
cols_remove = ['Name']
X = X.drop(columns=cols_remove)


### Feature Engineering

In [9]:
# Derived features: plain quotients of two existing columns
# (the values are ratios, they are not multiplied by 100).
X['Years employed percent'] = X['Years employed'].div(X['Age'])
X['Credit Income percent'] = X['Amt of credit'].div(X['Income'])


### Feature selection

In [10]:
# SelectKBest: score every feature against the target using the chi2 score function.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# NOTE(review): the selector is fitted on the full X, y, i.e. before the
# train/test split performed in a later cell - confirm this is intended.
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X, y)

# One-column frames holding the scores and the matching feature names.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Place names and scores side by side and label the two columns.
featureScores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(['Features', 'Score'], axis=1)
print(featureScores.nlargest(5, 'Score'))  # show the 5 highest-scoring features

         Features          Score
3   Amt of credit  225657.894737
0          Income    1485.955056
1             Age       2.699438
4  Years employed       1.077586
5               M       0.900000


### Splitting the data into training and test sets

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=66,
)


### Scaling data

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits so the test rows do not influence the scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)