In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Location of the Lending Club CSV. Set the LENDING_CLUB_CSV environment variable to
# run the notebook on another machine; the original local path remains the default.
DATA_PATH = os.environ.get(
    'LENDING_CLUB_CSV',
    '/Users/vijay.rajan/Desktop/datasets/BYOC/lending_club.csv',
)

data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Print a summary of the loaded frame (column dtypes and non-null counts)
# to see which columns will need imputation or encoding.
data.info()

In [None]:
# Collect the names of every column that contains at least one missing value

column_has_nan = data.isna().any(axis=0)
have_missing_values = column_has_nan[column_has_nan].index.tolist()
have_missing_values

In [None]:
# Columns with missing values that will be flagged and median-imputed.
# Object-dtype columns are excluded: a median is undefined for them, so any
# categorical column with gaps is handled by the one-hot encoding step only.

features_with_missing_values = data[have_missing_values].select_dtypes(exclude='object')

# Select columns with categorical values so they can be encoded

categorical_features = data.select_dtypes('object')

# Everything else needs no preprocessing and is merged back in later.
# Index.union de-duplicates the labels, so each column is dropped exactly once.

preprocessed_columns = categorical_features.columns.union(
    features_with_missing_values.columns, sort=False
)
no_preprocessing_needed = data.drop(columns=preprocessed_columns)


In [None]:
# Imputing the median for all the missing values and creating a missing-value feature flag
# NOTE(review): the indicator and imputer are fitted on the full dataset, before the
# train/test split performed later in the notebook.

from sklearn.impute import SimpleImputer, MissingIndicator

# Boolean matrix marking where values were missing. The indicator keeps only columns
# that actually contain missing values, which is every column of this frame.
indicator = MissingIndicator(missing_values=np.nan)
mask_missing_values_only = indicator.fit_transform(features_with_missing_values)

# Creating feature flags and naming them with the prefix 'missing_flag_'.
# The source index is reused so the later column-wise concat aligns row for row.

mask_missing_values_only = pd.DataFrame(
    mask_missing_values_only,
    columns=['missing_flag_' + name for name in features_with_missing_values.columns],
    index=features_with_missing_values.index,
)

# Imputing missing values with the median of each column
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(features_with_missing_values)

# Creating a dataframe with the columns where missing values imputation was done

features_with_imputed_values = pd.DataFrame(
    imp.transform(features_with_missing_values),
    columns=list(features_with_missing_values.columns),
    index=features_with_missing_values.index,
)

after_imputation = pd.concat([mask_missing_values_only, features_with_imputed_values], axis='columns')

In [None]:
# One-hot encode every categorical column separately, keeping one dense array per column

from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(handle_unknown='ignore')

encoded_features = []
for name in list(categorical_features.columns):
    # The same encoder object is re-fitted for each column in turn.
    single_column = categorical_features[[name]]
    encoded_features.append(one_hot.fit_transform(single_column).toarray())


In [None]:
# Generating the list of names for the encoded columns and creating a dataframe of encoded features.
# The number of names per feature is taken from the width of its encoded array rather than
# from nunique(): nunique() ignores NaN, so the two can disagree when a categorical column
# has missing values, which would make the column assignment below fail.

encoded_column_names = []
for col_name, encoded in zip(list(categorical_features), encoded_features):
    for i in range(encoded.shape[1]):
        encoded_column_names.append(col_name + '_' + str(i))

# Reuse the source index so the encoded rows line up with the other frames when merged.
encoded_df = pd.concat(
    [pd.DataFrame(element, index=categorical_features.index) for element in encoded_features],
    axis='columns',
)
encoded_df.columns = encoded_column_names

In [None]:
# Assemble the final dataframe: untouched columns, then imputed columns with
# their missing-value flags, then the one-hot encoded columns (joined side by side)

frames_to_merge = [no_preprocessing_needed, after_imputation, encoded_df]
final_df = pd.concat(frames_to_merge, axis=1)

In [None]:
# Creating the train-test split

y = final_df.loc[:, 'is_bad']

# Keep the original positional selection (everything after the first column), then
# also remove the target by name. A positional slice alone only excludes 'is_bad'
# when it happens to be the first column; otherwise the target leaks into X.
# NOTE(review): if the first column is a genuine feature rather than the target or an
# identifier, replace this with final_df.drop(columns='is_bad').
X = final_df.iloc[:, 1:].drop(columns='is_bad', errors='ignore')

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# Scaling all the features except for the target is_bad.
# The scaler is fitted on the training split only and then applied to the test split.

from sklearn.preprocessing import StandardScaler

standard_scalar = StandardScaler()
X_train = standard_scalar.fit_transform(X_train)
X_test = standard_scalar.transform(X_test)

In [None]:
from sklearn import tree

# Fit a decision tree on the scaled training data and predict the held-out set.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features when searching for splits, so an unseeded fit can differ
# from run to run.
clf = tree.DecisionTreeClassifier(random_state=123)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics

# Confusion matrix from the hard class predictions (rows: true 0/1, columns: predicted 0/1)
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

# ROC AUC measures how well a continuous score ranks positives above negatives, so it
# is computed from the predicted probability of the positive class instead of the
# thresholded labels. Column 1 of predict_proba corresponds to clf.classes_[1];
# this assumes the target is coded 0/1, as in the confusion-matrix call above.
y_score = clf.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test, y_score)