In [None]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import kurtosis
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Directory that holds the train.csv, test.csv and labels.csv files loaded by this notebook.
DATA_DIRECTORY = "../input/home-credit-loan-better-data-processing"

In [None]:
# Read the feature tables and the target labels from DATA_DIRECTORY in one pass.
train, test, labels = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'labels.csv')
)


In [None]:
import re

# Strip every character that is not a letter, digit or underscore from the column names.
_invalid_name_chars = re.compile('[^A-Za-z0-9_]+')
train, test, labels = (
    frame.rename(columns=lambda col: _invalid_name_chars.sub('', col))
    for frame in (train, test, labels)
)

In [None]:
def _nan_to_num_frame(frame):
    """Apply np.nan_to_num to a DataFrame without losing its row/column labels.

    np.nan_to_num returns a plain ndarray, so calling it directly on a
    DataFrame throws away the column names (later cells look columns up by
    name, e.g. 'SK_ID_CURR'). The cleaned values are therefore wrapped back
    into a DataFrame carrying the original index and columns.
    """
    return pd.DataFrame(np.nan_to_num(frame), index=frame.index, columns=frame.columns)


# Replace NaN with 0 and +/-inf with large finite numbers in all three tables.
train = _nan_to_num_frame(train)
test = _nan_to_num_frame(test)
labels = _nan_to_num_frame(labels)

In [None]:
# Make sure all three tables are pandas DataFrames before modelling.
train, test, labels = (pd.DataFrame(table) for table in (train, test, labels))

In [None]:
# Sanity check: print the (rows, columns) shape of the training frame.
print(train.shape)

In [None]:
# Hold out part of the labelled data for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(train, labels, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Baseline model: always predict the most frequent class seen during fitting.
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Class counts in the held-out labels
actual_counts = y_test.value_counts()
print('y actual : \n' + str(actual_counts))

# Class counts in the baseline predictions
predicted_counts = pd.Series(y_pred).value_counts()
print('y predicted : \n' + str(predicted_counts))

# Calculate the evaluation metrics of this model.

In [None]:
# Score the baseline predictions on the held-out split
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix

for metric_label, metric_fn in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
):
    print(metric_label + str(metric_fn(y_test, y_pred)))

# Confusion matrix of the dummy classifier
dummy_confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix : \n' + str(dummy_confusion))

# Now that we have the baseline accuracy, let's build a LightGBM model with default parameters and evaluate the model.


In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix

# Fit a LightGBM classifier with default hyper-parameters and predict the held-out split.
clf = LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation metrics of the LightGBM predictions
for metric_label, metric_fn in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
):
    print(metric_label + str(metric_fn(y_test, y_pred)))

# Confusion matrix of the LightGBM classifier
lgbm_confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix : \n' + str(lgbm_confusion))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Second column of predict_proba, used as the score for the ROC analysis
pre_probs1 = clf.predict_proba(X_test)[:, 1]

# Area under the ROC curve on the held-out split
pre_auc = roc_auc_score(y_test, pre_probs1)
print('LGBM: ROC AUC=%.3f' % (pre_auc))

# Points of the ROC curve (thresholds are not needed for the plot)
pre_fpr, pre_tpr, _ = roc_curve(y_test, pre_probs1)

# Draw the curve with labelled axes and a legend
plt.plot(pre_fpr, pre_tpr, marker='.', label='LGBM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


# Grid Search to maximize Recall

In [None]:
# Grid search over the regularisation strength C of an L2-penalised
# logistic regression, selecting the candidate with the best recall.
from sklearn.model_selection import GridSearchCV
clf = LogisticRegression()
grid_values = {
    'penalty': ['l2'],
    'C': [0.001, .009, 0.01, .09, 1, 5, 10, 25],
}
grid_clf_acc = GridSearchCV(clf, param_grid=grid_values, scoring='recall')
grid_clf_acc.fit(X_train, y_train)

# Predictions of the search object on the held-out split
y_pred_acc = grid_clf_acc.predict(X_test)

# Evaluation metrics of the tuned model
for metric_label, metric_fn in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
):
    print(metric_label + str(metric_fn(y_test, y_pred_acc)))

# Confusion matrix of the tuned logistic regression (shown as the cell output)
confusion_matrix(y_test, y_pred_acc)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# ROC analysis of the grid-searched logistic regression.
# `clf` is only the unfitted template handed to GridSearchCV (the search fits
# clones of it), so calling clf.predict_proba would raise NotFittedError.
# The fitted search object `grid_clf_acc` delegates to its refitted best estimator.
pre_probs1 = grid_clf_acc.predict_proba(X_test)

# keep the probability column of the second class only
pre_probs1 = pre_probs1[:, 1]

pre_auc = roc_auc_score(y_test, pre_probs1)

print('Logistic Regression (Grid Search): ROC AUC=%.3f' % (pre_auc))

pre_fpr, pre_tpr, _ = roc_curve(y_test, pre_probs1)
# plot the roc curve for the model

plt.plot(pre_fpr, pre_tpr, marker='.', label='Logistic Regression (Grid Search)')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()


# ****LGBM****

In [None]:
# `model` and `df_test` are not defined anywhere in this notebook, and `clf`
# has been rebound to an unfitted LogisticRegression by the grid-search cell,
# so a LightGBM model is fitted here on all labelled rows and used to score
# the competition test set.
# NOTE(review): assumes `test` has the same feature columns as `train` - confirm.
model = LGBMClassifier().fit(train, labels)
pred = model.predict_proba(test)

In [None]:
# Build the submission file: one row per test id plus its predicted score.
# .copy() gives an independent frame, so adding a column does not trigger
# pandas' SettingWithCopyWarning on a slice of `test`.
submit = test[['SK_ID_CURR']].copy()
# np.nan_to_num turned the whole table into one numeric array earlier, which
# can leave the ids as floats; write them back out as integers.
submit['SK_ID_CURR'] = submit['SK_ID_CURR'].astype(int)

# predict_proba returns one column per class; a single TARGET column needs
# only the positive-class probability (second column).
pred_values = np.asarray(pred)
submit['TARGET'] = pred_values[:, 1] if pred_values.ndim == 2 else pred_values
submit.to_csv('lgbm_Minimized_code.csv', index = False)