# Import Relevant Libraries

In [None]:
import numpy as np
import pandas as pd
import math
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV,cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

In [None]:
# Raw training data; the path is relative to the notebook's working directory.
DATASET_PATH = 'Data/Train_Dataset.csv'
Fulldf = pd.read_csv(DATASET_PATH)

# Data Pre Processing

### Conversions

In [None]:
# Working copy of the raw data without the identifier and external-score columns.
initdf = Fulldf.drop(
    columns=[
        'ID',
        'Score_Source_1',
        'Score_Source_2',
        'Score_Source_3',
        'Credit_Bureau',
    ]
)

# Columns read as object dtype that are converted to numeric later on.
toconvert_type_list = [
    'Client_Income',
    'Credit_Amount',
    'Loan_Annuity',
    'Population_Region_Relative',
    'Age_Days',
    'Employed_Days',
    'Registration_Days',
    'ID_Days',
]

# Columns treated as categorical.
categorical_list = [
    'Accompany_Client',
    'Client_Income_Type',
    'Client_Education',
    'Client_Marital_Status',
    'Client_Gender',
    'Loan_Contract_Type',
    'Client_Housing_Type',
    'Client_Occupation',
    'Client_Permanent_Match_Tag',
    'Client_Contact_Work_Tag',
    'Type_Organization',
]

# Columns treated as numeric (the converted columns above are appended later).
numeric_list = [
    'Bike_Owned',
    'Active_Loan',
    'House_Own',
    'Child_Count',
    'Own_House_Age',
    'Mobile_Tag',
    'Homephone_Tag',
    'Workphone_Working',
    'Client_Family_Members',
    'Cleint_City_Rating',
    'Application_Process_Day',
    'Application_Process_Hour',
    'Social_Circle_Default',
    'Phone_Change',
    'Default',
]


### visualize the target variable

In [None]:
# Pass the column by keyword: a positional vector is deprecated in seaborn 0.11
# and is interpreted as the `data` argument from seaborn 0.12 onwards.
g = sb.countplot(x='Default', data=initdf)
# Pin the tick positions before relabelling so the two labels always line up
# with the two class bars.
g.set_xticks([0, 1])
g.set_xticklabels(['Not Default','Default'])
plt.show()

### Data visualization for categorical columns

In [None]:
# One count plot per categorical column, split by the target, laid out
# row by row on a 6x2 grid.
figure, axes = plt.subplots(6, 2, figsize=(30, 30))
for panel, cat_col in zip(axes.flat, categorical_list):
    sb.countplot(x=cat_col, data=Fulldf, hue='Default', ax=panel)

plt.subplots_adjust(hspace=1)

### convert object type columns to float type

In [None]:
# Columns of initdf (in frame order) that are flagged for numeric conversion.
convertible = [name for name in initdf if name in toconvert_type_list]
for name in convertible:
    # Values that cannot be parsed become NaN instead of raising.
    initdf[name] = pd.to_numeric(initdf[name], errors='coerce')
numeric_list.extend(convertible)

### get a list of categorical type columns

In [None]:
# Categorical columns that are actually present in initdf, in frame order.
categ_dummy_list = [name for name in initdf if name in categorical_list]

### check columns with XNA values

In [None]:
# Count the literal 'XNA' placeholder per column in one vectorised pass and
# keep only the columns that actually contain it.
xna_per_column = initdf.isin(['XNA']).sum()
xna_per_column = xna_per_column[xna_per_column > 0]

# Same summary variables as before: total number of 'XNA' cells, and a 0/1
# flag telling whether any were found.
totcount = int(xna_per_column.sum())
count = totcount
t = 1 if totcount else 0

# Display the affected columns and their 'XNA' counts.
xna_per_column
        


### Drop 'Type_Organization' from initdf and categ_dummy_list

In [None]:
# Remove the column from the frame and from the list of columns to encode.
dropped_column = 'Type_Organization'
initdf = initdf.drop(columns=[dropped_column])
categ_dummy_list.remove(dropped_column)

### Remove rows with XNA values in gender column

In [None]:
# Drop every row whose gender is the 'XNA' placeholder with a single call,
# instead of mutating the frame from inside a loop over that same column.
xna_gender_rows = initdf.index[initdf['Client_Gender'] == 'XNA']
initdf.drop(xna_gender_rows, inplace=True)

### Check unique values of each categorical variables

In [None]:
# Number of distinct non-null values in each categorical column of the
# working frame (shown as the cell output).
category_counts = initdf[categ_dummy_list].nunique()
category_counts

### Reducing number of categories

In [None]:
# Categories to keep per column; every other value (missing values included,
# since they match none of the kept labels) is folded into 'Other'.
kept_categories = {
    'Accompany_Client': ('Alone', 'Relative'),
    'Client_Income_Type': ('Service', 'Commercial', 'Retired', 'Unemployed', 'Student'),
    'Client_Education': ('Secondary', 'Graduation'),
    'Client_Housing_Type': ('Home',),
    'Client_Marital_Status': ('M',),
    'Client_Occupation': ('Laborers', 'Sales', 'Core', 'Managers', 'Drivers', 'High skill tech', 'Medicine'),
}
for column_name, keep in kept_categories.items():
    initdf[column_name] = [
        value if value in keep else 'Other' for value in initdf[column_name]
    ]

In [None]:
# Distinct non-null values per categorical column, read from initdf so the
# counts reflect the category reduction applied to it.
category_counts = initdf[categ_dummy_list].nunique()
category_counts

### Checking Unique values

In [None]:
# Evaluate the distinct values of each column of interest; as with any
# notebook cell, only the final expression is displayed.
inspected_columns = [
    'Loan_Annuity',
    'Phone_Change',
    'Client_Education',
    'Client_Occupation',
    'Client_Gender',
    'Client_Income_Type',
    'Child_Count',
    'Workphone_Working',
    'Application_Process_Hour',
    'Cleint_City_Rating',
    'Homephone_Tag',
]
for inspected in inspected_columns:
    initdf[inspected].unique()
initdf['Car_Owned'].unique()


### Function to encode categorical variables

In [None]:
def encode_df(df, todummy_list):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Args:
        df: Frame-like object supporting ``df[name]`` get and set.
        todummy_list: Names of the columns to encode.

    Returns:
        The same ``df`` object, with each listed column replaced by the
        output of a freshly fitted ``LabelEncoder``.
    """
    for column_name in todummy_list:
        # A separate encoder per column; the fitted encoder is not kept.
        encoder = LabelEncoder()
        df[column_name] = encoder.fit_transform(df[column_name])
    return df

### Creating a list of categorical values in necessary columns before encoding to use when labeling inputs

In [None]:
# Distinct raw category values, in order of first appearance, captured
# before the columns are label-encoded.
education, occupation, income_type = (
    initdf[column_name].unique()
    for column_name in ('Client_Education', 'Client_Occupation', 'Client_Income_Type')
)

### Encoding

In [None]:
# Replace every remaining categorical column with integer labels.
initdf = encode_df(df=initdf, todummy_list=categ_dummy_list)

### Creating a list of categorical values in necessary columns After encoding to use when labeling inputs

In [None]:
# Distinct encoded values of the same three columns, in order of first
# appearance.
education_label, occupation_label, income_type_label = (
    initdf[column_name].unique()
    for column_name in ('Client_Education', 'Client_Occupation', 'Client_Income_Type')
)

In [None]:
# Show the encoded income-type values followed by the raw ones.
for labels in (income_type_label, income_type):
    print(labels)

### Handle missing values
### missing data count

In [None]:
# Null count per column; display the five columns with the most nulls.
missing_per_column = initdf.isnull().sum()
missing_per_column.sort_values(ascending=False).head()

### Remove columns in which more than 30% of the values are null

In [None]:
# Work out which columns exceed the 30% null threshold before touching the
# frame, so columns are never deleted while the frame is being iterated.
null_counts = initdf.isnull().sum()
sparse_columns = null_counts[null_counts / len(initdf.index) > 0.3]
for x, tot in sparse_columns.items():
    print("Droped column'",x,"' and total number of null values:",tot)
# Remove all of them in one in-place operation.
initdf.drop(columns=sparse_columns.index, inplace=True)

### Impute missing values using SimpleImputer from sklearn.impute

In [None]:
# Fill the remaining nulls with each column's mean, then rebuild the frame
# from the returned array under the original column names.
imp = SimpleImputer(strategy='mean')
imputed_values = imp.fit_transform(initdf)
initdf = pd.DataFrame(data=imputed_values, columns=initdf.columns)

### Convert days to years

In [None]:
days = ['Age_Days','Employed_Days','Registration_Days','ID_Days','Phone_Change']
for var in days:
    # Divide by 365 and round up, vectorised over the whole column; the cast
    # keeps the result as whole numbers.
    initdf[var] = np.ceil(initdf[var] / 365).astype('int64')

### Investigate the max and min values of all the variables


In [None]:
# Print the largest and smallest value of every column.
for column, values in initdf.items():
    print('{} Max:{} -- Min:{}'.format(column, values.max(), values.min()))
   

### Remove unwanted rows 


In [None]:
# Population_Region_Relative values must be between 0 and 1.
# Flag the out-of-range rows once, count them, and drop them in one call
# rather than dropping from inside a loop over the same column.
out_of_range = initdf['Population_Region_Relative'] > 1
val = int(out_of_range.sum())
initdf.drop(initdf.index[out_of_range], inplace=True)
# Display how many rows were out of range.
val

In [None]:
# Count the rows whose Employed_Days value is greater than 100.
val = sum(1 for employed in initdf['Employed_Days'] if employed > 100)

In [None]:
# Discard the Employed_Days column.
initdf = initdf.drop('Employed_Days', axis=1)


## Correlations between variables

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = initdf.corr()
plt.figure(figsize=(30, 30))
sb.heatmap(correlation_matrix, annot=True, square=True, cmap='coolwarm')
plt.show()

#### Function to get the variables whose absolute correlation with another variable is greater than a given threshold

In [None]:
def correlation(dataset, threshold):
    """Return the columns that correlate strongly with an earlier column.

    Args:
        dataset: Object whose ``corr()`` returns a square correlation frame.
        threshold: A column is reported when the absolute correlation with
            any column positioned before it is strictly greater than this.

    Returns:
        A set of column names. For each strongly correlated pair only the
        later column (in column order) is included.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the part of the row left of the diagonal: earlier columns.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.5.
correlation_features = correlation(dataset=initdf, threshold=0.5)
correlation_features

### Selecting the best Features

In [None]:
# Separate predictors and target by name rather than by position, so the
# selection does not silently break if the column order ever changes.
X = initdf.drop(columns='Default')  #independent columns
y = initdf['Default']               #target column
# Score every feature against the target with the chi-squared statistic.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Put feature names and scores side by side.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# Keep the 20 highest-scoring features for inspection.
topfeatures=featureScores.nlargest(20,'Score')

### Get final data set with the 16 selected features plus the class variable (17 columns)

In [None]:
# The 16 selected features followed by the target column.
selected_columns = [
    'Credit_Amount',
    'Client_Income',
    'Loan_Annuity',
    'Age_Days',
    'Phone_Change',
    'Registration_Days',
    'Client_Education',
    'Client_Occupation',
    'Client_Gender',
    'Client_Income_Type',
    'Child_Count',
    'Workphone_Working',
    'Application_Process_Hour',
    'Cleint_City_Rating',
    'Homephone_Tag',
    'Car_Owned',
    'Default',
]
Finaldf = initdf[selected_columns]
Finaldf.head()

# Logistic Regression

### Create the data

In [None]:

target_column = 'Default'
y = Finaldf[target_column].values  # Target vector
X = Finaldf.drop(target_column, axis=1).values  # Input features (attributes)
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

### Using Undersampling to balance the class distribution

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample X and y with the default random under-sampler.
# NOTE(review): no random_state is set, so the sample differs between runs.
under = RandomUnderSampler()
X, y = under.fit_resample(X=X, y=y)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train =scaler.fit_transform(x_train)
x_test =scaler.transform(x_test) 

### Split the dataset into training set & testing set

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.3)

### Instantiate and fit model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration limit set to 200, fitted on the
# training split.
model = LogisticRegression(max_iter=200)
model.fit(X=x_train, y=y_train)

In [None]:
# Tick labels used by the confusion-matrix plot.
classes = ['Not Default', 'Default']
# Predicted labels for the held-out test split.
prediction_test = model.predict(x_test)
def plot_confusionmatrix(pred,test,dom):
    """Print a title and draw the confusion matrix as an annotated heatmap.

    Args:
        pred: Forwarded as the first argument of ``confusion_matrix``.
        test: Forwarded as the second argument of ``confusion_matrix``.
        dom: Text placed in front of "Confusion matrix" in the printed title.

    Both axes are labelled with the module-level ``classes`` list.
    NOTE(review): scikit-learn's ``confusion_matrix`` takes the true labels
    first, so despite its name ``pred`` should receive the true labels -
    confirm each call site.
    """
    print(f'{dom} Confusion matrix')
    matrix = confusion_matrix(pred, test)
    heatmap_options = dict(
        annot=True,
        yticklabels=classes,
        xticklabels=classes,
        cmap='Blues',
        fmt='g',
    )
    sb.heatmap(matrix, **heatmap_options)
    plt.tight_layout()
    plt.show()

## Confusion Matrix

In [None]:
# Mean accuracy on the training and test splits.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print("Training Accuracy is: ", train_accuracy)
print("Testing Accuracy is: ", test_accuracy)

# True labels are passed first, predictions second.
plot_confusionmatrix(y_test, prediction_test, dom='Test')


In [None]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, prediction_test)
print(report)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The notebook standardises the features before fitting the model, so the
# cross-validation must apply the same preprocessing. Wrapping scaler and
# classifier in a pipeline re-fits the scaler on the training part of every
# fold instead of scoring the classifier on raw, unscaled X.
cross_val_score(make_pipeline(StandardScaler(), model), X, y, cv=20)