In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder holding both CSV files; kept in one place so the location is easy to change.
DATA_DIR = "C:\\Users\\user\\Downloads\\pavani"
df_train = pd.read_csv(DATA_DIR + "\\bankloan_train.csv")
df_test = pd.read_csv(DATA_DIR + "\\credit_test.csv")

In [None]:
# Preview the first five rows of the training data.
df_train.head(n=5)

In [None]:
# Convert the 'Years in current job' labels to numbers. The results are not all
# integers: '< 1 year' becomes 0.5 and '10+ years' becomes 15. Labels absent
# from the mapping (and missing values) come out of .map() as NaN.
years_in_job_to_number = {
    '< 1 year': 0.5,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 15,
}

# The same mapping is applied to the training and the test frame.
for frame in (df_train, df_test):
    frame['Years in current job'] = frame['Years in current job'].map(years_in_job_to_number)

# Let's check the null values of the DATASET

In [None]:
# Per-column count of missing values in the training frame, together with the
# same count expressed as a percentage of the number of rows.
null_counts = df_train.isnull().sum()
temp_df = pd.DataFrame({
    'Column Name': null_counts.index,
    'Number of null values': null_counts.to_numpy(),
    'Null values in percentage': (null_counts.to_numpy() / len(df_train)) * 100,
})
print(f"The length of dataset is \t {len(df_train)}")
temp_df

In [None]:
# Remove the two ID columns from both frames before any further processing.
id_columns = ['Loan ID', 'Customer ID']
df_test = df_test.drop(columns=id_columns)
df_train = df_train.drop(columns=id_columns)

# Let us deal with missing values in categorical data.

### Term

In [None]:
# Bar chart of how often each 'Term' value occurs in the training data.
fig, ax = plt.subplots()
sns.countplot(x='Term', data=df_train, ax=ax)
plt.show()

In [None]:
# Replace missing 'Term' values with 'Short Term' in both frames
# (presumably the most frequent category in the count plot — confirm).
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Term'] is chained assignment, which leaves the frame unchanged when
# pandas Copy-on-Write is active and triggers a FutureWarning in pandas 2.2.
df_train['Term'] = df_train['Term'].fillna('Short Term')
df_test['Term'] = df_test['Term'].fillna('Short Term')

### Home Ownership

In [None]:
# Bar chart of how often each 'Home Ownership' value occurs in the training data.
fig, ax = plt.subplots()
sns.countplot(x='Home Ownership', data=df_train, ax=ax)
plt.show()

In [None]:
# Distinct 'Home Ownership' values in the training data (NaN is listed too if present).
pd.unique(df_train['Home Ownership'])

In [None]:
# Replace missing 'Home Ownership' values with 'Home Mortgage' in both frames
# (presumably the most frequent category in the count plot — confirm).
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Home Ownership'] is chained assignment, which leaves the frame unchanged
# when pandas Copy-on-Write is active and triggers a FutureWarning in pandas 2.2.
df_train['Home Ownership'] = df_train['Home Ownership'].fillna('Home Mortgage')
df_test['Home Ownership'] = df_test['Home Ownership'].fillna('Home Mortgage')

### Purpose

In [None]:
# Bar chart of how often each 'Purpose' value occurs in the training data.
fig, ax = plt.subplots()
sns.countplot(x='Purpose', data=df_train, ax=ax)
# Turn the x tick labels vertical.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Replace missing 'Purpose' values with 'Debt Consolidation' in both frames
# (presumably the most frequent category in the count plot — confirm).
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Purpose'] is chained assignment, which leaves the frame unchanged when
# pandas Copy-on-Write is active and triggers a FutureWarning in pandas 2.2.
df_train['Purpose'] = df_train['Purpose'].fillna('Debt Consolidation')
df_test['Purpose'] = df_test['Purpose'].fillna('Debt Consolidation')

### Loan Status

In [None]:
# Bar chart of how often each 'Loan Status' value occurs in the training data.
fig, ax = plt.subplots()
sns.countplot(x='Loan Status', data=df_train, ax=ax)
plt.show()

In [None]:
# Replace missing 'Loan Status' values with 'Fully Paid'. The result is
# assigned back to the column: calling fillna(inplace=True) on
# df_train['Loan Status'] is chained assignment, which leaves the frame
# unchanged when pandas Copy-on-Write is active and triggers a FutureWarning
# in pandas 2.2.
# NOTE(review): 'Loan Status' is the target used for training further down, so
# this fabricates labels for rows that have none — consider dropping those
# rows instead.
df_train['Loan Status'] = df_train['Loan Status'].fillna('Fully Paid')

# PREDICTION WITH ML MODELS 

In [None]:
# scikit-learn estimators and preprocessing utilities used for the modelling cells.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Encode categorical values as numbers.
from sklearn.model_selection import train_test_split  # Split a dataset into training and testing parts.
from sklearn.model_selection import cross_val_score   # Cross-validated scoring of a model.
from sklearn.impute import SimpleImputer   # Fill in missing values.
from sklearn.preprocessing import StandardScaler   # Standardize (scale) the features.

In [None]:
# Split the training frame into a feature matrix and a target vector, and take
# the test features, all as NumPy arrays.
target_column = 'Loan Status'
y_train = df_train[target_column].to_numpy()
x_train = df_train.drop(columns=target_column).to_numpy()
x_test = df_test.to_numpy()

### Let us deal with Missing data

In [None]:
# Positional indices of the columns that get numeric imputation
# (presumably every non-categorical feature — confirm against df_train.columns).
numeric_columns = [0, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15]

# SimpleImputer with default settings (mean strategy). The fill values are
# learned from the training data only and then applied unchanged to the test
# data, so both sets are filled with the same statistics and nothing about the
# test set leaks into preprocessing.
imputer = SimpleImputer()
x_train[:, numeric_columns] = imputer.fit_transform(x_train[:, numeric_columns])
x_test[:, numeric_columns] = imputer.transform(x_test[:, numeric_columns])

### Let us deal with the categorical values in the training and test dataset.

In [None]:
# Positional indices of the categorical feature columns
# (presumably Term, Home Ownership and Purpose — confirm against df_train.columns).
categorical_columns = [1, 5, 6]

# One LabelEncoder per column, fitted on the training data and reused for the
# test data, so a given category always receives the same integer code in both
# arrays. Fitting a fresh encoder on the test column would shift the codes
# whenever the test set lacks (or adds) a category.
labelencoders_x = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    x_train[:, column] = encoder.fit_transform(x_train[:, column])
    # transform() raises ValueError for a category never seen during fit,
    # which is preferable to silently assigning it a wrong code.
    x_test[:, column] = encoder.transform(x_test[:, column])
    labelencoders_x[column] = encoder

In [None]:
# Encode the target labels as integers; the fitted encoder is kept so the
# label-to-code mapping stays available.
labelencoder_y = LabelEncoder().fit(y_train)
y_train = labelencoder_y.transform(y_train)

### Let us apply scaling on the dataset

In [None]:
# Standardize the features. The scaling statistics are learned from the
# training data only; the test data is transformed with those same statistics
# so both sets live on the same scale.
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)

In [None]:
from sklearn.decomposition import PCA

# PCA keeping all components (n_components=None), used to inspect how much
# variance each component explains. The components are fitted on the training
# data and the test data is projected onto those same components; re-fitting on
# the test set would place it in a different coordinate system.
pca = PCA(n_components=None)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# Explained-variance ratios of the components fitted on the training data.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Project onto 16 principal components. The components are fitted on the
# training data and the test data is projected onto those same components, so
# the models see test features in the basis they were trained on.
# NOTE(review): if the data has only 16 feature columns this keeps every
# component — confirm whether a real reduction was intended.
pca = PCA(n_components=16)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

### APPLY LOGISTIC REGRESSION AND RANDOM FOREST MODEL AND CHECK ACCURACY FOR EACH MODEL

#### Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the prepared training data.
classifier_logi = LogisticRegression()
classifier_logi.fit(X=x_train, y=y_train)

In [None]:
# Score of the logistic regression on the training data itself.
classifier_logi.score(X=x_train, y=y_train)

In [None]:
# 10-fold cross-validation scores for the logistic regression; report their
# mean and standard deviation.
accuracy = cross_val_score(classifier_logi, x_train, y_train, cv=10)
mean_score, score_spread = accuracy.mean(), accuracy.std()
print(f"The accuracy of the Logistic Regressor Model is \t {mean_score}")
print(f"The deviation in the accuracy is \t {score_spread}")

#### Random Forest Model

In [None]:
# Random forest with default hyper-parameters. random_state is fixed so the
# randomised tree construction, and therefore the scores and predictions
# derived from this estimator, are the same on every run.
classifier_ran = RandomForestClassifier(random_state=42)
classifier_ran.fit(x_train, y_train)

In [None]:
# Score of the random forest on the training data itself.
classifier_ran.score(X=x_train, y=y_train)

In [None]:
# 10-fold cross-validation scores for the random forest; report their mean and
# standard deviation.
accuracy = cross_val_score(classifier_ran, x_train, y_train, cv=10)
mean_score, score_spread = accuracy.mean(), accuracy.std()
print(f"The accuracy of the Random Forest Model is \t {mean_score}")
print(f"The deviation in the accuracy is \t {score_spread}")

In [None]:
# Show the distinct encoded target values and the first ten entries, followed
# by a reminder of what the codes stand for.
encoded_classes = np.unique(y_train)
first_targets = y_train[:10]
print(encoded_classes)
print(first_targets)
print("Here 1 indicates 'Fully Paid'. And 0 indicates 'Charged Off' ")

In [None]:
# Logistic-regression class predictions for the test features.
y_pred = classifier_logi.predict(X=x_test)

In [None]:
# Random-forest class predictions for the test features.
y_pred1 = classifier_ran.predict(X=x_test)

In [None]:
# Display the logistic-regression predictions.
y_pred

In [None]:
# Display the random-forest predictions.
y_pred1

In [None]:
# Turn the logistic-regression predictions into text labels: a value of 1
# becomes 'Fully Paid', anything else becomes 'Charged Off'.
y_pred = np.array(['Fully Paid' if code == 1 else 'Charged Off' for code in y_pred])
y_pred[:5]