In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, roc_auc_score

In [247]:
# Load the datasets
train_data = pd.read_csv('/content/drive/MyDrive/Dataset/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Dataset/test.csv')

In [248]:
train_data.head()

Unnamed: 0,customer_id,year_of_loan,income_group,annual_income,loan_dsbursement_amount,loan_time_period,loan_application_category,loan_type,category_of_interest,interest_rate,loan_grade,anonymous_X1,emi_amount,living_area,loan_status
0,CSID_543704,2016,Low,47836.7,1200.0,36 months,COLLATERAL LOAN,other,High,15.47,D,15.26,41.95,urban,Good Loan
1,CSID_785937,2019,Low,32656.53,9950.0,60 months,INDIVIDUAL,debt_consolidation,Low,13.3,C,24.36,201.13,urban,Bad Loan
2,CSID_854712,2019,Low,103760.56,20000.0,36 months,COLLATERAL LOAN,debt_consolidation,High,18.26,E,15.88,853.55,semi urban,Good Loan
3,CSID_604825,2017,Low,68338.13,6000.0,60 months,INDIVIDUAL,debt_consolidation,Low,10.26,B,17.93,116.15,township,Good Loan
4,CSID_715030,2017,Medium,166065.52,3500.0,48 months,COLLATERAL LOAN,home_improvement,Low,5.79,A,6.16,79.61,urban,Good Loan


In [249]:
test_data.head()

Unnamed: 0,customer_id,year_of_loan,income_group,annual_income,loan_dsbursement_amount,loan_time_period,loan_application_category,loan_type,category_of_interest,interest_rate,loan_grade,anonymous_X1,emi_amount,living_area
0,CSID_879508,2018,Low,119842.02,35000.0,60 months,JOINT,debt_consolidation,High,22.67,F,9.66,973.64,township
1,CSID_931391,2018,Low,82832.93,20000.0,24 months,JOINT,debt_consolidation,High,15.96,C,6.97,1214.85,rural
2,CSID_1000579,2019,Low,54969.7,10000.0,24 months,INDIVIDUAL,debt_consolidation,Low,7.73,A,23.51,466.66,rural
3,CSID_1002502,2019,Low,41939.71,4500.0,36 months,UNKNOWN,debt_consolidation,Low,10.02,B,11.06,145.02,urban
4,CSID_1010293,2017,Medium,135288.47,28000.0,48 months,COLLATERAL LOAN,debt_consolidation,Low,9.98,B,25.07,676.72,urban


In [250]:
train_data = train_data.drop('customer_id', axis=1)
train_data = train_data.drop('loan_type', axis=1)
#test_data = train_data.drop('customer_id', axis=1)
#test_data = train_data.drop('loan_type', axis=1)

In [251]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24450 entries, 0 to 24449
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year_of_loan               24450 non-null  int64  
 1   income_group               24450 non-null  object 
 2   annual_income              23597 non-null  float64
 3   loan_dsbursement_amount    23594 non-null  float64
 4   loan_time_period           24450 non-null  object 
 5   loan_application_category  24450 non-null  object 
 6   category_of_interest       24450 non-null  object 
 7   interest_rate              24406 non-null  float64
 8   loan_grade                 24402 non-null  object 
 9   anonymous_X1               24450 non-null  float64
 10  emi_amount                 24450 non-null  float64
 11  living_area                24450 non-null  object 
 12  loan_status                24450 non-null  object 
dtypes: float64(5), int64(1), object(7)
memory usag

In [252]:
train_data.isnull().sum()

Unnamed: 0,0
year_of_loan,0
income_group,0
annual_income,853
loan_dsbursement_amount,856
loan_time_period,0
loan_application_category,0
category_of_interest,0
interest_rate,44
loan_grade,48
anonymous_X1,0


In [253]:
train_data.isnull().sum()*100 / len(train_data)

Unnamed: 0,0
year_of_loan,0.0
income_group,0.0
annual_income,3.488753
loan_dsbursement_amount,3.501022
loan_time_period,0.0
loan_application_category,0.0
category_of_interest,0.0
interest_rate,0.179959
loan_grade,0.196319
anonymous_X1,0.0


In [254]:
#Drop rows where missing values are <3%
column =['loan_grade','interest_rate']
train_data.dropna(subset=column, inplace=True)

train_data.isnull().sum()

Unnamed: 0,0
year_of_loan,0
income_group,0
annual_income,853
loan_dsbursement_amount,854
loan_time_period,0
loan_application_category,0
category_of_interest,0
interest_rate,0
loan_grade,0
anonymous_X1,0


In [255]:
#Here I will fill when missing values are <3% with the most frequent values

train_data['annual_income'] = train_data['annual_income'].fillna(train_data['annual_income'].mode()[0])
train_data['loan_dsbursement_amount'] = train_data['loan_dsbursement_amount'].fillna(train_data['loan_dsbursement_amount'].mode()[0])
#train_data['loan_type'] = train_data['loan_type'].fillna(train_data['loan_type'].mode()[0])

train_data.isnull().sum()

Unnamed: 0,0
year_of_loan,0
income_group,0
annual_income,0
loan_dsbursement_amount,0
loan_time_period,0
loan_application_category,0
category_of_interest,0
interest_rate,0
loan_grade,0
anonymous_X1,0


In [256]:
train_data.sample(5)

Unnamed: 0,year_of_loan,income_group,annual_income,loan_dsbursement_amount,loan_time_period,loan_application_category,category_of_interest,interest_rate,loan_grade,anonymous_X1,emi_amount,living_area,loan_status
16074,2017,Low,50477.2,20000.0,24 months,JOINT,High,18.95,E,22.4,1287.15,rural,Good Loan
8410,2017,Medium,137110.62,6000.0,60 months,UNKNOWN,Low,7.51,A,24.84,111.97,semi rural,Good Loan
7277,2019,Medium,132116.9,25000.0,48 months,UNKNOWN,High,13.78,C,8.92,636.2,urban,Good Loan
2527,2017,Low,16808.56,7275.0,48 months,UNKNOWN,Low,5.57,A,8.06,164.56,rural,Good Loan
13982,2017,Low,21793.63,2150.0,36 months,UNKNOWN,Low,10.69,B,18.49,69.75,urban,Good Loan


In [257]:
train_data['loan_grade'].unique()

array(['D', 'C', 'E', 'B', 'A', 'F', 'G'], dtype=object)

In [258]:
train_data['category_of_interest'] = train_data['category_of_interest'].map({'Low': 0, 'High': 1}).astype('int')
train_data['income_group'] = train_data['income_group'].map({'Low': 0, 'High': 1, 'Medium': 2}).astype('int')
train_data['loan_time_period'] = train_data['loan_time_period'].map({'36 months': 0, '60 months': 1, '48 months': 2, '24 months': 3, '120 months': 4, '240 months': 5, '360 months': 6, '300 months': 7}).astype('int')
train_data['loan_application_category'] = train_data['loan_application_category'].map({'COLLATERAL LOAN': 0, 'INDIVIDUAL': 1, 'UNKNOWN': 2, 'JOINT': 3}).astype('int')
train_data['living_area'] = train_data['living_area'].map({'urban': 0, 'semi urban': 1, 'township': 2, 'rural': 3, 'semi rural': 4}).astype('int')
train_data['loan_status'] = train_data['loan_status'].map({'Good Loan': 0, 'Bad Loan': 1}).astype('int')
train_data['loan_grade'] = train_data['loan_grade'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}).astype('int')


In [259]:
train_data.head()

Unnamed: 0,year_of_loan,income_group,annual_income,loan_dsbursement_amount,loan_time_period,loan_application_category,category_of_interest,interest_rate,loan_grade,anonymous_X1,emi_amount,living_area,loan_status
0,2016,0,47836.7,1200.0,0,0,1,15.47,3,15.26,41.95,0,0
1,2019,0,32656.53,9950.0,1,1,0,13.3,2,24.36,201.13,0,1
2,2019,0,103760.56,20000.0,0,0,1,18.26,4,15.88,853.55,1,0
3,2017,0,68338.13,6000.0,1,1,0,10.26,1,17.93,116.15,2,0
4,2017,2,166065.52,3500.0,2,0,0,5.79,0,6.16,79.61,0,0


In [260]:

X = train_data.drop('loan_status', axis=1)
y = train_data['loan_status']

In [261]:
train_data.head()

Unnamed: 0,year_of_loan,income_group,annual_income,loan_dsbursement_amount,loan_time_period,loan_application_category,category_of_interest,interest_rate,loan_grade,anonymous_X1,emi_amount,living_area,loan_status
0,2016,0,47836.7,1200.0,0,0,1,15.47,3,15.26,41.95,0,0
1,2019,0,32656.53,9950.0,1,1,0,13.3,2,24.36,201.13,0,1
2,2019,0,103760.56,20000.0,0,0,1,18.26,4,15.88,853.55,1,0
3,2017,0,68338.13,6000.0,1,1,0,10.26,1,17.93,116.15,2,0
4,2017,2,166065.52,3500.0,2,0,0,5.79,0,6.16,79.61,0,0


In [262]:
cols = ['annual_income','loan_dsbursement_amount', 'interest_rate', 'anonymous_X1', 'emi_amount' ]

In [263]:
st = StandardScaler()
X[cols] = st.fit_transform(X[cols])

In [264]:
model_df = {}
def model_val(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test, y_pred)}")
    precision = precision_score(y_test, y_pred)

    score = cross_val_score(model, X, y, cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model] = round(np.mean(score)*100,2)



In [265]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model, X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression() accuracy is 0.8509852216748769


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression() Avg cross val score is 0.8515477479900986


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [266]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model, X, y)

DecisionTreeClassifier() accuracy is 0.7346059113300493
DecisionTreeClassifier() Avg cross val score is 0.747885613019516


In [267]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model_val(model, X, y)

RandomForestClassifier() accuracy is 0.8495484400656814
RandomForestClassifier() Avg cross val score is 0.8496181532807517


In [268]:
model_val(model, X, y)

RandomForestClassifier() accuracy is 0.8477011494252874
RandomForestClassifier() Avg cross val score is 0.8492487457183513


In [269]:
#Saving the model
X = train_data.drop('loan_status', axis=1)
y = train_data['loan_status']
rf = RandomForestClassifier()
rf.fit(X, y)

In [270]:
import joblib
joblib.dump(rf, 'loan_status_predict')

['loan_status_predict']

In [271]:
model = joblib.load('loan_status_predict')

In [273]:
result = model.predict(test_data(1))

TypeError: 'DataFrame' object is not callable