# extracting data from s3 storage to python using boto3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
data=pd.read_csv("preprocessed_data_model_B.csv")

In [3]:
data.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Applicant_log', 'CoapplicantIncome_log', 'LoanAmount_log',
       'new_income_bin', 'loan_bin', 'CoapplicantIncome_bin', 'EMI'],
      dtype='object')

In [4]:
dataset=data.drop("Unnamed: 0",axis=1)

In [5]:
len(dataset.columns)

19

In [6]:
dataset.isna().sum()

Gender                   0
Married                  0
Dependents               0
Education                0
Self_Employed            0
ApplicantIncome          0
CoapplicantIncome        0
LoanAmount               0
Loan_Amount_Term         0
Credit_History           0
Property_Area            0
Loan_Status              0
Applicant_log            0
CoapplicantIncome_log    0
LoanAmount_log           0
new_income_bin           0
loan_bin                 0
CoapplicantIncome_bin    0
EMI                      0
dtype: int64

# now lets rearrange our columns

In [7]:
loan_status = dataset.pop('Loan_Status')
dataset.insert(len(dataset.columns), 'Loan_Status', loan_status)

In [9]:
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant_log,CoapplicantIncome_log,LoanAmount_log,new_income_bin,loan_bin,CoapplicantIncome_bin,EMI,Loan_Status
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,8.674197,0.000000,4.993232,very high,high,very high,0.406700,yes
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.000000,360.0,1.0,Rural,8.430327,7.319202,4.859812,high,medium,high,0.355556,no
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.000000,360.0,1.0,Urban,8.006701,0.000000,4.204693,medium,low,very high,0.183333,yes
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.000000,360.0,1.0,Urban,7.857094,7.765993,4.795791,low,medium,very high,0.333333,yes
4,Male,No,0,Graduate,No,6000.0,0.0,141.000000,360.0,1.0,Urban,8.699681,0.000000,4.955827,very high,high,very high,0.391667,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900.0,0.0,71.000000,360.0,1.0,Rural,7.972811,0.000000,4.276666,medium,low,very high,0.197222,yes
610,Male,Yes,3,Graduate,No,4106.0,0.0,40.000000,360.0,1.0,Rural,8.320448,0.000000,3.713572,high,low,very high,0.222222,yes
611,Male,Yes,1,Graduate,No,8072.0,240.0,253.000000,360.0,1.0,Urban,8.996280,5.484797,5.537334,very high,very high,medium,0.702778,yes
612,Male,Yes,2,Graduate,No,7583.0,0.0,187.000000,360.0,1.0,Urban,8.933796,0.000000,5.236442,very high,very high,very high,0.519444,yes


# now we r going  to split the data to independent and dependent

In [10]:
independent=dataset.iloc[:,:-1]
#independent


In [11]:
dependent=dataset.iloc[:,-1]
#dependent

# now lets split to train and test set

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train,x_test,y_train,y_test=train_test_split(independent,dependent,test_size=.20,random_state=False)

In [14]:
y_train,y_test

(90     yes
 533     no
 452     no
 355    yes
 266    yes
       ... 
 277    yes
 9       no
 359    yes
 192     no
 559    yes
 Name: Loan_Status, Length: 491, dtype: object,
 454    yes
 52      no
 536    yes
 469     no
 55     yes
       ... 
 337    yes
 376    yes
 278    yes
 466     no
 303    yes
 Name: Loan_Status, Length: 123, dtype: object)

# now we r going to encode y_train and y_test and save it

In [17]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

In [22]:
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant_log,CoapplicantIncome_log,LoanAmount_log,new_income_bin,loan_bin,CoapplicantIncome_bin,EMI,Loan_Status
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,8.674197,0.000000,4.993232,very high,high,very high,0.406700,yes
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.000000,360.0,1.0,Rural,8.430327,7.319202,4.859812,high,medium,high,0.355556,no
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.000000,360.0,1.0,Urban,8.006701,0.000000,4.204693,medium,low,very high,0.183333,yes
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.000000,360.0,1.0,Urban,7.857094,7.765993,4.795791,low,medium,very high,0.333333,yes
4,Male,No,0,Graduate,No,6000.0,0.0,141.000000,360.0,1.0,Urban,8.699681,0.000000,4.955827,very high,high,very high,0.391667,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900.0,0.0,71.000000,360.0,1.0,Rural,7.972811,0.000000,4.276666,medium,low,very high,0.197222,yes
610,Male,Yes,3,Graduate,No,4106.0,0.0,40.000000,360.0,1.0,Rural,8.320448,0.000000,3.713572,high,low,very high,0.222222,yes
611,Male,Yes,1,Graduate,No,8072.0,240.0,253.000000,360.0,1.0,Urban,8.996280,5.484797,5.537334,very high,very high,medium,0.702778,yes
612,Male,Yes,2,Graduate,No,7583.0,0.0,187.000000,360.0,1.0,Urban,8.933796,0.000000,5.236442,very high,very high,very high,0.519444,yes


# now lets create our preprocessing pipeline

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
num_columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI',"Applicant_log","CoapplicantIncome_log","LoanAmount_log"]
nominal_cols=['Gender', 'Married', 'Education', 'Self_Employed', 'Dependents', 'Property_Area']
ordinal_cols = ['new_income_bin', 'loan_bin', 'CoapplicantIncome_bin']
ordinal_categories = [['low', 'medium', 'high', 'very high']] * 3

preprocessor=ColumnTransformer([('num', StandardScaler(), num_columns),
    ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
    ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols)
])

# now lets preprocess x_train,x_test

In [26]:
x_train_trans=preprocessor.fit_transform(x_train)
x_test_trans=preprocessor.transform(x_test)

# lets extract final feature names after preprocessing

In [36]:
nominal_features=preprocessor.named_transformers_["nom"].get_feature_names_out(nominal_cols)
final_features=np.concatenate([nominal_features,num_columns,ordinal_cols])

x_train=pd.DataFrame(x_train_trans,columns=final_features)
x_test=pd.DataFrame(x_test_trans,columns=final_features)

# now lets do feature selection using XGBoost

In [37]:
!pip install xgboost



In [38]:
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE

In [39]:
xgb=XGBClassifier()
selector= RFE(estimator=xgb,n_features_to_select=6)

In [40]:
selector.fit(x_train,y_train)

In [41]:
best_features=x_train.columns[selector.support_]
best_features

Index(['Self_Employed_No', 'Dependents_2', 'ApplicantIncome', 'LoanAmount',
       'Applicant_log', 'loan_bin'],
      dtype='object')

In [42]:
print(f"best features are:  {best_features}")

best features are:  Index(['Self_Employed_No', 'Dependents_2', 'ApplicantIncome', 'LoanAmount',
       'Applicant_log', 'loan_bin'],
      dtype='object')


In [43]:
best_features=list(best_features)

In [44]:
x_train=x_train[best_features]

In [45]:
x_test=x_test[best_features]

In [46]:
# now lets create models and train them

In [47]:
#!pip install xgboost

In [48]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [49]:
models={"Random Forest": RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB()}

In [50]:
accuracy={}
for name, model in models.items():
    model.fit(x_train,y_train)
    y_predict=model.predict(x_test)
    acc=accuracy_score(y_predict,y_test)
    accuracy[name]=acc

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [51]:
accuracy

{'Random Forest': 0.6422764227642277,
 'XGBoost': 0.6666666666666666,
 'Logistic Regression': 0.7235772357723578,
 'KNN': 0.6666666666666666,
 'Naive Bayes': 0.7317073170731707}

# Model selection using pipeline

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [61]:
gnb = GaussianNB()
log=LogisticRegression()

pipe=Pipeline([("clf",gnb)])
param_grid = [{"clf":[gnb]},{"clf":[log],"clf__C":[.1,1,10]}]
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring="roc_auc")
grid_search.fit(x_train, y_train)

# evaluation metrics

In [71]:
from sklearn.metrics import roc_auc_score,classification_report
y_predict_prob=grid_search.predict_proba(x_test)[:,1]
y_pred_label = (y_predict_prob >= 0.5).astype(int)

In [72]:
roc=roc_auc_score(y_test,y_predict_prob)
roc
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
print(classification_report(y_test, y_pred_label ))

{'clf': GaussianNB()}
              precision    recall  f1-score   support

           0       0.50      0.18      0.27        33
           1       0.76      0.93      0.84        90

    accuracy                           0.73       123
   macro avg       0.63      0.56      0.55       123
weighted avg       0.69      0.73      0.68       123



# threshhold tuning since recall of class 0 is low

In [78]:
for thresh in [0.4, 0.45, 0.5, 0.55, 0.6]:
    y_pred_thresh = (y_predict_prob >= thresh).astype(int)
    print(classification_report(y_test, y_pred_thresh))

              precision    recall  f1-score   support

           0       0.50      0.06      0.11        33
           1       0.74      0.98      0.84        90

    accuracy                           0.73       123
   macro avg       0.62      0.52      0.48       123
weighted avg       0.68      0.73      0.65       123

              precision    recall  f1-score   support

           0       0.50      0.09      0.15        33
           1       0.74      0.97      0.84        90

    accuracy                           0.73       123
   macro avg       0.62      0.53      0.50       123
weighted avg       0.68      0.73      0.66       123

              precision    recall  f1-score   support

           0       0.50      0.18      0.27        33
           1       0.76      0.93      0.84        90

    accuracy                           0.73       123
   macro avg       0.63      0.56      0.55       123
weighted avg       0.69      0.73      0.68       123

              preci

In [79]:
import joblib


In [80]:
final_model={"model":best_model,"threshold":0.55}
joblib.dump(final_model,"modelB.pkl")
joblib.dump(preprocessor,"preprocessor.pkl")

['preprocessor.pkl']