# extracting data from s3 storage to python using boto3

In [2]:
import boto3

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
s3=boto3.client("s3")

In [5]:
bucket_name="loanpredictiondataset"
file_key="preprocessed_data.csv2"

In [6]:
obj=s3.get_object(Bucket=bucket_name,Key=file_key)

In [7]:
data=pd.read_csv(obj["Body"])

In [8]:
data.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Applicant_log',
       'CoapplicantIncome_log', 'LoanAmount_log', 'new_income', 'loan',
       'Dependents_1', 'Dependents_2', 'Dependents_3',
       'Property_Area_Semiurban', 'Property_Area_Urban'],
      dtype='object')

In [9]:
dataset=data.drop("Unnamed: 0",axis=1)

In [10]:
len(dataset.columns)

20

In [11]:
dataset.isna().sum()

Gender                     0
Married                    0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Applicant_log              0
CoapplicantIncome_log      0
LoanAmount_log             0
new_income                 0
loan                       0
Dependents_1               0
Dependents_2               0
Dependents_3               0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64

# now lets rearrange our columns

In [12]:
loan_status = dataset.pop('Loan_Status')
dataset.insert(len(dataset.columns), 'Loan_Status', loan_status)

In [13]:
#dataset

# now we r going  to split the data to independent and dependent

In [14]:
independent=dataset.iloc[:,:-1]
#independent


In [15]:
dependent=dataset.iloc[:,-1]
#dependent

# now lets split to train and test set

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train,x_test,y_train,y_test=train_test_split(independent,dependent,test_size=.20,random_state=False)

In [18]:
y_train,y_test

(90     1
 533    0
 452    0
 355    1
 266    1
       ..
 277    1
 9      0
 359    1
 192    0
 559    1
 Name: Loan_Status, Length: 491, dtype: int64,
 454    1
 52     0
 536    1
 469    0
 55     1
       ..
 337    1
 376    1
 278    1
 466    0
 303    1
 Name: Loan_Status, Length: 123, dtype: int64)

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled= scaler.fit_transform(x_train)
x_test_scaled= scaler.transform(x_test)

In [20]:
x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_test = pd.DataFrame(x_test_scaled, columns=x_test.columns)

# now lets do feature selection using RFE

In [21]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [22]:
rf=RandomForestClassifier()
selector= RFE(estimator=rf,n_features_to_select=10)

In [23]:
selector.fit(x_train,y_train)

In [24]:
best_features=x_train.columns[selector.support_]
best_features

Index(['Married', 'Education', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term', 'Applicant_log',
       'CoapplicantIncome_log', 'LoanAmount_log', 'Property_Area_Semiurban'],
      dtype='object')

In [25]:
print(f"best features are:  {best_features}")

best features are:  Index(['Married', 'Education', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term', 'Applicant_log',
       'CoapplicantIncome_log', 'LoanAmount_log', 'Property_Area_Semiurban'],
      dtype='object')


In [26]:
best_features=list(best_features)

In [27]:
x_train=x_train[best_features]

In [28]:
x_test=x_test[best_features]

In [29]:
# now lets create models and train them

In [32]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [33]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [34]:
models={"Random Forest": RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB()}

In [35]:
accuracy={}
for name, model in models.items():
    model.fit(x_train,y_train)
    y_predict=model.predict(x_test)
    acc=accuracy_score(y_predict,y_test)
    accuracy[name]=acc

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [36]:
accuracy

{'Random Forest': 0.6991869918699187,
 'XGBoost': 0.6585365853658537,
 'Logistic Regression': 0.7317073170731707,
 'KNN': 0.6666666666666666,
 'Naive Bayes': 0.7235772357723578}

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
log_reg = LogisticRegression()
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],    
    'solver': ['liblinear', 'lbfgs'],  
    'penalty': ['l2']}
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

In [39]:
print(grid_search.best_params_)

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


In [40]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
final_acc = accuracy_score(y_test, y_pred)
final_acc

0.7317073170731707

In [41]:
import joblib
import os

In [47]:
os.makedirs("models",exist_ok=True)
joblib.dump(best_model,"modelA.pkl")

['modelA.pkl']