In [None]:
import numpy as np
import pandas as pd

In [None]:
# Path to the raw loan CSV. '/content' is the Colab working directory, so
# adjust this constant when running the notebook somewhere else.
DATA_PATH = '/content/loan_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first ten rows to get a feel for the columns and their values.
df.head(n=10)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
5,21.0,female,High School,12951.0,0,OWN,2500.0,VENTURE,7.14,0.19,2.0,532,No,1
6,26.0,female,Bachelor,93471.0,1,RENT,35000.0,EDUCATION,12.42,0.37,3.0,701,No,1
7,24.0,female,High School,95550.0,5,RENT,35000.0,MEDICAL,11.11,0.37,4.0,585,No,1
8,24.0,female,Associate,100684.0,3,RENT,35000.0,PERSONAL,8.9,0.35,2.0,544,No,1
9,21.0,female,High School,12739.0,0,OWN,1600.0,VENTURE,14.74,0.13,3.0,640,No,1


## Drop the unused columns

In [None]:
# Columns that are not required for the following steps.
COLUMNS_TO_DROP = [
    'person_emp_exp',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'previous_loan_defaults_on_file',
]

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

# Fixed seed so the displayed sample is the same on every run.
df.sample(5, random_state=0)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_home_ownership,loan_amnt,loan_intent,credit_score,loan_status
39151,28.0,female,Bachelor,168633.0,MORTGAGE,10000.0,EDUCATION,603,0
25430,27.0,female,Associate,103126.0,MORTGAGE,12000.0,EDUCATION,629,0
39175,27.0,male,High School,100027.0,MORTGAGE,5000.0,PERSONAL,657,0
15280,25.0,male,Bachelor,155347.0,MORTGAGE,28000.0,PERSONAL,664,0
21331,34.0,female,Associate,51254.0,RENT,5500.0,DEBTCONSOLIDATION,692,0


## Check the values

In [None]:
# Number of rows in each gender category.
gender_counts = df['person_gender'].value_counts()
gender_counts

Unnamed: 0_level_0,count
person_gender,Unnamed: 1_level_1
male,24841
female,20159


In [None]:
# Number of rows in each home-ownership category.
home_ownership_counts = df['person_home_ownership'].value_counts()
home_ownership_counts

Unnamed: 0_level_0,count
person_home_ownership,Unnamed: 1_level_1
RENT,23443
MORTGAGE,18489
OWN,2951
OTHER,117


In [None]:
# Number of rows for each stated loan purpose.
loan_intent_counts = df['loan_intent'].value_counts()
loan_intent_counts

Unnamed: 0_level_0,count
loan_intent,Unnamed: 1_level_1
EDUCATION,9153
MEDICAL,8548
VENTURE,7819
PERSONAL,7552
DEBTCONSOLIDATION,7145
HOMEIMPROVEMENT,4783


In [None]:
# Number of rows at each education level (lists every distinct value).
education_counts = df['person_education'].value_counts()
education_counts

Unnamed: 0_level_0,count
person_education,Unnamed: 1_level_1
Bachelor,13399
Associate,12028
High School,11972
Master,6980
Doctorate,621


## Split the dataset into features (X) and target (Y)

In [None]:
# Features: every column except the target. Selecting by name rather than by
# position keeps this correct even if the column order changes.
X = df.drop(columns='loan_status')

# Show only a preview instead of dumping the whole frame.
X.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_home_ownership,loan_amnt,loan_intent,credit_score
0,22.0,female,Master,71948.0,RENT,35000.0,PERSONAL,561
1,21.0,female,High School,12282.0,OWN,1000.0,EDUCATION,504
2,25.0,female,High School,12438.0,MORTGAGE,5500.0,MEDICAL,635
3,23.0,female,Bachelor,79753.0,RENT,35000.0,MEDICAL,675
4,24.0,male,Master,66135.0,RENT,35000.0,MEDICAL,586
...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,RENT,15000.0,MEDICAL,645
44996,37.0,female,Associate,65800.0,RENT,9000.0,HOMEIMPROVEMENT,621
44997,33.0,male,Associate,56942.0,RENT,2771.0,DEBTCONSOLIDATION,668
44998,29.0,male,Bachelor,33164.0,RENT,12000.0,EDUCATION,604


In [None]:
# Target: the loan_status label, selected by name so it does not depend on
# being the last column of the frame.
Y = df['loan_status']

# Show only a preview instead of dumping the whole series.
Y.head()

Unnamed: 0,loan_status
0,1
1,0
2,1
3,1
4,1
...,...
44995,1
44996,1
44997,1
44998,1


## Split the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Shapes of each split: features then target, for train and then test.
# Each line is labelled so the output cannot be misread.
print(f"X_train: {X_train.shape}")
print(f"Y_train: {Y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"Y_test: {Y_test.shape}")

(33750, 8)
(33750,)
(11250, 8)
(11250,)


## Use ColumnTransformer to convert categorical data into numerical data

In [None]:
from sklearn.compose import ColumnTransformer
from  sklearn.preprocessing import OrdinalEncoder
from  sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


In [None]:
# Categorical columns that are one-hot encoded.
nominal_columns = ['person_education', 'loan_intent', 'person_home_ownership']

# Gender has two levels, so one integer column is enough (male -> 0, female -> 1).
gender_encoder = OrdinalEncoder(categories=[['male', 'female']])

# One indicator column per category, minus the first level of each feature.
# NOTE(review): with drop='first' and handle_unknown='ignore' together, an unseen
# category is encoded as all zeros, the same as the dropped level - confirm this
# is the intended behaviour.
nominal_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

transformer = ColumnTransformer(
    transformers=[
        ('OrdinalCoding', gender_encoder, ['person_gender']),
        ('Onehotcoding', nominal_encoder, nominal_columns),
    ],
    remainder='passthrough',  # every other column is passed through unchanged
)


In [None]:
# Show the ColumnTransformer's configuration as the cell output.
transformer

## Use a Pipeline to combine the preprocessing with a LogisticRegression model and train it

In [None]:

# Chain preprocessing, scaling, and the classifier so exactly the same
# transformations are applied at fit time and at predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', transformer),  # encode categoricals, pass the rest through
    # Put all features on a comparable scale. Raw columns such as income are
    # orders of magnitude larger than the encoded indicators, which stops the
    # solver from converging within its iteration limit.
    ('scaler', StandardScaler()),
    # Extra headroom over the default iteration limit (100).
    ('model', LogisticRegression(max_iter=1000)),
])


In [None]:
# Train the whole pipeline (preprocessing + classifier) on the training split.
pipeline.fit(X=X_train, y=Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Predict with the trained model

In [None]:
# Predicted class labels for the held-out test rows.
predictions = pipeline.predict(X=X_test)
print(predictions)


[1 0 0 ... 0 0 0]


## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Predicted labels for both splits.
Y_train_pred = pipeline.predict(X_train)
Y_test_pred = pipeline.predict(X_test)

# Fraction of correctly classified rows on each split; a large gap between
# the two would point to overfitting.
train_accuracy = accuracy_score(y_true=Y_train, y_pred=Y_train_pred)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")


Training Accuracy: 0.83
Testing Accuracy: 0.83


## Save the model by using pickle

In [None]:
import pickle

# Serialize the fitted pipeline (preprocessing + classifier) to disk so it can
# be reloaded later without retraining.
filename = "model.pkl"
with open(filename, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

print(f"{filename} saved successfully.")


model.pkl saved successfully.


## Download the model to the local machine

In [None]:
# google.colab only exists inside a Colab runtime. Guard the import so the
# notebook still runs end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Trigger a browser download of the saved model file.
    files.download("model.pkl")
else:
    print("Not running in Colab; model.pkl was left in the working directory.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>