# Goal
A simple project that illustrates logistic regression by predicting which customers are likely to be approved for a loan.

### Import Libraries

In [1]:
# Data-handling (pandas, numpy) and plotting (matplotlib, seaborn) libraries.
import pandas as pd
import numpy as np
# NOTE(review): matplotlib.pylab is a legacy interface; matplotlib.pyplot is the
# module conventionally aliased as `plt` — consider switching.
import matplotlib.pylab as plt
import seaborn as sns
# Apply matplotlib's built-in 'ggplot' style sheet to figures drawn afterwards.
plt.style.use('ggplot')

## Importing the Dataset

In [2]:
# Read the loan applications from the local CSV file into a DataFrame,
# then preview the first five records.
df = pd.read_csv(filepath_or_buffer="loan.csv")
df.head(n=5)

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000,720,Approved
1,45,Female,Teacher,Master's,Single,62000,680,Approved
2,28,Male,Student,High School,Single,25000,590,Denied
3,51,Female,Manager,Bachelor's,Married,105000,780,Approved
4,36,Male,Accountant,Bachelor's,Married,75000,710,Approved


## EDA

In [4]:
# Summarise the frame's structure: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              61 non-null     int64 
 1   gender           61 non-null     object
 2   occupation       61 non-null     object
 3   education_level  61 non-null     object
 4   marital_status   61 non-null     object
 5   income           61 non-null     int64 
 6   credit_score     61 non-null     int64 
 7   loan_status      61 non-null     object
dtypes: int64(3), object(5)
memory usage: 3.9+ KB


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,income,credit_score
count,61.0,61.0,61.0
mean,37.081967,78983.606557,709.836066
std,8.424755,33772.025802,72.674888
min,24.0,25000.0,560.0
25%,30.0,52000.0,650.0
50%,36.0,78000.0,720.0
75%,43.0,98000.0,770.0
max,55.0,180000.0,830.0


### Checking for missing values

In [6]:
# Count the missing entries in every column; a column of zeros means there is
# nothing to impute or drop.
df.isna().sum()

age                0
gender             0
occupation         0
education_level    0
marital_status     0
income             0
credit_score       0
loan_status        0
dtype: int64

### Encoding categorical data

In [7]:
# One-hot encode the text (object) columns into boolean indicator columns.
# drop_first=True omits the first category of each column, so a row with all
# indicators False for that column belongs to the omitted category.
df_encoded = pd.get_dummies(data=df, drop_first=True)
df_encoded

Unnamed: 0,age,income,credit_score,gender_Male,occupation_Analyst,occupation_Architect,occupation_Artist,occupation_Banker,occupation_Chef,occupation_Consultant,...,occupation_Stylist,occupation_Teacher,occupation_Veterinarian,occupation_Writer,education_level_Bachelor's,education_level_Doctoral,education_level_High School,education_level_Master's,marital_status_Single,loan_status_Denied
0,32,85000,720,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,45,62000,680,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
2,28,25000,590,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,True
3,51,105000,780,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,36,75000,710,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,39,100000,770,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
57,25,32000,570,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,True
58,43,95000,760,True,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
59,30,55000,650,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False


In [8]:
# Inspect the encoded frame: the categorical columns now appear as boolean indicator columns.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 47 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   age                          61 non-null     int64
 1   income                       61 non-null     int64
 2   credit_score                 61 non-null     int64
 3   gender_Male                  61 non-null     bool 
 4   occupation_Analyst           61 non-null     bool 
 5   occupation_Architect         61 non-null     bool 
 6   occupation_Artist            61 non-null     bool 
 7   occupation_Banker            61 non-null     bool 
 8   occupation_Chef              61 non-null     bool 
 9   occupation_Consultant        61 non-null     bool 
 10  occupation_Dentist           61 non-null     bool 
 11  occupation_Designer          61 non-null     bool 
 12  occupation_Doctor            61 non-null     bool 
 13  occupation_Editor            61 non-null     bool 
 

In [10]:
# Target: True when the loan was denied, False when it was approved.
y = df_encoded["loan_status_Denied"]
# Features: every remaining column of the encoded frame.
X = df_encoded.drop(columns="loan_status_Denied")

## Splitting into train and test sets

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the Model

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier created with scikit-learn's default settings
# (no hyper-parameters are overridden here).
lr = LogisticRegression()

In [17]:
# Fit the classifier on the training features and their loan-denied labels.
lr.fit(X=X_train, y=y_train)

In [18]:
# Predicted labels (True = denied) for the held-out test rows.
y_pred = lr.predict(X=X_test)

In [19]:
# Evaluate the fitted model on the held-out test set.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Our accuracy is {accuracy * 100} %")

Our accuracy is 100.0 %


In [20]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

[[9 0]
 [0 4]]


In [21]:
# Per-class precision, recall and F1 on the test set.
# The report has to compare the true labels with the model's predictions:
# passing y_test for both arguments scores the labels against themselves and
# always reports perfect metrics, whatever the model predicted.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         9
        True       1.00      1.00      1.00         4

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



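It is unusual to achieve 100 percent accuracy. Here it is most likely because our data set is very small (61 rows, only 13 of which are in the test set), so this score should not be taken as a reliable estimate of how the model would perform on new data.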

In [None]:
Simulat

In [None]:
new_df = pd.DataFrame(np.zeros((1, X_train.shape[1])), columns = X_train.columns)

## np.zeros(row, column), then name of column

new_df.at[0, 'age;