In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2

In [3]:
df = pd.read_excel("Loan_Dataset-2.xlsx")

In [4]:
df

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,0,Graduate,No,5849,,1.0,Urban,Y
1,Male,1,Graduate,No,4583,128.0,1.0,Rural,N
2,Male,0,Graduate,Yes,3000,66.0,1.0,Urban,Y
3,Male,0,Not Graduate,No,2583,120.0,1.0,Urban,Y
4,Male,0,Graduate,No,6000,141.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...
609,Female,0,Graduate,No,2900,71.0,1.0,Rural,Y
610,Male,3+,Graduate,No,4106,40.0,1.0,Rural,Y
611,Male,1,Graduate,No,8072,253.0,1.0,Urban,Y
612,Male,2,Graduate,No,7583,187.0,1.0,Urban,Y


In [5]:
null_values = df.isnull().sum()

In [6]:
null_values

Gender             13
Dependents         15
Education           0
Self_Employed      32
ApplicantIncome     0
LoanAmount         22
Credit_History     50
Property_Area       0
Loan_Status         0
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer

imp1=SimpleImputer(strategy="median")
imp2=SimpleImputer(strategy="most_frequent")

In [8]:
from sklearn.compose import ColumnTransformer


tr1 = ColumnTransformer([
                    ('Mode1',imp2,['Gender']),
                    ('Mode2',imp2,['Dependents']),
                    ('Mode3',imp2,['Self_Employed']),
                    ('Median',imp1,['LoanAmount']),
                    ('Mode4',imp2,['Credit_History'])],
                     remainder='passthrough')

In [9]:
tr1

In [10]:
tr1.fit(df)

In [11]:
df.head()

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,0,Graduate,No,5849,,1.0,Urban,Y
1,Male,1,Graduate,No,4583,128.0,1.0,Rural,N
2,Male,0,Graduate,Yes,3000,66.0,1.0,Urban,Y
3,Male,0,Not Graduate,No,2583,120.0,1.0,Urban,Y
4,Male,0,Graduate,No,6000,141.0,1.0,Urban,Y


In [12]:
df = tr1.fit_transform(df)

In [13]:
df

array([['Male', 0, 'No', ..., 5849, 'Urban', 'Y'],
       ['Male', 1, 'No', ..., 4583, 'Rural', 'N'],
       ['Male', 0, 'Yes', ..., 3000, 'Urban', 'Y'],
       ...,
       ['Male', 1, 'No', ..., 8072, 'Urban', 'Y'],
       ['Male', 2, 'No', ..., 7583, 'Urban', 'Y'],
       ['Female', 0, 'Yes', ..., 4583, 'Semiurban', 'N']], dtype=object)

In [14]:
df = pd.DataFrame(df)

In [15]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Male,0,No,128.0,1.0,Graduate,5849,Urban,Y
1,Male,1,No,128.0,1.0,Graduate,4583,Rural,N
2,Male,0,Yes,66.0,1.0,Graduate,3000,Urban,Y
3,Male,0,No,120.0,1.0,Not Graduate,2583,Urban,Y
4,Male,0,No,141.0,1.0,Graduate,6000,Urban,Y
...,...,...,...,...,...,...,...,...,...
609,Female,0,No,71.0,1.0,Graduate,2900,Rural,Y
610,Male,3+,No,40.0,1.0,Graduate,4106,Rural,Y
611,Male,1,No,253.0,1.0,Graduate,8072,Urban,Y
612,Male,2,No,187.0,1.0,Graduate,7583,Urban,Y


In [16]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

tr2 = ColumnTransformer([
        ('onehot1', OneHotEncoder(), [0]),
        ('onehot2', OneHotEncoder(), [2]),
        ('onehot3', OneHotEncoder(), [5]),
        ('ordinal', OrdinalEncoder(categories=[['Urban', 'Semiurban', 'Rural']]), [7])],
                     remainder='passthrough')

In [17]:
tr2

In [18]:
df = tr2.fit_transform(df)

In [19]:
df = pd.DataFrame(df)

In [20]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,128.0,1.0,5849,Y
1,0.0,1.0,1.0,0.0,1.0,0.0,2.0,1,128.0,1.0,4583,N
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0,66.0,1.0,3000,Y
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,120.0,1.0,2583,Y
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,141.0,1.0,6000,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0,71.0,1.0,2900,Y
610,0.0,1.0,1.0,0.0,1.0,0.0,2.0,3+,40.0,1.0,4106,Y
611,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1,253.0,1.0,8072,Y
612,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2,187.0,1.0,7583,Y


In [21]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df[11] = label_encoder.fit_transform(df[11])

In [22]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,128.0,1.0,5849,1
1,0.0,1.0,1.0,0.0,1.0,0.0,2.0,1,128.0,1.0,4583,0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0,66.0,1.0,3000,1
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,120.0,1.0,2583,1
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,141.0,1.0,6000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0,71.0,1.0,2900,1
610,0.0,1.0,1.0,0.0,1.0,0.0,2.0,3+,40.0,1.0,4106,1
611,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1,253.0,1.0,8072,1
612,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2,187.0,1.0,7583,1


# scaling 

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [24]:
scaler_ss = StandardScaler()

scaler_mms = MinMaxScaler()

In [25]:
tr3 = ColumnTransformer([
    ('SS1', scaler_ss, [0]),  
    ('SS2', scaler_ss, [1]),  
    ('MMS1', scaler_mms, [2]),
    ('SS3', scaler_ss, [3]),  
    ('MMS2', scaler_mms, [4]),
    ('SS4', scaler_ss, [5]),  
    ('SS5', scaler_ss, [6]),  
    ('SS6', scaler_ss, [7]),  
    ('MMS3', scaler_mms, [8]),
    ('SS7', scaler_ss, [9]),
    ('SS8', scaler_ss,[10]),
    ('MMS4', scaler_mms, [11])
], remainder='passthrough')

In [26]:
tr3

In [27]:
for col in df:
    df[col] = df[col].astype(str).str.replace('+', '').astype(float)

In [28]:
df = tr3.fit_transform(df)

In [29]:
df = pd.DataFrame(df)

In [30]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,-1.223298,-0.737806,0.172214,0.411733,0.072991,1.0
1,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,1.318513,0.253470,0.172214,0.411733,-0.134412,0.0
2,-0.472343,0.472343,0.0,2.547117,1.0,-0.528362,-1.223298,-0.737806,0.082489,0.411733,-0.393747,1.0
3,-0.472343,0.472343,1.0,-0.392601,0.0,1.892641,-1.223298,-0.737806,0.160637,0.411733,-0.462062,1.0
4,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,-1.223298,-0.737806,0.191027,0.411733,0.097728,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
609,2.117107,-2.117107,1.0,-0.392601,1.0,-0.528362,1.318513,-0.737806,0.089725,0.411733,-0.410130,1.0
610,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,1.318513,2.236021,0.044863,0.411733,-0.212557,1.0
611,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,-1.223298,0.253470,0.353111,0.411733,0.437174,1.0
612,-0.472343,0.472343,1.0,-0.392601,1.0,-0.528362,-1.223298,1.244745,0.257598,0.411733,0.357064,1.0


In [31]:
X = df.iloc[:,0:11]
y = df.iloc[:,-1]

In [32]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.11,random_state=1)

In [33]:
# from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)



In [34]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7794117647058824

# Cross Validation

In [35]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, X_train, y_train, cv=11, scoring='accuracy').mean()

0.813209647495362

# GridSearch 

In [40]:
param_grid = {
    "penalty": ['l1', 'l2'],
    "C": [0.001, 0.01, 0.1, 1, 10, 100]
}

In [41]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid, cv=11, scoring='accuracy')
grid.fit(X_train, y_train)

66 fits failed out of a total of 132.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jasonseraphim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jasonseraphim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jasonseraphim/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
   

In [42]:
grid.best_score_

0.813209647495362

In [43]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2'}

# pickling

In [45]:
# export 
import pickle
pickle.dump(model,open('lr_pile.pkl','wb'))