## Optimizing Machine Learning Models

In [46]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector



In [47]:
#read the data into a pandas dataframe.
data = pd.read_csv('loan_data - loan_data.csv')

In [48]:
#Display the first five rows of the dataframe.
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [49]:
# Print data types and null counts for each column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [50]:
data['purpose'].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [51]:
# Convert the purpose column into dummy variables.
data2 = data
data2 = pd.get_dummies(data2, columns=['purpose'], drop_first=False)

In [52]:
data2['not.fully.paid'].value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [53]:
# Working with imbalanced data.
data0 = data2[data2['not.fully.paid']==0].head(2000)
data1 = data2[data2['not.fully.paid']==1]
data2 = pd.concat([data0, data1])

In [54]:
# Split the data into features and labels.
X = data2.drop('not.fully.paid', axis = 1)
y = data2['not.fully.paid']

In [55]:
# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=417)

In [56]:
#Normalize the data.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
# train_df = pd.DataFrame(X_train_scaled, columns = X.columns)
X_test_scaled = mms.fit_transform(X_test)
# test_df = pd.DataFrame(X_test_scaled, columns = X.columns)

In [57]:
# Instantiate a model, fit and predict. print the accuracy, the precision, and the recall.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:",precision,"Recall:", recall)

Accuracy: 0.7169811320754716
Precision: 0.8818897637795275 Recall: 0.4534412955465587


In [58]:
# Instantiate a model with class_weight parameter = 'balanced', fit and predict. print the accuracy, the precision, and the recall.
balanced_model = LogisticRegression(class_weight='balanced')
balanced_model.fit(X_train_scaled, y_train)

y_pred_b = balanced_model.predict(X_test_scaled)

accuracy_b = accuracy_score(y_test, y_pred_b)
precision_b = precision_score(y_test, y_pred_b)
recall_b = recall_score(y_test, y_pred_b)

print("Accuracy:", accuracy_b)
print("Precision:",precision_b,"Recall:", recall_b)



Accuracy: 0.7094339622641509
Precision: 0.7961783439490446 Recall: 0.5060728744939271


Balancing the data decreased the accuracy, precision and recall.

In [59]:
# Perform PCA. Instantiate a model, fit and predict. print the accuracy, the precision, and the recall.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_train= pca.fit_transform(X_train_scaled)
pca_test = pca.transform(X_test_scaled)

model_pca = LogisticRegression()

model_pca.fit(pca_train, y_train)

y_pred_pca = model_pca.predict(pca_test)

accuracy_pca = accuracy_score(y_test, y_pred_pca)
precision_pca = precision_score(y_test, y_pred_pca)
recall_pca = recall_score(y_test, y_pred_pca)
print("Accuracy:", accuracy_pca)
print(precision_pca, recall_pca)

Accuracy: 0.5339622641509434
0.0 0.0


  _warn_prf(average, modifier, msg_start, len(result))


The pca dencreased the accuracy, and the precision and recall are incorrect.

In [60]:
# Perform sequential feature selection. Instantiate a model, fit and predict. Print the accuracy, the precision, and the recall.
sfs = SequentialFeatureSelector(model, n_features_to_select=5)
X_selected = sfs.fit_transform(X, y)
X_train_sfs, X_test_sfs, y_train_sfs, y_test_sfs = train_test_split(X_selected, y, test_size=.15, random_state=417)
X_train_scaled_sfs = mms.fit_transform(X_train_sfs)
X_test_scaled_sfs = mms.fit_transform(X_test_sfs)

model_sfs = LogisticRegression(max_iter=1000)
model_sfs.fit(X_train_scaled_sfs, y_train_sfs)

y_pred_sfs = model_sfs.predict(X_test_scaled_sfs)

accuracy_sfs = accuracy_score(y_test_sfs, y_pred_sfs)
precision_sfs = precision_score(y_test_sfs, y_pred_sfs)
recall_sfs = recall_score(y_test_sfs, y_pred_sfs)
print("Accuracy:", accuracy_sfs)
print(precision_sfs, recall_sfs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.6867924528301886
1.0 0.32793522267206476


The sequential feature selection decreased the accuracy, and the precision and recall are incorrect.