In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

y_binary = (y == 0).astype(int)

# Split the data into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Initialize an array to store cost of training and testing data sets for each c parameter
cost_train_list = []
cost_test_list = []

# Define a range of inverse regularization strengths (C values)
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for C_val in C_values:
    
    model = LogisticRegression(penalty='l1', solver='liblinear', C=C_val)
    model.fit(X_train, y_train)
    
    # Predict probabilities for the training and testing sets
    y_train_prob = model.predict_proba(X_train)
    y_test_prob = model.predict_proba(X_test)
    
    # Calculate cost for training and testing sets
    cost_train = log_loss(y_train, y_train_prob)
    cost_test = log_loss(y_test, y_test_prob)
    
    cost_train_list.append(cost_train)
    cost_test_list.append(cost_test)

for i, C in enumerate(C_values):
    print(f"C = {C};")
    print(f"  Training cost : {cost_train_list[i]:.4f}")
    print(f"  Testing cost  : {cost_test_list[i]:.4f}")
    if cost_train_list[i] > cost_test_list[i]:
        print("The model is underfitting the data")
    elif cost_test_list[i] > cost_train_list[i] and (cost_test_list[i] - cost_train_list[i]) / cost_train_list[i] > 0.1:
        print("The model is overfitting the data")
    else:
        print("The model has a good balance between training and testing performance")
    print()


C = 0.001;
  Training cost : 0.6931
  Testing cost  : 0.6931
The model is underfitting the data

C = 0.01;
  Training cost : 0.5454
  Testing cost  : 0.5407
The model is underfitting the data

C = 0.1;
  Training cost : 0.1037
  Testing cost  : 0.1078
The model has a good balance between training and testing performance

C = 1;
  Training cost : 0.0117
  Testing cost  : 0.0126
The model has a good balance between training and testing performance

C = 10;
  Training cost : 0.0014
  Testing cost  : 0.0014
The model is underfitting the data

C = 100;
  Training cost : 0.0003
  Testing cost  : 0.0002
The model is underfitting the data

C = 1000;
  Training cost : 0.0001
  Testing cost  : 0.0000
The model is underfitting the data



In [13]:
from sklearn.metrics import confusion_matrix, classification_report

best_C = 1
best_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_C)
best_model.fit(X_train, y_train)

# Confusion Matrix
y_test_pred = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_test_pred)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[20  0]
 [ 0 10]]


In [14]:
# Classification Report
target_names = ['Negative','Positive']
class_report = classification_report(y_test, y_test_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        20
    Positive       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

