In [7]:
import numpy as np
import pandas as pd

# Numerically stable logistic sigmoid
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    The direct formula evaluates exp(-z), which overflows to inf (and emits a
    RuntimeWarning) for large negative z. Here only exp(-|z|) is evaluated;
    that value never exceeds 1, so it cannot overflow.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of z, same shape as z (a scalar for scalar input).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # may underflow to 0 for huge |z|, never overflows
    # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)  (algebraically the same)
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return out[()]  # unwrap a 0-d result so scalar input still yields a scalar

# Feature mapping: each row (x1, x2) becomes [1, x1, x2, x1*x2]
def feature_map(X):
    """Build the design matrix with columns [1, x1, x2, x1*x2].

    X is indexed as a 2-D array whose column 0 is x1 and column 1 is x2.
    The result has one row per row of X and four columns.
    """
    bias = np.ones((X.shape[0], 1))        # constant column for the bias weight
    pair = X[:, [0, 1]]                    # the x1 and x2 columns, side by side
    cross = pair[:, :1] * pair[:, 1:]      # interaction term x1*x2, kept 2-D
    return np.concatenate((bias, pair, cross), axis=1)


In [8]:
# XOR truth table: the four (x1, x2) input pairs, one per row
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# XOR targets aligned with the rows of X: 1 when exactly one input is 1
T = X[:, 0] ^ X[:, 1]

# Expand every row (x1, x2) of X into [1, x1, x2, x1*x2] with feature_map
Phi = feature_map(X)  # Phi has shape (4, 4): 4 samples x 4 mapped features


In [9]:
# Optimiser settings
eta = 0.1          # learning rate (gradient-descent step size)
num_epochs = 100   # total number of epochs (iterations)

# Starting weights given in the assignment, one per column of Phi:
# [bias, x1, x2, x1*x2]
w = np.array([-0.6, 2.0, 1.2, -2.8])

# Filled by the training loop with one record per epoch for the last 10 epochs
history = list()


In [10]:
# Training loop for Logistic Regression (batch gradient descent).
# Reads Phi, T, w, eta, num_epochs and sigmoid from earlier cells and rebinds w.
#
# The log is rebuilt from scratch on every execution so that re-running this
# cell never leaves duplicate epoch rows in `history`.
# NOTE: w is still read from, and written back to, the notebook namespace, so
# re-run the weight-initialisation cell first to restart from the initial weights.
history = []
for epoch in range(num_epochs):
    # Forward pass: compute z = Phi dot w for all samples
    z = Phi.dot(w)  # shape (4,)
    # Compute predictions using the sigmoid function
    y = sigmoid(z)

    # Cross-entropy loss averaged over samples; the 1e-10 offset avoids log(0)
    loss = -np.mean(T * np.log(y + 1e-10) + (1 - T) * np.log(1 - y + 1e-10))

    # Gradient of the averaged loss for each parameter w_j:
    # grad_j = (1/N) * sum_i (y_i - t_i) * phi_j(x^(i))
    grad = np.mean((y - T)[:, np.newaxis] * Phi, axis=0)

    # Gradient-descent step; builds a new array rather than modifying w in place
    w = w - eta * grad

    # Record detailed information for the last 10 epochs.
    # z, y and loss were computed with the weights from before this epoch's
    # update, while "w" stores the weights after it.
    if epoch >= num_epochs - 10:
        history.append({
            "Epoch": epoch + 1,
            "z_1": z[0],
            "z_2": z[1],
            "z_3": z[2],
            "z_4": z[3],
            "y_1": y[0],
            "y_2": y[1],
            "y_3": y[2],
            "y_4": y[3],
            "Loss": loss,
            "w": w.copy()
        })


In [15]:
# Convert the recorded history into a DataFrame: one row per logged epoch,
# with the weight vector stored as an array in the "w" column
df = pd.DataFrame(history)

# Print the table once. to_string() renders every row and column without
# truncation, so the full "w" vectors stay visible; the extra print(df) that
# followed only repeated the same table in abbreviated form.
print(df.to_string())


   Epoch       z_1       z_2       z_3       z_4       y_1       y_2       y_3       y_4      Loss                                                                                   w
0     91 -0.702592  0.615201  1.115355 -1.079337  0.331238  0.649126  0.753126  0.253631  0.352628  [-0.7022696198398162, 1.8177775113526038, 1.3202238484076372, -3.5188254894718054]
1     92 -0.702270  0.617954  1.115508 -1.083094  0.331309  0.649753  0.753155  0.252921  0.352166    [-0.7019480665653083, 1.8176256232035046, 1.322656995237396, -3.525148514468718]
2     93 -0.701948  0.620709  1.115678 -1.086814  0.331380  0.650380  0.753186  0.252219  0.351707   [-0.7016271914767117, 1.817490503895978, 1.3250920334671696, -3.5314539820850253]
3     94 -0.701627  0.623465  1.115863 -1.090499  0.331452  0.651006  0.753221  0.251524  0.351249    [-0.7013072592364682, 1.8173718791207774, 1.327528769576227, -3.537742091924406]
4     95 -0.701307  0.626222  1.116065 -1.094149  0.331522  0.651632  0.753258  0.250

In [13]:
## Conclusion

# The table above shows the details of the last 10 iterations (epochs), including:
# - **z_1, z_2, z_3, z_4:** The linear combination $z = \Phi w$ for each of the 4 samples, computed with the weights from before that epoch's update.
# - **y_1, y_2, y_3, y_4:** The corresponding predicted probabilities $y = \sigma(z)$ after applying the sigmoid function.
# - **Loss:** The average cross-entropy loss for that epoch, evaluated on those predictions.
# - **w:** The weight vector after that epoch's update.

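# If the model converges correctly, the predictions $y$ for samples with label 1 should approach 1, and those for samples with label 0 should approach 0. In the rows shown above (epochs 91 onward) the predictions are roughly 0.33, 0.65, 0.75 and 0.25 for labels 0, 1, 1, 0: each is on the correct side of 0.5, but still far from 0 or 1 after 100 epochs with a learning rate of 0.1.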
