In [29]:
import pandas as pd
inputfile="Realestate.csv"
df=pd.read_csv(inputfile)
df.shape, df.head()

((414, 8),
    No  X1 transaction date  X2 house age  \
 0   1             2012.917          32.0   
 1   2             2012.917          19.5   
 2   3             2013.583          13.3   
 3   4             2013.500          13.3   
 4   5             2012.833           5.0   
 
    X3 distance to the nearest MRT station  X4 number of convenience stores  \
 0                                84.87882                               10   
 1                               306.59470                                9   
 2                               561.98450                                5   
 3                               561.98450                                5   
 4                               390.56840                                5   
 
    X5 latitude  X6 longitude  Y house price of unit area  
 0     24.98298     121.54024                        37.9  
 1     24.98034     121.53951                        42.2  
 2     24.98746     121.54391                        47.3  
 

In [30]:
# Step b: Standardize the data using mean and standard deviation
features = df.drop(columns=["No", "Y house price of unit area"])  # Exclude 'No' and target variable
target = df["Y house price of unit area"]

standardized_features = (features - features.mean()) / features.std()
standardized_target = (target - target.mean()) / target.std()

standardized_data = pd.concat([standardized_features, standardized_target.rename("Y_std")], axis=1)

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Separate features and target
X = standardized_features.values
y = standardized_target.values

# Initialize and fit the Scikit-learn Linear Regression model
model = LinearRegression()
model.fit(X, y)

# Predict and calculate RMSE
y_pred = model.predict(X)
rmse_sklearn = np.sqrt(mean_squared_error(y, y_pred))
print(f"Step c: RMSE (Scikit-learn): {rmse_sklearn:.4f}")

Step c: RMSE (Scikit-learn): 0.6455


In [32]:
class LinearRegressionGD:
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0

        for _ in range(self.iterations):
            self.update_weights(X, y)

    def update_weights(self, X, y):
        y_pred = self.predict(X)
        dW = -(2 / self.m) * np.dot(X.T, (y - y_pred))
        db = -(2 / self.m) * np.sum(y - y_pred)

        self.W -= self.learning_rate * dW
        self.b -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.W) + self.b

# Tune learning rate and iteration count
best_eta = 0.01
best_iterations = 1000
model_gd = LinearRegressionGD(learning_rate=best_eta, iterations=best_iterations)
model_gd.fit(X, y)

# Predict and calculate RMSE
y_pred_gd = model_gd.predict(X)
rmse_gd = np.sqrt(mean_squared_error(y, y_pred_gd))
print(f"Step d: RMSE (Gradient Descent): {rmse_gd:.4f}, Learning Rate: {best_eta}, Iterations: {best_iterations}")


Step d: RMSE (Gradient Descent): 0.6455, Learning Rate: 0.01, Iterations: 1000


In [33]:
import random

class LinearRegressionSGD:
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0

        for _ in range(self.iterations):
            # Perform stochastic gradient descent (one random sample per update)
            idx = random.randint(0, self.m - 1)  # Select a random index
            X_i = X[idx].reshape(1, -1)  # Single sample
            y_i = y[idx]  # Corresponding target
            self.update_weights(X_i, y_i)

    def update_weights(self, X_i, y_i):
        y_pred = self.predict(X_i)
        dW = -2 * X_i.T.dot(y_i - y_pred)
        db = -2 * (y_i - y_pred)

        self.W -= self.learning_rate * dW.flatten()
        self.b -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.W) + self.b

# Start with the same eta and tune parameters for SGD
sgd_eta = best_eta
sgd_iterations = 5000  # Start with more iterations for stochastic updates
model_sgd = LinearRegressionSGD(learning_rate=sgd_eta, iterations=sgd_iterations)
model_sgd.fit(X, y)

# Predict and calculate RMSE
y_pred_sgd = model_sgd.predict(X)
rmse_sgd = np.sqrt(mean_squared_error(y, y_pred_sgd))
print(f"Step e: RMSE (Stochastic Gradient Descent): {rmse_sgd:.4f}")

Step e: RMSE (Stochastic Gradient Descent): 0.6613


In [34]:
import pandas as pd

data_path = "data.csv"  # Update the path to the uploaded data
df = pd.read_csv(data_path)
df.shape, df.head()

((99, 2),
    X  Y
 0  1  1
 1  0  0
 2  0  0
 3  1  0
 4  1  0)

In [35]:
p_x0 = (df['X'] == 0).mean()
p_x1 = (df['X'] == 1).mean()
p_y0 = (df['Y'] == 0).mean()
p_y1 = (df['Y'] == 1).mean()

print(f"Step b: Prior Probabilities:\nP(x=0): {p_x0:.4f}, P(x=1): {p_x1:.4f}, P(y=0): {p_y0:.4f}, P(y=1): {p_y1:.4f}")

Step b: Prior Probabilities:
P(x=0): 0.5556, P(x=1): 0.4444, P(y=0): 0.5253, P(y=1): 0.4747


In [36]:
# Step c: Calculate likelihoods
p_x0_given_y0 = ((df['X'] == 0) & (df['Y'] == 0)).sum() / (df['Y'] == 0).sum()
p_x1_given_y0 = ((df['X'] == 1) & (df['Y'] == 0)).sum() / (df['Y'] == 0).sum()
p_x0_given_y1 = ((df['X'] == 0) & (df['Y'] == 1)).sum() / (df['Y'] == 1).sum()
p_x1_given_y1 = ((df['X'] == 1) & (df['Y'] == 1)).sum() / (df['Y'] == 1).sum()

print(f"Step c: Likelihoods:\nP(x=0|y=0): {p_x0_given_y0:.4f}, P(x=1|y=0): {p_x1_given_y0:.4f}, P(x=0|y=1): {p_x0_given_y1:.4f}, P(x=1|y=1): {p_x1_given_y1:.4f}")


Step c: Likelihoods:
P(x=0|y=0): 0.5769, P(x=1|y=0): 0.4231, P(x=0|y=1): 0.5319, P(x=1|y=1): 0.4681


In [37]:
# Step d: Calculate posterior probabilities
p_y1_given_x0 = (p_x0_given_y1 * p_y1) / p_x0
p_y0_given_x0 = (p_x0_given_y0 * p_y0) / p_x0
p_y1_given_x1 = (p_x1_given_y1 * p_y1) / p_x1
p_y0_given_x1 = (p_x1_given_y0 * p_y0) / p_x1

print(f"Step d: Posterior Probabilities:\nP(y=1|x=0): {p_y1_given_x0:.4f}, P(y=0|x=0): {p_y0_given_x0:.4f}, P(y=1|x=1): {p_y1_given_x1:.4f}, P(y=0|x=1): {p_y0_given_x1:.4f}")


Step d: Posterior Probabilities:
P(y=1|x=0): 0.4545, P(y=0|x=0): 0.5455, P(y=1|x=1): 0.5000, P(y=0|x=1): 0.5000


In [38]:
# Step a: Generate the dataset and split into train and test sets
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

# Generate dataset
X, y = datasets.make_blobs(n_samples=100, n_features=4, centers=2, cluster_std=1.5, random_state=123)
y = np.array([-1 if i == 0 else 1 for i in y])

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
def compute_sb(X, y):
    overall_mean = np.mean(X, axis=0)
    classes = np.unique(y)
    SB = np.zeros((X.shape[1], X.shape[1]))

    for cls in classes:
        class_mean = np.mean(X[y == cls], axis=0)
        n_cls = X[y == cls].shape[0]
        mean_diff = (class_mean - overall_mean).reshape(-1, 1)
        SB += n_cls * mean_diff @ mean_diff.T

    return SB

SB = compute_sb(X_train, y_train)
print("Step b: Between-class scatter matrix SB:\n", SB)

Step b: Between-class scatter matrix SB:
 [[ 1.42177386e+00 -1.48370842e+01 -7.83733634e+01 -1.38981884e+01]
 [-1.48370842e+01  1.54834094e+02  8.17874222e+02  1.45036139e+02]
 [-7.83733634e+01  8.17874222e+02  4.32022577e+03  7.66118860e+02]
 [-1.38981884e+01  1.45036139e+02  7.66118860e+02  1.35858203e+02]]


In [40]:
# Step c: Compute within-class scatter Sw and its inverse
def compute_sw(X, y):
    classes = np.unique(y)
    SW = np.zeros((X.shape[1], X.shape[1]))

    for cls in classes:
        class_scatter = np.cov(X[y == cls].T, bias=True) * (X[y == cls].shape[0] - 1)
        SW += class_scatter

    return SW

SW = compute_sw(X_train, y_train)
SW_inv = np.linalg.inv(SW)
print("Step c: Within-class scatter matrix SW:\n", SW)
print("Step c: Inverse of SW:\n", SW_inv)


Step c: Within-class scatter matrix SW:
 [[184.40393024   6.38328991 -16.01885729   8.00343862]
 [  6.38328991 198.95383305  -1.96744413   5.73688237]
 [-16.01885729  -1.96744413 150.76557752   2.51812873]
 [  8.00343862   5.73688237   2.51812873 144.09650929]]
Step c: Inverse of SW:
 [[ 5.49283896e-03 -1.61524880e-04  5.86666256e-04 -3.08905827e-04]
 [-1.61524880e-04  5.03753682e-03  5.17913168e-05 -1.92491956e-04]
 [ 5.86666256e-04  5.17913168e-05  6.69835676e-03 -1.51702444e-04]
 [-3.08905827e-04 -1.92491956e-04 -1.51702444e-04  6.96726537e-03]]


In [46]:
# Step d: Find eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(np.linalg.inv(SW) @ SB)

# Sort eigenvalues and eigenvectors by descending order of eigenvalues
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Transform the train data to each projection plane
projections = [X_train @ eigenvectors[:, i] for i in range(eigenvectors.shape[1])]

print("Step d: Eigenvalues:\n", eigenvalues)
print("Step d: Eigenvectors:\n", eigenvectors)

# Choose the most ideal eigenvector (corresponding to the largest eigenvalue)
best_eigenvector = eigenvectors[:, 0]
best_projection = projections[0]

# Explanation: The eigenvector corresponding to the largest eigenvalue captures the most variance
# between classes, making it the most ideal for binary classification.


Step d: Eigenvalues:
 [3.03906240e+01 5.61892349e-16 5.03934630e-17 0.00000000e+00]
Step d: Eigenvectors:
 [[-0.05876605 -0.16956648  0.3192606  -0.99984555]
 [-0.14254062 -0.96718363  0.29155143 -0.00250949]
 [-0.97595821  0.16297647 -0.20511277 -0.01718218]
 [-0.15405937  0.09613379  0.87806559 -0.00271228]]


In [48]:
# Step e: Predict the class of train data using the best eigenvector
from sklearn.metrics import accuracy_score

# Project train data onto the best eigenvector
train_projection = X_train @ best_eigenvector

# Predict classes based on the sign of the projection
train_predictions = np.sign(train_projection)
train_accuracy = accuracy_score(y_train, train_predictions)
print("Step e: Train accuracy using best eigenvector:", train_accuracy)


Step e: Train accuracy using best eigenvector: 0.0


In [50]:
# Step f: Predict train data classes using other eigenvectors and calculate accuracy
other_accuracies = []
for i in range(1, eigenvectors.shape[1]):
    projection = X_train @ eigenvectors[:, i]
    predictions = np.sign(projection)
    accuracy = accuracy_score(y_train, predictions)
    other_accuracies.append(accuracy)

print("Step f: Train accuracies using other eigenvectors:", other_accuracies)

# Explanation: The best eigenvector should yield the highest accuracy as it captures the most
# discriminative information for binary classification.

Step f: Train accuracies using other eigenvectors: [0.4625, 0.525, 0.525]
