### Ali Abdullah Ahmad 
###  CWID-20031246

## **Linear Regression Modification**

In [113]:
import pandas as pd

In [114]:
df = pd.read_csv('.\Realestate.csv')
df.shape
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [115]:
df.isnull().sum()

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

In [116]:
features = df.drop(columns='Y house price of unit area')
target = df['Y house price of unit area']

Standardized_features = (features - features.mean())/features.std()
Standardized_target = (target - target.mean())/target.std()
Standardized_data = pd.concat([Standardized_features,Standardized_target.rename("Standardized Y house price of unit area")],axis=1)

In [117]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

X = Standardized_features.values
y = Standardized_target.values

model = LinearRegression()
model.fit(X,y)

y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
print(f"c : RMSE = {rmse:.4f}")

c : RMSE = 0.6447


In [118]:
class LinearR:
    def __init__(self,learning_rate , iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations
    
    def fit(self,X,y):
        self.m , self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0
        for _ in range(self.iterations):
            self.update_weights(X, y)

    def update_weights(self, X, y):
        y_pred = self.predict(X)
        dW = -(2 / self.m) * np.dot(X.T, (y - y_pred))
        db = -(2 / self.m) * np.sum(y - y_pred)

        self.W -= self.learning_rate * dW
        self.b -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.W) + self.b

best_eta = 0.01
best_iterations = 1000
model_gd = LinearR(learning_rate=best_eta, iterations=best_iterations)
model_gd.fit(X, y)

y_pred_gd = model_gd.predict(X)
rmse_gd = np.sqrt(mean_squared_error(y, y_pred_gd))
print(f"d : RMSE (Gradient Descent): {rmse_gd:.4f}, Learning Rate: {best_eta}, Iterations: {best_iterations}")


d : RMSE (Gradient Descent): 0.6447, Learning Rate: 0.01, Iterations: 1000


In [119]:
import random

class LinearRegressionSGD:
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0

        for _ in range(self.iterations):
            idx = random.randint(0, self.m - 1) 
            X_i = X[idx].reshape(1, -1)  
            y_i = y[idx]  
            self.update_weights(X_i, y_i)

    def update_weights(self, X_i, y_i):
        y_pred = self.predict(X_i)
        dW = -2 * X_i.T.dot(y_i - y_pred)
        db = -2 * (y_i - y_pred)

        self.W -= self.learning_rate * dW.flatten()
        self.b -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.W) + self.b


sgd_eta = best_eta
sgd_iterations = 5000  
model_sgd = LinearRegressionSGD(learning_rate=sgd_eta, iterations=sgd_iterations)
model_sgd.fit(X, y)


y_pred_sgd = model_sgd.predict(X)
rmse_sgd = np.sqrt(mean_squared_error(y, y_pred_sgd))
print(f"e : RMSE (Stochastic Gradient Descent): {rmse_sgd:.4f}")

e : RMSE (Stochastic Gradient Descent): 0.6542


## **Bayesian Theorem**

In [120]:
import pandas as pd
df = pd.read_csv( './data.csv')
print(df.shape)
df.head()

(99, 2)


Unnamed: 0,X,Y
0,1,1
1,0,0
2,0,0
3,1,0
4,1,0


In [121]:
x0 = (df['X'] == 0).mean()
x1 = (df['X'] == 1).mean()
y0 = (df['Y'] == 0).mean()
y1 = (df['Y'] == 1).mean()

print(f"b : Prior Probabilities:\nP(x=0): {x0:.4f}, P(x=1): {x1:.4f}, P(y=0): {y0:.4f}, P(y=1): {y1:.4f}")

b : Prior Probabilities:
P(x=0): 0.5556, P(x=1): 0.4444, P(y=0): 0.5253, P(y=1): 0.4747


In [122]:
x0_given_y0 = ((df['X'] == 0) & (df['Y'] == 0)).sum() / (df['Y'] == 0).sum()
x1_given_y0 = ((df['X'] == 1) & (df['Y'] == 0)).sum() / (df['Y'] == 0).sum()
x0_given_y1 = ((df['X'] == 0) & (df['Y'] == 1)).sum() / (df['Y'] == 1).sum()
x1_given_y1 = ((df['X'] == 1) & (df['Y'] == 1)).sum() / (df['Y'] == 1).sum()

print(f"c : Likelihoods:\nP(x=0|y=0): {x0_given_y0:.4f}, P(x=1|y=0): {x1_given_y0:.4f}, P(x=0|y=1): {x0_given_y1:.4f}, P(x=1|y=1): {x1_given_y1:.4f}")


c : Likelihoods:
P(x=0|y=0): 0.5769, P(x=1|y=0): 0.4231, P(x=0|y=1): 0.5319, P(x=1|y=1): 0.4681


In [123]:
y1_given_x0 = (x0_given_y1 * y1) / x0
y0_given_x0 = (x0_given_y0 * y0) / x0
y1_given_x1 = (x1_given_y1 * y1) / x1
y0_given_x1 = (x1_given_y0 * y0) / x1

print(f"Step d: Posterior Probabilities:\nP(y=1|x=0): {y1_given_x0:.4f}, P(y=0|x=0): {y0_given_x0:.4f}, P(y=1|x=1): {y1_given_x1:.4f}, P(y=0|x=1): {y0_given_x1:.4f}")


Step d: Posterior Probabilities:
P(y=1|x=0): 0.4545, P(y=0|x=0): 0.5455, P(y=1|x=1): 0.5000, P(y=0|x=1): 0.5000


## **LDA**

In [124]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np


X, y = datasets.make_blobs(n_samples=100, n_features=4, centers=2, cluster_std=1.5, random_state=123)
y = np.array([-1 if i == 0 else 1 for i in y])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
def compute_sb(X, y):
    overall_mean = np.mean(X, axis=0)
    classes = np.unique(y)
    SB = np.zeros((X.shape[1], X.shape[1]))

    for cls in classes:
        class_mean = np.mean(X[y == cls], axis=0)
        n_cls = X[y == cls].shape[0]
        mean_diff = (class_mean - overall_mean).reshape(-1, 1)
        SB += n_cls * mean_diff @ mean_diff.T

    return SB

SB = compute_sb(X_train, y_train)
print("a: Between-class scatter matrix SB:\n", SB)

a: Between-class scatter matrix SB:
 [[ 1.42177386e+00 -1.48370842e+01 -7.83733634e+01 -1.38981884e+01]
 [-1.48370842e+01  1.54834094e+02  8.17874222e+02  1.45036139e+02]
 [-7.83733634e+01  8.17874222e+02  4.32022577e+03  7.66118860e+02]
 [-1.38981884e+01  1.45036139e+02  7.66118860e+02  1.35858203e+02]]


In [126]:
def compute_sw(X, y):
    classes = np.unique(y)
    SW = np.zeros((X.shape[1], X.shape[1]))

    for cls in classes:
        class_scatter = np.cov(X[y == cls].T, bias=True) * (X[y == cls].shape[0] - 1)
        SW += class_scatter

    return SW

SW = compute_sw(X_train, y_train)
SW_inv = np.linalg.inv(SW)
print("b: Within-class scatter matrix SW:\n", SW)
print("b: Inverse of SW:\n", SW_inv)

b: Within-class scatter matrix SW:
 [[184.40393024   6.38328991 -16.01885729   8.00343862]
 [  6.38328991 198.95383305  -1.96744413   5.73688237]
 [-16.01885729  -1.96744413 150.76557752   2.51812873]
 [  8.00343862   5.73688237   2.51812873 144.09650929]]
b: Inverse of SW:
 [[ 5.49283896e-03 -1.61524880e-04  5.86666256e-04 -3.08905827e-04]
 [-1.61524880e-04  5.03753682e-03  5.17913168e-05 -1.92491956e-04]
 [ 5.86666256e-04  5.17913168e-05  6.69835676e-03 -1.51702444e-04]
 [-3.08905827e-04 -1.92491956e-04 -1.51702444e-04  6.96726537e-03]]


In [127]:
eigenvalues, eigenvectors = np.linalg.eig(np.linalg.inv(SW) @ SB)

sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

projections = [X_train @ eigenvectors[:, i] for i in range(eigenvectors.shape[1])]

print("3c: Eigenvalues:\n", eigenvalues)
print("3c: Eigenvectors:\n", eigenvectors)

best_eigenvector = eigenvectors[:, 0]
best_projection = projections[0]


3c: Eigenvalues:
 [ 3.03906240e+01  0.00000000e+00 -6.76572168e-17 -1.07032138e-15]
3c: Eigenvectors:
 [[ 0.05876605 -0.99984555 -0.45527109  0.02315428]
 [ 0.14254062 -0.00250949 -0.09386513  0.69698811]
 [ 0.97595821 -0.01718218 -0.14536767 -0.25060232]
 [ 0.15405937 -0.00271228  0.8733761   0.67146848]]


3c: The eigenvector corresponding to the largest eigenvalue captures the most variance between classes, making it the most ideal for binary classification.

In [128]:
from sklearn.metrics import accuracy_score

train_projection = X_train @ best_eigenvector

train_predictions = np.sign(train_projection)
train_accuracy = accuracy_score(y_train, train_predictions)
print("3d: Train accuracy using best eigenvector:", train_accuracy)

3d: Train accuracy using best eigenvector: 1.0


In [129]:
other_accuracies = []
for i in range(1, eigenvectors.shape[1]):
    projection = X_train @ eigenvectors[:, i]
    predictions = np.sign(projection)
    accuracy = accuracy_score(y_train, predictions)
    other_accuracies.append(accuracy)

print("3dII: Train accuracies using other eigenvectors:", other_accuracies)


3dII: Train accuracies using other eigenvectors: [0.525, 0.4375, 0.4875]


### 3d: The best eigenvector should yield the highest accuracy as it captures the most discriminative information for binary classification.

In [130]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Initialize LDA with 1 component
lda = LinearDiscriminantAnalysis(n_components=1)

# Fit the LDA model on the training data
lda.fit(X_train, y_train)

# Predict the test data class
y_test_pred = lda.predict(X_test)

# Report the accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test accuracy using LDA: {test_accuracy:.4f}")

Test accuracy using LDA: 1.0000
