In [None]:
import numpy as np

# ============================================================================
# Question 8: Feature Engineering and Dimensionality Reduction
# ============================================================================

# Raw observations: each array holds one value per house (five houses).
s = np.array([1000, 1500, 1200, 2000, 1800])  # living area, sq. ft.
b = np.array([2, 3, 2, 4, 3])  # bedroom count
a = np.array([5, 10, 2, 20, 15])  # age, years
y = np.array([200, 300, 240, 400, 350])  # price, thousands

# Echo the inputs under a banner so the later sections can be checked by eye.
print("=" * 70, "GIVEN DATA", "=" * 70, sep="\n")
print(
    "\n".join(
        f"{label}: {values}"
        for label, values in (
            ("Size (s)", s),
            ("Bedrooms (b)", b),
            ("Age (a)", a),
            ("Prices (y)", y),
        )
    )
)
print()

# ============================================================================
# Part (a): Design Matrix and Linear Dependence
# ============================================================================
print("=" * 70)
print("PART (a): Design Matrix and Linear Dependence")
print("=" * 70)

# Print arrays as plain decimals (no scientific notation) with 2 digits.
# This is a global NumPy setting, so it also affects every later print.
np.set_printoptions(suppress=True, precision=2)

# Design matrix X = [1 | s | b | a]: a bias column of ones followed by one
# column per feature. The bias column is sized from the data rather than a
# literal 5, so X still builds if observations are added or removed.
X = np.column_stack([np.ones(len(s)), s, b, a])

print("Design Matrix X:")
print(X.astype(int))  # integer view for display only; X itself stays float
print(f"Shape of X: {X.shape}")
print()

# The columns are linearly independent exactly when rank(X) equals the
# number of columns.
rank_X = np.linalg.matrix_rank(X)
num_cols = X.shape[1]

print(f"Rank of X: {rank_X}")
print(f"Number of columns: {num_cols}")
print(f"Are columns linearly independent? {rank_X == num_cols}")
print()

# Determinant of the Gram matrix X^T X. Mathematically it is zero exactly
# when the columns of X are dependent; in floating point, treat it as a
# secondary check next to the rank above.
XTX = X.T @ X
det_XTX = np.linalg.det(XTX)

print(f"Determinant of X^T X: {det_XTX}")
print()

# If X is rank deficient, exhibit coefficients c != 0 with X @ c ~ 0.
print("Finding linear dependence relationship...")
if rank_X < num_cols:
    # Singular values come back in descending order and Vt is square here
    # (full_matrices defaults to True), so its last row is a right singular
    # vector lying in the null space of X.
    U, S, Vt = np.linalg.svd(X)
    null_space_vector = Vt[-1, :]
    print(f"Non-trivial linear combination coefficients: {null_space_vector}")
else:
    print("Columns are linearly independent - no non-trivial linear combination exists")

print()
print("TODO: Write your interpretation of what linear dependence means for ML models")
print("Practical Implication:")
# NOTE(review): the text below says the *coefficient* of one feature is a
# combination of the others; under linear dependence it is a feature *column*
# that is a combination of the other columns. Wording left as written for the
# author to confirm.
print("""
Linear dependence means that the coefficient of one feature can be expressed as 
a combination of others, preventing unique solutions to a linear regression 
which then leads to infinitely many solutions. Also, models may become unstable 
and sensitive to small changes in data which can lead to overfitting. This is 
known as multicollinearity in regression analysis and this can be remedied by 
removing redundant features or using regularization techniques, or by applying 
PCA to reduce dimensionality.
""")
print()

# ============================================================================
# Part (b): Gram-Schmidt Orthogonalization
# ============================================================================
print("=" * 70, "PART (b): Gram-Schmidt Orthogonalization", "=" * 70, sep="\n")

# Feature vectors to orthogonalize, in order: size, bedrooms, age.
# The bias column is deliberately left out.
v1, v2, v3 = s, b, a

print("Original feature vectors:")
print(f"v1 (size): {v1}", f"v2 (bedrooms): {v2}", f"v3 (age): {v3}", sep="\n")
print()

# Classical Gram-Schmidt: keep the first vector, then strip from each later
# vector its components along the directions already built.
u1 = np.array(v1, copy=True)

proj_u1_v2 = u1 * (v2.dot(u1) / u1.dot(u1))
u2 = v2 - proj_u1_v2

proj_u1_v3 = u1 * (v3.dot(u1) / u1.dot(u1))
proj_u2_v3 = u2 * (v3.dot(u2) / u2.dot(u2))
u3 = (v3 - proj_u1_v3) - proj_u2_v3

# Scale each orthogonal vector to unit length to get the orthonormal basis.
q1, q2, q3 = (vec / np.linalg.norm(vec) for vec in (u1, u2, u3))

print("Orthogonal basis vectors:")
print(f"u1: {u1}", f"u2: {u2}", f"u3: {u3}", sep="\n")
print()

# Every pairwise dot product should be zero up to rounding error.
print("Verification of orthogonality:")
print(
    "\n".join(
        f"{label}: {np.dot(left, right):.3f}"
        for label, left, right in (
            ("u1 · u2", u1, u2),
            ("u1 · u3", u1, u3),
            ("u2 · u3", u2, u3),
        )
    )
)

print()

# ============================================================================
# Part (c): Projection of Price Vector
# ============================================================================
print("=" * 70)
print("PART (c): Projection of Price Vector")
print("=" * 70)

# Method 1: expand y in the orthonormal basis {q1, q2, q3} from Part (b).
# For an orthonormal basis the projection is the sum of (y . q_i) * q_i.
y_proj = (np.dot(y, q1) * q1 +
          np.dot(y, q2) * q2 +
          np.dot(y, q3) * q3)

print(f"Projection of y onto column space: {y_proj}")
print()

# Method 2: projection matrix P = X (X^T X)^{-1} X^T applied to y.
# X @ pinv(X) equals that formula when X has full column rank, but it does
# not form an explicit inverse of X^T X and still yields a valid projector
# if the feature columns ever become linearly dependent.
X_feat = np.column_stack([s, b, a])
P = X_feat @ np.linalg.pinv(X_feat)
y_proj_matrix = P @ y

# Show the second result and cross-check it against the Gram-Schmidt one;
# previously y_proj_matrix was computed and then silently discarded.
print(f"Projection of y via projection matrix: {y_proj_matrix}")
print(f"Both methods agree: {np.allclose(y_proj, y_proj_matrix)}")
print()

print("Interpretation in context of linear regression:")
print("TODO: Explain what this projection represents")
# NOTE(review): X_feat has no column of ones, so this projection corresponds
# to a least-squares fit without an intercept term.
print("""
This projection represents the predicted prices from a linear model using Size, 
Bedrooms, and Age as features to predict Prices. It's the closest point to the 
prices in the column space of features. This is essentially the output of a 
linear regression model fitted to the data.
""")
print()

# ============================================================================
# Part (d): Cosine Similarity
# ============================================================================
print("=" * 70)
print("PART (d): Cosine Similarity")
print("=" * 70)

# cos(theta) = (s . b) / (||s|| * ||b||), computed on the raw (uncentered)
# size and bedroom vectors.
dot_product_sb = np.dot(s, b)
norm_s = np.linalg.norm(s)
norm_b = np.linalg.norm(b)
cos_similarity = dot_product_sb / (norm_s * norm_b)

print(f"Cosine similarity between size and bedrooms: {cos_similarity:.4f}")

# Clip before arccos so rounding that pushes the ratio a hair outside
# [-1, 1] cannot produce NaN.
angle_rad = np.arccos(np.clip(cos_similarity, -1.0, 1.0))
angle_deg = np.degrees(angle_rad)

print(f"Angle between vectors: {angle_deg:.4f}°")
print()

print("Interpretation:")
print("TODO: Explain what this cosine similarity tells us about the relationship")
print("between house size and number of bedrooms")
# The opening delimiter is exactly three quotes; a fourth quote here would be
# printed as a stray '"' at the start of the interpretation.
# NOTE(review): cosine similarity of uncentered vectors is not the same
# quantity as Pearson correlation, although the text below calls it
# correlation. Wording left as written for the author to confirm.
print(f"""
Cosine similarity of {cos_similarity:.4f} indicates very strong positive 
correlation between Size and bedrooms, this means they are almost perfectly 
aligned, suggesting multicollinearity. This means as the size of the house 
increases, the number of bedrooms also tends to increases proportionally and
vice versa.
""")
print()

# ============================================================================
# Additional Analysis (Optional)
# ============================================================================
print("=" * 70, "ADDITIONAL ANALYSIS", "=" * 70, sep="\n")

# Pairwise correlation coefficients between the three feature columns
# (size, bedrooms, age).
print("Correlation matrix of features:")
features = np.column_stack((s, b, a))
# rowvar=False tells corrcoef that each column (not each row) is a variable.
correlation_matrix = np.corrcoef(features, rowvar=False)
print(correlation_matrix)

print()
print("=" * 70, "END OF ANALYSIS", "=" * 70, sep="\n")

GIVEN DATA
Size (s): [1000 1500 1200 2000 1800]
Bedrooms (b): [2 3 2 4 3]
Age (a): [ 5 10  2 20 15]
Prices (y): [200 300 240 400 350]

PART (a): Design Matrix and Linear Dependence
Design Matrix X:
[[   1 1000    2    5]
 [   1 1500    3   10]
 [   1 1200    2    2]
 [   1 2000    4   20]
 [   1 1800    3   15]]
Shape of X: (5, 4)

Rank of X: 4
Number of columns: 4
Are columns linearly independent? True

Determinant of X^T X: 14860000.000000093

Finding linear dependence relationship...
Columns are linearly independent - no non-trivial linear combination exists

TODO: Write your interpretation of what linear dependence means for ML models
Practical Implication:

Linear dependence means that the coefficient of one feature can be expressed as a combination of others, preventing unique solutions to a linear regression which then leads to infinitely many solutions. Also, models may become unstable and sensitive to small changes in data which can lead to overfitting. This is known as multic