## G Part

### Imports

In [None]:
from lin_reg import LinearRegression
import numpy as np

# Path of the CSV file to load and the regression model fitted further down.
datafile = "housing.csv"
model = LinearRegression()

# Read the CSV into a structured array (one named field per column).
# Empty cells are treated as missing and filled with NaN so they can be
# detected in the cleaning step.
read_options = {
    "delimiter": ",",
    "names": True,       # take field names from the header row
    "dtype": None,       # let numpy infer a dtype per column
    "encoding": "utf-8",
    "missing_values": "",
    "filling_values": np.nan,
}
data = np.genfromtxt(datafile, **read_options)

### Cleaning & Organizing

In [None]:
# Column used as the regression target.
target_y = "median_house_value"

# Numeric predictor columns.
num_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

# Categorical predictor columns.
cat_features = ["ocean_proximity"]

# Every column that must be present for a row to be kept.
all_features = [target_y, *num_features, *cat_features]

# Start by keeping every row, then drop rows with a missing value in any
# of the columns above.
mask = np.ones(len(data), dtype=bool)

for col in all_features:
    column = data[col]
    if column.dtype.kind in "fi":
        # Float / integer columns: a missing value shows up as NaN.
        is_missing = np.isnan(column)
    else:
        # Other (text) columns: missing means empty string or literal "nan".
        is_missing = (column == "") | (column == "nan")
    mask &= ~is_missing

clean_data = data[mask]

### Extra - Cleaning stats

In [None]:
# Report, per column, how many entries count as missing in the raw data,
# using the same criteria as the cleaning mask (NaN for numeric columns,
# "" or "nan" for the others).
print("Missing Values Before Cleaning:")
for col in all_features:
    column = data[col]
    is_numeric = column.dtype.kind in "fi"
    missing_flags = (
        np.isnan(column) if is_numeric else (column == "") | (column == "nan")
    )
    missing_count = np.sum(missing_flags)
    print(f"{col}: {missing_count} missing values")

# Row counts before and after dropping incomplete rows.
rows_before = len(data)
rows_after = len(clean_data)

print(f"\nTotal rows before: {rows_before}")
print("\nAfter Cleaning:")
print(f"Total rows remaining: {rows_after}")
print(f"Total rows lost: {rows_before - rows_after}")

### Categorical Features

In [None]:
# Distinct categories (np.unique returns them sorted) together with how many
# rows fall in each one, computed in a single vectorized pass instead of a
# Python loop over every row per category.
ocean_col = clean_data["ocean_proximity"]
unique_cats, cat_counts = np.unique(ocean_col, return_counts=True)
print(f"Unique categories:{unique_cats}")

for cat, count in zip(unique_cats, cat_counts):
    print(f"{cat}:{count} houses")


# One-hot encode every category except the last one; the dropped category
# acts as the reference level, so the dummy columns are not collinear with
# the intercept column added later.
cats_to_encode = unique_cats[:-1]
encoded_list = []

for cat in cats_to_encode:
    # Vectorized 0/1 indicator (float64, same dtype as the np.zeros column
    # the row-by-row version produced).
    binary_col = (ocean_col == cat).astype(float)
    encoded_list.append(binary_col)
    print(f"Encoded: {cat}")

if encoded_list:
    ocean_encoded = np.column_stack(encoded_list)
else:
    # Only one category present: nothing to encode. np.column_stack([])
    # would raise ValueError, so use an explicit (n_rows, 0) block that can
    # still be stacked next to the numeric features.
    ocean_encoded = np.empty((len(clean_data), 0))
print(f"Encoded shape: {ocean_encoded.shape}")

### Preparing the data for the model

In [None]:
# Target vector taken from the cleaned data.
y = clean_data[target_y]

# Numeric features as a 2-D array, one column per feature, in the order
# given by num_features.
num_cols = [clean_data[feature] for feature in num_features]
X_num = np.column_stack(num_cols)

# Numeric features followed by the encoded ocean_proximity columns.
X_all = np.concatenate((X_num, ocean_encoded), axis=1)

# Prepend a column of ones so the model can estimate an intercept term.
intercept = np.ones(len(y))
X = np.column_stack((intercept, X_all))

### Train and use the model

In [None]:
# Fit the model on the design matrix (intercept column included) and the
# target vector built above.
model.fit(X, y)

# Summary statistics as reported by the model for the same data.
variance = model.variance_calc(X, y)
std_dev = model.st_dev_calc(X, y)
rmse = model.rmse_calc(X, y)

# model.n / model.d are read after fitting and printed as-is.
summary_lines = [
    f"n (sample size): {model.n}",
    f"d (features): {model.d}",
    f"Variance (σ²): {variance:.2f}",
    f"Standard Deviation: {std_dev:.2f}",
    f"RMSE: {rmse:.2f}",
]
for line in summary_lines:
    print(line)


## VG Part

### $R^2$

In [None]:
# R² for the fitted model on the same X and y used for fitting,
# printed with two decimals.
r_squared = model.r_squared_calc(X, y)
print(f"R Squared : {r_squared:.2f}")

### F-Statistic & p-value

In [None]:
# f_test returns a pair that is unpacked into the F-statistic and its
# p-value; the p-value is shown with 10 decimals so very small values are
# still visible.
f_test_result = model.f_test(X, y)
f_stat, p_value = f_test_result
print(f"F-statistic: {f_stat:.2f} \np_value:{p_value:.10f}")

### Pearson Correlation

In [None]:
# Correlation between the columns of X as computed by the model.
# NOTE(review): X still contains the constant intercept column of ones;
# confirm that pearson_corr excludes it or handles zero variance.
pearson = model.pearson_corr(X)
correlation_report = f"Correlation between features: {pearson}"
print(correlation_report)