## G Part

### Imports
*Initialize the environment and the Linear Regression class*

In [1]:
from lin_reg import LinearRegression
import numpy as np

# Load the housing CSV into a structured array: the header row supplies the
# field names, per-column dtypes are inferred (dtype=None), and empty cells
# are filled with NaN.
datafile = "housing.csv"
data = np.genfromtxt(
    datafile,
    delimiter=",",
    names=True,
    dtype=None,
    encoding="utf-8",
    missing_values="",
    filling_values=np.nan,
)

# Regression model instance used by the cells further down.
model = LinearRegression()

### Cleaning & Organizing
*Remove incomplete rows*

In [2]:
# Column roles: the regression target, the numeric predictors and the single
# categorical predictor.
target_y = "median_house_value"
num_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
cat_features = ["ocean_proximity"]
all_features = [target_y, *num_features, *cat_features]

# Start by keeping every row, then drop a row as soon as any inspected column
# is missing there: NaN for float/int columns, "" or "nan" for the others.
mask = np.ones(len(data), dtype=bool)
for col in all_features:
    column = data[col]
    if column.dtype.kind in "fi":
        present = ~np.isnan(column)
    else:
        present = (column != "") & (column != "nan")
    mask &= present

clean_data = data[mask]

### Extra - Cleaning stats
*Summary of the data lost during the filtering process*

In [3]:
# Per-column tally of missing entries in the raw data: NaN for float/int
# columns, "" or "nan" for the others.
print("Missing Values Before Cleaning:")
for col in all_features:
    column = data[col]
    if column.dtype.kind in "fi":
        is_missing = np.isnan(column)
    else:
        is_missing = (column == "") | (column == "nan")
    missing_count = np.sum(is_missing)
    print(f"{col}: {missing_count} missing values")

# Row counts before and after the filter.
rows_before = len(data)
rows_after = len(clean_data)
print(f"\nTotal rows before: {rows_before}")
print("\nAfter Cleaning:")
print(f"Total rows remaining: {rows_after}")
print(f"Total rows lost: {rows_before - rows_after}")

Missing Values Before Cleaning:
median_house_value: 0 missing values
longitude: 0 missing values
latitude: 0 missing values
housing_median_age: 0 missing values
total_rooms: 0 missing values
total_bedrooms: 207 missing values
population: 0 missing values
households: 0 missing values
median_income: 0 missing values
ocean_proximity: 0 missing values

Total rows before: 20640

After Cleaning:
Total rows remaining: 20433
Total rows lost: 207


### Categorical Features
*Transform text-based location data into binary numerical features*

In [4]:
# Distinct ocean_proximity labels (np.unique returns them sorted) together with
# the number of rows carrying each label, computed in one vectorized pass
# instead of a Python loop over every row per category.
proximity = clean_data["ocean_proximity"]
unique_cats, cat_counts = np.unique(proximity, return_counts=True)
print(f"Unique categories:{unique_cats}")

for cat, count in zip(unique_cats, cat_counts):
    print(f"{cat}:{count} houses")

# Dummy-encode every category except the last one in sorted order. The omitted
# category is represented by a row of all zeros, so the indicator columns never
# add up to a column of ones.
cats_to_encode = unique_cats[:-1]
encoded_list = []

for cat in cats_to_encode:
    # Element-wise comparison gives a boolean column; cast to 0.0 / 1.0.
    binary_col = (proximity == cat).astype(float)
    encoded_list.append(binary_col)
    print(f"Encoded: {cat}")

ocean_encoded = np.column_stack(encoded_list)
print(f"Encoded shape: {ocean_encoded.shape}")

Unique categories:['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
<1H OCEAN:9034 houses
INLAND:6496 houses
ISLAND:5 houses
NEAR BAY:2270 houses
NEAR OCEAN:2628 houses
Encoded: <1H OCEAN
Encoded: INLAND
Encoded: ISLAND
Encoded: NEAR BAY
Encoded shape: (20433, 4)


### Preparing the data for the model
*Constructing the feature matrix X and the target vector y*

In [5]:
# Target vector.
y = clean_data[target_y]

# Numeric predictors, one column per entry of num_features (same order).
num_cols = [clean_data[feature] for feature in num_features]
X_num = np.column_stack(num_cols)

# Design matrix layout: [ones | numeric predictors | ocean_proximity dummies].
X_all = np.hstack([X_num, ocean_encoded])
intercept = np.ones(len(y))
X = np.column_stack([intercept, X_all])

### Train and use the model
*Executing the OLS fit and calculating standard error metrics*

In [6]:
# Fit first; the metric helpers below are then called on the same X and y.
model.fit(X, y)

variance = model.variance_calc(X, y)
std_dev = model.st_dev_calc(X, y)
rmse = model.rmse_calc(X, y)

# Collect the report lines, then emit them in one go (one line per metric).
summary_lines = [
    f"n (sample size): {model.n}",
    f"d (features): {model.d}",
    f"Variance (σ²): {variance:.2f}",
    f"Standard Deviation: {std_dev:.2f}",
    f"RMSE: {rmse:.2f}",
]
print("\n".join(summary_lines))


n (sample size): 20433
d (features): 12
Variance (σ²): 4713776929.50
Standard Deviation: 68656.95
RMSE: 68635.11


## VG Part

### $R^2$
*Evaluating the goodness of fit and how much variance the model captures*

In [7]:
# R² as reported by the model's r_squared_calc helper, shown with two decimals.
r_squared = model.r_squared_calc(X, y)
r_squared_line = f"R Squared : {r_squared:.2f}"
print(r_squared_line)

R Squared : 0.65


### F-Statistic & p-value
*Testing the overall significance of the regression model*

In [8]:
# f_test returns a pair, unpacked here as (F-statistic, p-value).
f_test_result = model.f_test(X, y)
f_stat, p_value = f_test_result
print(f"F-statistic: {f_stat:.2f} \np_value:{p_value:.10f}")

F-statistic: 3111.61 
p_value:0.0000000000


### Pearson Correlation
*Analyzing linear relationships between feature pairs*

In [9]:
# Correlation matrix returned by the model's pearson_corr helper, printed in full.
pearson = model.pearson_corr(X)
correlation_report = f"Correlation between features: {pearson}"
print(correlation_report)

Correlation between features: [[ 1.         -0.92461611 -0.10935655  0.04548017  0.06960802  0.1002703
   0.05651277 -0.01555015  0.32083106 -0.05533745  0.00950071 -0.47471371]
 [-0.92461611  1.          0.01189907 -0.03666681 -0.06698283 -0.10899734
  -0.07177419 -0.07962632 -0.44692798  0.35108357 -0.01666228  0.35878451]
 [-0.10935655  0.01189907  1.         -0.3606283  -0.32045104 -0.2957873
  -0.30276797 -0.11827772  0.04555326 -0.23696771  0.01710531  0.25614946]
 [ 0.04548017 -0.03666681 -0.3606283   1.          0.9303795   0.85728125
   0.91899153  0.19788152 -0.00377684  0.0264775  -0.00760262 -0.02364721]
 [ 0.06960802 -0.06698283 -0.32045104  0.9303795   1.          0.87774674
   0.97972827 -0.00772285  0.01831423 -0.00646289 -0.00436147 -0.01987349]
 [ 0.1002703  -0.10899734 -0.2957873   0.85728125  0.87774674  1.
   0.9071859   0.00508662  0.07344951 -0.01960181 -0.01045053 -0.06147965]
 [ 0.05651277 -0.07177419 -0.30276797  0.91899153  0.97972827  0.9071859
   1.        

### T-tests
*Testing the individual significance of each coefficient*

In [10]:
# Per-coefficient t-statistics and p-values from the model's t_tests helper.
t_stats, p_values = model.t_tests(X, y)

# Display labels, matched to the statistics by position: intercept first, then
# the numeric predictors, then the ocean_proximity dummies.
feature_names = [
    "intercept",
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_<1H",
    "ocean_INLAND",
    "ocean_ISLAND",
    "ocean_NEAR_BAY",
]

# One row per coefficient, flagged at the 0.05 level.
for idx, t_value in enumerate(t_stats):
    if p_values[idx] < 0.05:
        sig = "Significant"
    else:
        sig = "NOT significant"
    print(f"{feature_names[idx]:20s}: {t_value:.4f}, {p_values[idx]:.6f} [{sig}]")



intercept           : -25.6395, 0.000000 [Significant]
longitude           : -26.2963, 0.000000 [Significant]
latitude            : -25.3629, 0.000000 [Significant]
housing_median_age  : 24.4389, 0.000000 [Significant]
total_rooms         : -7.8250, 0.000000 [Significant]
total_bedrooms      : 14.6402, 0.000000 [Significant]
population          : -35.2824, 0.000000 [Significant]
households          : 6.6589, 0.000000 [Significant]
median_income       : 116.1510, 0.000000 [Significant]
ocean_<1H           : -2.7258, 0.006421 [Significant]
ocean_INLAND        : -19.3633, 0.000000 [Significant]
ocean_ISLAND        : 4.8327, 0.000001 [Significant]
ocean_NEAR_BAY      : -3.7835, 0.000155 [Significant]


### Extra - Analyzing high correlations:
*Feature pairs with the strongest collinearity, and the coefficients with the largest absolute t-statistics*

In [25]:
# Cut-offs used in this cell.
CORR_THRESHOLD = 0.85  # minimum |r| for a feature pair to be listed
TOP_N = 5              # number of coefficients shown in the impact table

corr_matrix = model.pearson_corr(X)
actual_features = feature_names[1:]  # every label except "intercept"

# Labels are matched to the matrix purely by position, so fail loudly if the
# matrix does not have exactly one row and one column per non-intercept label.
expected_shape = (len(actual_features), len(actual_features))
assert np.shape(corr_matrix) == expected_shape, (
    f"correlation matrix shape {np.shape(corr_matrix)} does not match "
    f"{len(actual_features)} feature labels"
)

# Upper triangle only: each unordered pair once, no self-correlations.
high_corr_list = [
    (actual_features[i], actual_features[j], corr_matrix[i, j])
    for i in range(len(actual_features))
    for j in range(i + 1, len(actual_features))
    if abs(corr_matrix[i, j]) > CORR_THRESHOLD
]
high_corr_list.sort(key=lambda pair: abs(pair[2]), reverse=True)

print("Strongest correlations between features:")
print("----------------------------------------------")

for feat_a, feat_b, val in high_corr_list:
    print(f"{feat_a:18s} & {feat_b:18s}: {val:.2f}")

# Rank the non-intercept coefficients by the absolute value of their
# t-statistic and keep the TOP_N largest.
t_stats, _ = model.t_tests(X, y)

impact_list = [
    (feature_names[i], abs(t_stats[i])) for i in range(1, len(feature_names))
]
impact_list.sort(key=lambda item: item[1], reverse=True)
top_5 = impact_list[:TOP_N]

print("")
print(f"{'Feature':20s} | {'Impact (T-Stat)':>15s}")
print("---------------------------------------")

for name, impact in top_5:
    print(f"{name:20s} | {impact:15.2f}")

Strongest correlations between features:
----------------------------------------------
total_bedrooms     & households        : 0.98
total_rooms        & total_bedrooms    : 0.93
longitude          & latitude          : -0.92
total_rooms        & households        : 0.92
population         & households        : 0.91
total_bedrooms     & population        : 0.88
total_rooms        & population        : 0.86

Feature              | Impact (T-Stat)
---------------------------------------
median_income        |          116.15
population           |           35.28
longitude            |           26.30
latitude             |           25.36
housing_median_age   |           24.44


### Confidence Intervals
*Calculating the range of likely values for each parameter*

In [None]:
# Lower/upper interval bounds per coefficient at alpha=0.05, tabulated by name.
lower, upper = model.confidence_interval(X, y, alpha=0.05)

print("Feature              | Lower bound | Upper bound")
print("------------------------------------------------")

# Bounds are matched to the labels by position.
for idx, name in enumerate(feature_names):
    low, high = lower[idx], upper[idx]
    print(f"{name:20s} | {low:11.2f} | {high:11.2f}")

Feature              | Lower bound | Upper bound
------------------------------------------------
intercept            | -2438881.36 | -2092470.60
longitude            |   -28811.59 |   -24814.39
latitude             |   -27451.48 |   -23512.89
housing_median_age   |      986.50 |     1158.54
total_rooms          |       -7.74 |       -4.64
total_bedrooms       |       87.09 |      114.02
population           |      -40.08 |      -35.86
households           |       35.01 |       64.22
median_income        |    38597.06 |    39922.09
ocean_<1H            |    -7354.53 |    -1201.74
ocean_INLAND         |   -47972.11 |   -39152.75
ocean_ISLAND         |    88344.04 |   208903.57
ocean_NEAR_BAY       |   -12496.91 |    -3967.47
