## G Part

### Imports
*Initialize the environment and the Linear Regression class*

In [10]:
from lin_reg import LinearRegression
import numpy as np

# Compact array display: two decimals, no scientific notation, rows up to 100 chars.
np.set_printoptions(precision=2, suppress=True, linewidth=100)

datafile = "housing.csv"
model = LinearRegression()

# Read the CSV as a structured array: field names come from the header row,
# per-column types are inferred (dtype=None), and empty fields are filled with NaN.
data = np.genfromtxt(
    datafile,
    delimiter=",",
    names=True,
    dtype=None,
    encoding="utf-8",
    missing_values="",
    filling_values=np.nan,
)

### Cleaning & Organizing
*Remove incomplete rows*

In [11]:
# Column roles: regression target, numeric predictors, and the one text predictor.
target_y = "median_house_value"
num_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
cat_features = ["ocean_proximity"]
all_features = [target_y, *num_features, *cat_features]

# Start with every row kept, then drop any row that has a gap in at least one column.
mask = np.ones(len(data), dtype=bool)

for col in all_features:
    values = data[col]
    if values.dtype.kind in "fi":
        # Numeric column: missing entries are NaN.
        present = ~np.isnan(values)
    else:
        # Text column: a blank or the literal "nan" counts as missing.
        present = (values != "") & (values != "nan")
    mask &= present

clean_data = data[mask]

### Extra - Cleaning stats
*Summary of the data lost during the filtering process*

In [12]:
# Per-column count of missing entries in the unfiltered data, using the same
# definition of "missing" as the filtering step (NaN for numeric columns,
# blank or "nan" for text columns).
missing_results = []
for col in all_features:
    column = data[col]
    is_missing = (
        np.isnan(column)
        if column.dtype.kind in "fi"
        else (column == "") | (column == "nan")
    )
    missing_count = np.sum(is_missing)
    missing_results.append(f"{col}: {missing_count} missing values")

rows_before = len(data)
rows_remaining = len(clean_data)
(
    missing_results,
    {
        "Total rows before": rows_before,
        "Total rows remaining": rows_remaining,
        "Total rows lost": rows_before - rows_remaining,
    },
)

(['median_house_value: 0 missing values',
  'longitude: 0 missing values',
  'latitude: 0 missing values',
  'housing_median_age: 0 missing values',
  'total_rooms: 0 missing values',
  'total_bedrooms: 207 missing values',
  'population: 0 missing values',
  'households: 0 missing values',
  'median_income: 0 missing values',
  'ocean_proximity: 0 missing values'],
 {'Total rows before': 20640,
  'Total rows remaining': 20433,
  'Total rows lost': 207})

### Categorical Features
*Transform text-based location data into binary numerical features*

In [13]:
# Sorted unique category labels present in the cleaned data.
unique_cats = np.unique(clean_data["ocean_proximity"])

# Encode every label except the last one; rows belonging to the omitted label
# are represented by zeros in all encoded columns (it acts as the baseline).
cats_to_encode = unique_cats[:-1]

# Pull the text column out once, then build each 0/1 indicator column with a
# vectorised comparison instead of a per-row Python loop.
proximity = clean_data["ocean_proximity"]
encoded_list = [(proximity == cat).astype(float) for cat in cats_to_encode]

# One column per encoded category, one row per observation.
ocean_encoded = np.column_stack(encoded_list)
ocean_encoded.shape

(20433, 4)

### Preparing the data for the model
*Constructing the feature matrix X and the target vector y*

In [14]:
# Target vector.
y = clean_data[target_y]

# Numeric predictors, one column per feature in num_features order.
num_cols = [clean_data[feature] for feature in num_features]
X_num = np.column_stack(num_cols)

# Append the encoded category columns to the right of the numeric ones.
X_all = np.column_stack([X_num, ocean_encoded])

# Prepend a column of ones so the first coefficient is the intercept.
intercept = np.ones(len(y))
X = np.hstack([intercept[:, np.newaxis], X_all])

### Train and use the model
*Executing the OLS fit and calculating standard error metrics*

In [15]:
# Fit first; the metric helpers and model.n / model.d are read afterwards.
model.fit(X, y)

variance = model.variance_calc(X, y)
std_dev = model.st_dev_calc(X, y)
rmse = model.rmse_calc(X, y)

# Human-readable summary; the list is the cell's displayed value.
fit_summary = [
    f"n (sample size): {model.n}",
    f"d (features): {model.d}",
    f"Variance (σ²): {variance:.2f}",
    f"Standard Deviation: {std_dev:.2f}",
    f"RMSE: {rmse:.2f}",
]
fit_summary


['n (sample size): 20433',
 'd (features): 12',
 'Variance (σ²): 4713776929.50',
 'Standard Deviation: 68656.95',
 'RMSE: 68635.11']

## VG Part

### $R^2$
*Evaluating the goodness of fit and how much variance the model captures*

In [16]:
# Coefficient of determination as reported by the model, shown to four decimals.
r_squared = model.r_squared_calc(X, y)
r_squared_text = f"R-squared: {r_squared:.4f}"
r_squared_text

'R-squared: 0.6465'

### F-Statistic & p-value
*Testing the overall significance of the regression model*

In [17]:
# Overall F-test of the regression; both results are converted to plain
# floats before formatting.
f_stat, p_value = model.f_test(X, y)

f_test_summary = {}
f_test_summary["F-Statistic"] = round(float(f_stat), 2)
# Fixed-point with 10 decimals: a p-value smaller than that displays as all zeros.
f_test_summary["p-value"] = f"{float(p_value):.10f}"
f_test_summary

{'F-Statistic': 3111.61, 'p-value': '0.0000000000'}

### Pearson Correlation
*Analyzing linear relationships between feature pairs*

In [34]:
pearson = model.pearson_corr(X)

# Labels for every column of X, in column order ("intercept" is column 0).
feature_names = [
    "intercept",
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_<1H",
    "ocean_INLAND",
    "ocean_ISLAND",
    "ocean_NEAR_BAY",
]
# Three-letter column headings for the footer, in the same order as feature_names[1:].
short_names = ["LON", "LAT", "AGE", "RMS", "BED", "POP", "HHD", "INC", "<1H", "INL", "ISL", "BAY"]

# Row labels skip "intercept": matrix row k is labelled with feature_names[k + 1].
rows = [
    f"{label:20s}: {corr_row}"
    for label, corr_row in zip(feature_names[1:], pearson)
]

# Footer line of short names, indented to sit under the printed matrix columns.
footer = " " * 24 + "   ".join(short_names)
rows = rows + [footer]
rows

['longitude           : [ 1.   -0.92 -0.11  0.05  0.07  0.1   0.06 -0.02  0.32 -0.06  0.01 -0.47]',
 'latitude            : [-0.92  1.    0.01 -0.04 -0.07 -0.11 -0.07 -0.08 -0.45  0.35 -0.02  0.36]',
 'housing_median_age  : [-0.11  0.01  1.   -0.36 -0.32 -0.3  -0.3  -0.12  0.05 -0.24  0.02  0.26]',
 'total_rooms         : [ 0.05 -0.04 -0.36  1.    0.93  0.86  0.92  0.2  -0.    0.03 -0.01 -0.02]',
 'total_bedrooms      : [ 0.07 -0.07 -0.32  0.93  1.    0.88  0.98 -0.01  0.02 -0.01 -0.   -0.02]',
 'population          : [ 0.1  -0.11 -0.3   0.86  0.88  1.    0.91  0.01  0.07 -0.02 -0.01 -0.06]',
 'households          : [ 0.06 -0.07 -0.3   0.92  0.98  0.91  1.    0.01  0.04 -0.04 -0.01 -0.01]',
 'median_income       : [-0.02 -0.08 -0.12  0.2  -0.01  0.01  0.01  1.    0.17 -0.24 -0.01  0.06]',
 'ocean_<1H           : [ 0.32 -0.45  0.05 -0.    0.02  0.07  0.04  0.17  1.   -0.61 -0.01 -0.31]',
 'ocean_INLAND        : [-0.06  0.35 -0.24  0.03 -0.01 -0.02 -0.04 -0.24 -0.61  1.   -0.01 -0.24]',


### T-tests
*Testing the individual significance of each coefficient*

In [19]:
# Per-coefficient t-statistics and p-values, in the same order as feature_names.
t_stats, p_values = model.t_tests(X, y)

# One report line per coefficient: name, t-statistic (4 decimals), p-value (6 decimals).
results = [
    f"{name:20s}: {t_stat:8.4f}, {p_val:.6f}"
    for name, t_stat, p_val in zip(feature_names, t_stats, p_values)
]

results


['intercept           : -25.6395, 0.000000',
 'longitude           : -26.2963, 0.000000',
 'latitude            : -25.3629, 0.000000',
 'housing_median_age  :  24.4389, 0.000000',
 'total_rooms         :  -7.8250, 0.000000',
 'total_bedrooms      :  14.6402, 0.000000',
 'population          : -35.2824, 0.000000',
 'households          :   6.6589, 0.000000',
 'median_income       : 116.1510, 0.000000',
 'ocean_<1H           :  -2.7258, 0.006421',
 'ocean_INLAND        : -19.3633, 0.000000',
 'ocean_ISLAND        :   4.8327, 0.000001',
 'ocean_NEAR_BAY      :  -3.7835, 0.000155']

### Confidence Intervals
*Calculating the 95% confidence interval (α = 0.05) for each coefficient*

In [20]:
# Interval bounds per coefficient at alpha = 0.05, in feature_names order.
lower, upper = model.confidence_interval(X, y, alpha=0.05)

# Map each coefficient name to its bounds, rounded to two decimals for display.
ci_table = {
    name: {"Lower": round(low, 2), "Upper": round(high, 2)}
    for name, low, high in zip(feature_names, lower, upper)
}
ci_table

{'intercept': {'Lower': -2438881.36, 'Upper': -2092470.6},
 'longitude': {'Lower': -28811.59, 'Upper': -24814.39},
 'latitude': {'Lower': -27451.48, 'Upper': -23512.89},
 'housing_median_age': {'Lower': 986.5, 'Upper': 1158.54},
 'total_rooms': {'Lower': -7.74, 'Upper': -4.64},
 'total_bedrooms': {'Lower': 87.09, 'Upper': 114.02},
 'population': {'Lower': -40.08, 'Upper': -35.86},
 'households': {'Lower': 35.01, 'Upper': 64.22},
 'median_income': {'Lower': 38597.06, 'Upper': 39922.09},
 'ocean_<1H': {'Lower': -7354.53, 'Upper': -1201.74},
 'ocean_INLAND': {'Lower': -47972.11, 'Upper': -39152.75},
 'ocean_ISLAND': {'Lower': 88344.04, 'Upper': 208903.57},
 'ocean_NEAR_BAY': {'Lower': -12496.91, 'Upper': -3967.47}}

-------------------------------------------------------------------------------------------------------- 

### Extra - Analyzing high correlations:
*Highest multicollinearity*

In [21]:
corr_matrix = model.pearson_corr(X)
# Labels for the matrix rows/columns: every feature name except the leading "intercept".
actual_features = feature_names[1:]

# Pairs whose absolute correlation exceeds this value are reported.
threshold = 0.85

# Visit each unordered pair once: only entries above the diagonal.
high_corr_list = [
    (actual_features[row], actual_features[column], corr_matrix[row, column])
    for row in range(len(actual_features))
    for column in range(row + 1, len(actual_features))
    if abs(corr_matrix[row, column]) > threshold
]

# Strongest relationships first, regardless of sign.
high_corr_list = sorted(high_corr_list, key=lambda pair: abs(pair[2]), reverse=True)

[f"{feat_a:14s} & {feat_b:18s}: {val:.2f}" for feat_a, feat_b, val in high_corr_list]

['total_bedrooms & households        : 0.98',
 'total_rooms    & total_bedrooms    : 0.93',
 'longitude      & latitude          : -0.92',
 'total_rooms    & households        : 0.92',
 'population     & households        : 0.91',
 'total_bedrooms & population        : 0.88',
 'total_rooms    & population        : 0.86']

*Highest significance*

In [22]:
# Only the t-statistics are needed here. The p-values go to a descriptive
# throwaway name rather than `_`, which IPython uses for the last cell output.
t_stats, _unused_p_values = model.t_tests(X, y)

# Pair every non-intercept feature with the magnitude of its t-statistic
# (index 0 is the intercept and is skipped), largest magnitude first.
impact_list = sorted(
    ((name, abs(t_stat)) for name, t_stat in zip(feature_names[1:], t_stats[1:])),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the five features with the largest |t|.
[f"{name:18s} | {val:10.2f}" for name, val in impact_list[:5]]

['median_income      |     116.15',
 'population         |      35.28',
 'longitude          |      26.30',
 'latitude           |      25.36',
 'housing_median_age |      24.44']