# Housing Linear Regression (OLS)

This notebook loads the provided dataset, builds a linear regression model, and reports the required statistics and tests.

In [1]:
import numpy as np
import csv
from linear_regression import LinearRegression

## Load data and basic preprocessing

In [2]:
path = 'housing.csv'
with open(path, newline='', encoding='utf-8') as f:
    r = csv.DictReader(f)
    rows = list(r)
cols_num = [
    'longitude','latitude','housing_median_age','total_rooms','total_bedrooms',
    'population','households','median_income'
]
y_col = 'median_house_value'
cat_col = 'ocean_proximity'

n = len(rows)
X_num = np.empty((n, len(cols_num)), dtype=float)
y = np.empty(n, dtype=float)
cat = np.empty(n, dtype=object)
for i, row in enumerate(rows):
    for j, c in enumerate(cols_num):
        v = row[c]
        X_num[i, j] = float(v) if v != '' else np.nan
    y[i] = float(row[y_col])
    cat[i] = row[cat_col]

col_means = np.nanmean(X_num, axis=0)
inds = np.where(np.isnan(X_num))
X_num[inds] = np.take(col_means, inds[1])

## Dataset exploration

In [3]:
missing_numeric = {
    name: int(np.isnan(X_num[:, j]).sum())
    for j, name in enumerate(cols_num)
}

ocean_categories, ocean_counts = np.unique(cat.astype(str), return_counts=True)
category_counts = {c: int(n) for c, n in zip(ocean_categories.tolist(), ocean_counts.tolist())}

{
    'n_rows': int(n),
    'numeric_missing_per_column': missing_numeric,
    'ocean_proximity_counts': category_counts,
}

{'n_rows': 20640,
 'numeric_missing_per_column': {'longitude': 0,
  'latitude': 0,
  'housing_median_age': 0,
  'total_rooms': 0,
  'total_bedrooms': 0,
  'population': 0,
  'households': 0,
  'median_income': 0},
 'ocean_proximity_counts': {'<1H OCEAN': 9136,
  'INLAND': 6551,
  'ISLAND': 5,
  'NEAR BAY': 2290,
  'NEAR OCEAN': 2658}}

### Diskussion kring särdragsval (Feature Selection)
Baserat på utforskningen av datasetet har jag valt att inkludera samtliga tillgängliga särdrag i modellen.

 1. **Kategorisk data:**  Särdraget ocean_proximity inkluderas och kommer att transformeras via "one-hot encoding".

2. **Multikollinearitet:** Det finns sannolikt korrelationer mellan särdrag som total_rooms, total_bedrooms och population. Även om detta introducerar multikollinearitet tillåter instruktionerna att man inkluderar alla särdrag ("Ett val... är att inkludera allt"). Jag väljer att behålla dem för att kunna analysera deras individuella signifikans och förlitar mig på modellens stabilitet (genom användning av pseudoinvers) för att hantera beroenden.

## Build design matrix and fit model

In [4]:
model = LinearRegression(confidence_level=0.95, add_intercept=True, drop_first_category=True)
X_cat, categories = model.one_hot_encode(cat, drop_first=model.drop_first_category)
X = np.column_stack([X_num, X_cat])
feature_names = cols_num + [f'{cat_col}={c}' for c in categories]
model.fit(X, y, feature_names=feature_names)
feature_names[:3], len(feature_names), model.n, model.d

(['longitude', 'latitude', 'housing_median_age'], 12, 20640, 12)

## Error metrics

In [5]:
base_metrics = {
    'variance': model.variance(),
    'standard_deviation': model.standard_deviation(),
    'rmse': model.rmse(),
}
base_metrics

{'variance': 4723656867.17214,
 'standard_deviation': 68728.8648762086,
 'rmse': 68707.2172023851}

## Overall model relevance and significance

In [6]:
advanced_overall = {
    'f_test': model.f_test(),
    'r2': model.r2(),
}
advanced_overall

{'f_test': {'f_stat': 3129.586475162512,
  'df1': 12,
  'df2': 20627,
  'p_value': 0.0},
 'r2': 0.6454747751244776}

## Individual coefficient tests and confidence intervals

In [7]:
model.summary()

{'n_observations': 20640,
 'n_features': 12,
 'coefficients': {'names': ['Intercept',
   'longitude',
   'latitude',
   'housing_median_age',
   'total_rooms',
   'total_bedrooms',
   'population',
   'households',
   'median_income',
   'ocean_proximity=INLAND',
   'ocean_proximity=ISLAND',
   'ocean_proximity=NEAR BAY',
   'ocean_proximity=NEAR OCEAN'],
  'values': array([-2.23571682e+06, -2.64582884e+04, -2.51971980e+04,  1.05786379e+03,
         -4.77253030e+00,  7.22895865e+01, -3.92618423e+01,  7.69408627e+01,
          3.87749139e+04, -3.97196778e+04,  1.56041972e+05, -3.68975002e+03,
          4.74740262e+03]),
  'std_errors': array([8.74951138e+04, 1.01403497e+03, 9.99907740e+02, 4.37043110e+01,
         7.71501537e-01, 5.98410703e+00, 1.06411314e+00, 6.69617563e+00,
         3.32446551e+02, 1.73641175e+03, 3.07715289e+04, 1.90595976e+03,
         1.56268831e+03]),
  't_stats': array([-25.55247634, -26.09208681, -25.19952296,  24.20502161,
          -6.18602825,  12.08026296, 

## Dependency checks (Pearson correlation)

Correlations are computed across the feature columns used in the model and **exclude the intercept**.

In [None]:
pearson = model.pearson_pairs(X, include_intercept=False)
r_matrix = pearson['r']
p_matrix = pearson['p_value']



n_feats = len(feature_names)
print(f"Correlation Matrix ({n_feats}x{n_feats}):\n")


print("     " + " ".join([f"{i:>5}" for i in range(n_feats)]))

for i, name in enumerate(feature_names):
   
    row_str = f"{i:>2} | "
   
    row_str += " ".join([f"{r_matrix[i, j]:>5.2f}" for j in range(n_feats)])
    print(row_str)

print("\nLegend:")
for i, name in enumerate(feature_names):
    print(f"{i}: {name}")

Correlation Matrix (12x12):

         0     1     2     3     4     5     6     7     8     9    10    11
 0 |  1.00 -0.92 -0.11  0.04  0.07  0.10  0.06 -0.02 -0.06  0.01 -0.47  0.05
 1 | -0.92  1.00  0.01 -0.04 -0.07 -0.11 -0.07 -0.08  0.35 -0.02  0.36 -0.16
 2 | -0.11  0.01  1.00 -0.36 -0.32 -0.30 -0.30 -0.12 -0.24  0.02  0.26  0.02
 3 |  0.04 -0.04 -0.36  1.00  0.93  0.86  0.92  0.20  0.03 -0.01 -0.02 -0.01
 4 |  0.07 -0.07 -0.32  0.93  1.00  0.87  0.97 -0.01 -0.01 -0.00 -0.02  0.00
 5 |  0.10 -0.11 -0.30  0.86  0.87  1.00  0.91  0.00 -0.02 -0.01 -0.06 -0.02
 6 |  0.06 -0.07 -0.30  0.92  0.97  0.91  1.00  0.01 -0.04 -0.01 -0.01  0.00
 7 | -0.02 -0.08 -0.12  0.20 -0.01  0.00  0.01  1.00 -0.24 -0.01  0.06  0.03
 8 | -0.06  0.35 -0.24  0.03 -0.01 -0.02 -0.04 -0.24  1.00 -0.01 -0.24 -0.26
 9 |  0.01 -0.02  0.02 -0.01 -0.00 -0.01 -0.01 -0.01 -0.01  1.00 -0.01 -0.01
10 | -0.47  0.36  0.26 -0.02 -0.02 -0.06 -0.01  0.06 -0.24 -0.01  1.00 -0.14
11 |  0.05 -0.16  0.02 -0.01  0.00 -0.02  0.00 

## Confidence level example

In [9]:
model_99 = LinearRegression(confidence_level=0.99, add_intercept=True, drop_first_category=True)
model_99.fit(X, y, feature_names=feature_names)

model_99.summary()

{'n_observations': 20640,
 'n_features': 12,
 'coefficients': {'names': ['Intercept',
   'longitude',
   'latitude',
   'housing_median_age',
   'total_rooms',
   'total_bedrooms',
   'population',
   'households',
   'median_income',
   'ocean_proximity=INLAND',
   'ocean_proximity=ISLAND',
   'ocean_proximity=NEAR BAY',
   'ocean_proximity=NEAR OCEAN'],
  'values': array([-2.23571682e+06, -2.64582884e+04, -2.51971980e+04,  1.05786379e+03,
         -4.77253030e+00,  7.22895865e+01, -3.92618423e+01,  7.69408627e+01,
          3.87749139e+04, -3.97196778e+04,  1.56041972e+05, -3.68975002e+03,
          4.74740262e+03]),
  'std_errors': array([8.74951138e+04, 1.01403497e+03, 9.99907740e+02, 4.37043110e+01,
         7.71501537e-01, 5.98410703e+00, 1.06411314e+00, 6.69617563e+00,
         3.32446551e+02, 1.73641175e+03, 3.07715289e+04, 1.90595976e+03,
         1.56268831e+03]),
  't_stats': array([-25.55247634, -26.09208681, -25.19952296,  24.20502161,
          -6.18602825,  12.08026296, 