# Get the data

In [2]:
import os
import tarfile
from six.moves import urllib

# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download ``housing.tgz`` (unless already present) and extract it.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives both ``housing.tgz`` and its extracted contents.
        It is created when missing.

    Returns
    -------
    None
    """
    # exist_ok avoids the check-then-create race of isdir() followed by makedirs().
    os.makedirs(housing_path, exist_ok=True)

    tgz_path = os.path.join(housing_path, "housing.tgz")
    if not os.path.isfile(tgz_path):  # download data if not already there
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer never leaves a truncated "housing.tgz" that
        # later calls would mistake for a complete archive.
        partial_path = tgz_path + ".part"
        urllib.request.urlretrieve(housing_url, partial_path)
        os.replace(partial_path, tgz_path)

    # The archive comes from the network: where the interpreter supports
    # extraction filters, restrict members to plain data inside housing_path.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_kwargs)

In [3]:
# Fetch and unpack the housing archive using the default URL and target directory.
fetch_housing_data()

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [5]:
# Load the extracted CSV from the default housing directory.
housing = load_housing_data()
# Last expression of the cell: displays the first five rows.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Prepare the data for Machine Learning algorithms

In [41]:
# Separate the target column (labels) from the predictors of the training set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])  # features only

### A note on sklearn object design:

    - Estimators: any object that estimates some parameters from data. In the above, the imputer is an estimator. Estimators must have a fit() method that takes the dataset as input. Any other parameters are considered hyperparameters, e.g. the strategy hyperparameter of the imputer.

    - Transformers: these are estimators which can transform the dataset. They need to implement the transform() method. All transformers also have a fit_transform() method equivalent to calling fit() and then transform(). Sometimes fit_transform() is better optimized for efficiency, so it is usually best to call it instead of fit() followed by transform(). The imputer above is actually a transformer.

    - Predictors: these are estimators which can make predictions. LinearRegression model is a predictor. Predictors must implement a predict() method. They also have a score() method that measures the quality of the predictions.


# Implement a LinRegStatsmodels Predictor Class

In [80]:
import statsmodels.api as sm
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class LinRegStatsmodels(BaseEstimator, RegressorMixin):
    """Scikit-learn compatible predictor backed by a statsmodels OLS regression.

    Parameters
    ----------
    fit_intercept : bool, default=True
        When True, a constant column is added to ``X`` in both ``fit`` and
        ``predict`` so that the model estimates an intercept.
    cov_type : str, default="nonrobust"
        Covariance estimator forwarded to ``OLS.fit`` (for example ``"HC3"``
        for heteroskedasticity-robust standard errors). The default matches
        the statsmodels default.

    Attributes
    ----------
    model : statsmodels OLS model, or None before fitting
        The model built from the training data.
    results : statsmodels regression results, or None before fitting
        The fitted results; exposes ``summary()``, ``get_robustcov_results()``
        and is what ``predict`` delegates to.
    """

    def __init__(self, fit_intercept=True, cov_type="nonrobust"):
        self.fit_intercept = fit_intercept
        self.cov_type = cov_type
        self.model = None
        self.results = None

    def _design_matrix(self, X):
        """Return ``X`` with an intercept column when ``fit_intercept`` is set."""
        if not self.fit_intercept:
            return X
        # has_constant="add" is required: the default ("skip") returns X
        # unchanged whenever X already holds a constant column, which is
        # always the case for a single-row X. The prediction would then be
        # made without the intercept column and fail on a shape mismatch.
        return sm.add_constant(X, has_constant="add")

    def fit(self, X, y):
        """Fit an OLS regression of ``y`` on ``X`` and return ``self``."""
        # Keep the model and its fitted results as two distinct objects.
        self.model = sm.OLS(y, self._design_matrix(X))
        self.results = self.model.fit(cov_type=self.cov_type)
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no fitted results are available yet.
        """
        if self.results is None:
            # Local import: keeps the block self-contained; sklearn is already
            # a dependency of this class.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This LinRegStatsmodels instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        return self.results.predict(self._design_matrix(X))


In [81]:
from sklearn.pipeline import Pipeline

# Chain the preprocessing pipeline with the statsmodels-backed OLS predictor.
full_pipeline_with_predictor = Pipeline(
    steps=[
        ("preparation", full_pipeline),
        ("linear", LinRegStatsmodels()),
    ]
)

In [82]:
# Fit the preprocessing steps and the final OLS predictor on the training features and labels.
full_pipeline_with_predictor.fit(housing, housing_labels)

# The "linear" step exposes the statsmodels results; print their regression summary table.
print(full_pipeline_with_predictor.named_steps["linear"].results.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.648
Model:                            OLS   Adj. R-squared:                  0.648
Method:                 Least Squares   F-statistic:                     2026.
Date:                Tue, 03 Oct 2023   Prob (F-statistic):               0.00
Time:                        23:50:18   Log-Likelihood:            -2.0731e+05
No. Observations:               16512   AIC:                         4.147e+05
Df Residuals:                   16496   BIC:                         4.148e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.974e+05   8110.732     24.343      0.0

In [83]:
# Take the results already fitted by the "linear" step and request a copy whose
# covariance matrix is computed with the HC3 estimator (no refit of the pipeline).
heteroskedastic_results = full_pipeline_with_predictor.named_steps["linear"].results.get_robustcov_results(cov_type='HC3')

# Print the summary table for the HC3 results.
print(heteroskedastic_results.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.648
Model:                            OLS   Adj. R-squared:                  0.648
Method:                 Least Squares   F-statistic:                 1.044e+04
Date:                Tue, 03 Oct 2023   Prob (F-statistic):               0.00
Time:                        23:50:18   Log-Likelihood:            -2.0731e+05
No. Observations:               16512   AIC:                         4.147e+05
Df Residuals:                   16496   BIC:                         4.148e+05
Df Model:                          15                                         
Covariance Type:                  HC3                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.974e+05   1.06e+04     18.626      0.0

**Interpretation**:
The model with homoscedasticity appears to have a relatively good fit, explaining about 64.8% of the variance in median_house_value. Several predictors are statistically significant, given their p-values, but it's crucial to understand what each predictor represents for practical interpretation. However, based on the Omnibus and Jarque-Bera tests, the residuals are not normally distributed, which might be a concern. Additionally, the high condition number suggests potential multicollinearity among the predictors, which might mean some predictors are redundant or correlated.

For the model with heteroskedastic standard errors, the coefficients' estimates remain the same, but the standard errors have changed, affecting their significance levels. This indicates the importance of accounting for heteroscedasticity when it's present, as it can affect the conclusions about the importance of some predictors. A few variables (like x10 and x5) which seemed significant or borderline significant in the initial model, now appear to be much less significant in the heteroscedasticity-corrected model. This emphasizes that failing to account for heteroscedasticity can lead to incorrect inferences about the significance of predictors.

Overall, the heteroscedasticity-corrected model offers a more reliable interpretation of the predictors' significance and their standard errors. The model fit measures like R-squared remain unchanged, but the conclusions regarding individual predictors' importance have been adjusted.

In [84]:
# median_income is not given for the new district, so the training-set median stands in for it.
median_income_value = housing["median_income"].median()

# One-row frame describing the district to predict, with the same feature columns as `housing`.
new_data = pd.DataFrame([
    {
        'longitude': -118.8,
        'latitude': 34.19,
        'housing_median_age': 4.0,
        'total_rooms': 15572.0,
        'total_bedrooms': 2222.0,
        'population': 5495.0,
        'households': 2152.0,
        'median_income': median_income_value,
        'ocean_proximity': "<1H OCEAN",
    }
])


In [85]:
# Apply the preprocessing pipeline (transform only, no refit) to the single new row.
prepared_new_data = full_pipeline.transform(new_data)

# NOTE(review): `lin_reg` is not defined in this section — presumably the sklearn
# LinearRegression fitted earlier on full_pipeline output; confirm before re-running.
predicted_value = lin_reg.predict(prepared_new_data)
print(f"Predicted House Value: {predicted_value}")

Predicted House Value: [241862.50684923]


### How to estimate

The features median_income and housing_median_age are missing from the data given. To make the data fit the pipeline, I substituted the median of median_income and the housing age for these two features. I used the median of all median_income values to minimize the impact of this factor on the prediction without changing the trained model. I used the housing age in place of housing_median_age because I considered the two to have a similar meaning in this context. These two substitutions might affect the accuracy of the final prediction, but confidence intervals can help quantify the uncertainty of our prediction.

Since I used sklearn's LinearRegression model trained in the lecture code, which doesn't provide a built-in way to get prediction intervals or confidence intervals, I will have to compute confidence intervals by hand using the model's coefficients, residuals, and the input data.

## Generalizations: Heteroskedasticity

In [86]:
# Directory holding the two input CSV files. It defaults to the original
# location and can be overridden through the FACTOR_DATA_DIR environment
# variable, so the notebook is not tied to one machine's absolute path.
FACTOR_DATA_DIR = os.environ.get("FACTOR_DATA_DIR", "/Users/Eric/opt/anaconda3/envs/dsm")

# Daily factor file: the first three lines are skipped before the header row,
# and the date arrives as YYYYMMDD in the unnamed first column.
ff5 = pd.read_csv(os.path.join(FACTOR_DATA_DIR, 'F-F_Research_Data_5_Factors_2x3_daily.CSV'), skiprows=3)
ff5['Date'] = pd.to_datetime(ff5['Unnamed: 0'].astype(str), format='%Y%m%d')
ff5 = ff5.drop(columns='Unnamed: 0')

# Fund prices: the return is the percentage change between consecutive
# adjusted closes (assumes rows are in chronological order — TODO confirm).
qmnix = pd.read_csv(os.path.join(FACTOR_DATA_DIR, 'QMNIX.csv'))
qmnix['Return'] = qmnix['Adj Close'].pct_change()
qmnix['Date'] = pd.to_datetime(qmnix['Date'])
qmnix = qmnix[['Date', 'Return']]

# Keep only dates present in both sources, then drop rows with missing values
# (this includes the first return, which pct_change leaves as NaN).
merged_data = pd.merge(ff5, qmnix, on='Date', how='inner').dropna()

In [87]:
# Regressors are the five factor columns; the response is the fund's daily return.
X = merged_data[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']]
y = merged_data['Return']

# Build the OLS model with an explicit intercept column and fit it with
# heteroskedasticity-robust (HC3) standard errors.
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const)
results = model.fit(cov_type='HC3')

# Attach the model and its fitted results to the wrapper instance.
regressor = LinRegStatsmodels()
regressor.model, regressor.results = model, results

print(regressor.results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.237
Method:                 Least Squares   F-statistic:                     75.58
Date:                Tue, 03 Oct 2023   Prob (F-statistic):           2.77e-73
Time:                        23:50:18   Log-Likelihood:                 8736.4
No. Observations:                2215   AIC:                        -1.746e+04
Df Residuals:                    2209   BIC:                        -1.743e+04
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0001      0.000      1.293      0.1

## (b)

### Differences:

**Covariance Type**: 

This regression used heteroskedasticity-robust standard errors (`Covariance Type: HC3`). This method provides standard errors that are robust to violations of the homoscedasticity assumption. 
The previous regression used non-robust standard errors (`Covariance Type: nonrobust`). This means that the standard errors in the previous regression may be biased if there's heteroskedasticity in the residuals.

**Coefficients and Significance**:

The coefficients (coef) of the factors in both regressions are fairly similar.
However, when comparing the significance levels (P>|z| vs P>|t|), some discrepancies arise. For instance, the factor `Mkt-RF` is significant at the 10% level in the previous regression but not in this one.

**Model Metrics**:
  
The `R-squared` values are the same in both regressions, meaning that the proportion of variance explained by the regressors remains consistent across both models.
The `Adj. R-squared` also remains consistent, which adjusts the R-squared value based on the number of predictors in the model.
The F-statistic is different between the two models, but both have extremely low Prob (F-statistic) values, implying the joint significance of the coefficients.

**Test Statistics**:

Both regressions have Omnibus, Durbin-Watson, Jarque-Bera, Skew, and Kurtosis statistics. The values of these statistics are almost the same, meaning that the overall characteristics of the residuals (like their distribution, autocorrelation, etc.) remain similar.

The main differences are in the standard errors due to heteroskedasticity adjustment. The model above is better equipped to handle potential heteroskedasticity in the data. The model with homoscedasticity assumption offers a more parsimonious approach and could be preferred if heteroskedasticity is not a concern.