## Optimization using the normal equation

This notebook performs univariate and multivariate linear fits using the normal equation method.


In [1]:
%matplotlib widget

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm

import sys
sys.path.append('/Users/ondrea/MLandstats/OStats/')
from ostats import ML
from ostats import dplot as dp
#from mpltools import color as colors

In [2]:
# Synthetic test data: 50 points scattered around the line y = 3x - 5.
# Despite its name, `data_range` is a seeded random generator (seed 1), so the
# same points are produced on every run.
data_range = np.random.RandomState(seed=1)
fakex = data_range.rand(50) * 10                 # uniform draws scaled to [0, 10)
fakey = data_range.randn(50) + (fakex * 3 - 5)   # line plus unit-variance Gaussian noise


!rm -rf ./data/housing*
!wget -P ./data https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv
    
#get some data for testing
!wget -P ./data https://datahub.io/core/global-temp/r/annual.csv

--2021-05-31 17:36:10--  https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘./data/housing.csv’


2021-05-31 17:36:11 (5.61 MB/s) - ‘./data/housing.csv’ saved [1423529/1423529]

--2021-05-31 17:36:11--  https://datahub.io/core/global-temp/r/annual.csv
Resolving datahub.io (datahub.io)... 104.21.40.221, 172.67.157.38
Connecting to datahub.io (datahub.io)|104.21.40.221|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://pkgstore.datahub.io/core/global-temp/annual_csv/data/a26b154688b061cdd04f1df36e4408be/annual_csv.csv [following]
--2021-05-31 17:36:12--  https://pkgstore.datahub.io/core/global-temp/an

In [3]:
# Read the second and third columns of the CSV (header row skipped) as two arrays.
year, tempa = np.loadtxt(
    './data/annual.csv',
    delimiter=",",
    skiprows=1,
    usecols=(1, 2),
    unpack=True,
)
# Truncate each year to an integer, then express it as a fraction of the largest year.
year = np.array([int(value) for value in year]) / np.max(year)
# Shift every temperature value up by one.
tempa = np.array(tempa) + 1

In [4]:
# Centre the years on their mean, divide by their full span (max - min), and
# shift by one so the rescaled values are centred on 1.
year = (year - year.mean()) / (year.max() - year.min()) + 1

In [5]:
#year

In [6]:
# Load the housing CSV (downloaded into ./data by the wget cell) as a DataFrame.
housing_data = pd.read_csv('./data/housing.csv')

In [7]:
# Show the loaded table as this cell's rich output.
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [8]:
# Clean the housing table and keep a small, reproducible subsample for the fit.

# 500001 is a placeholder shared by a bunch of rows rather than a real value, so drop them.
housing_data = housing_data[housing_data.median_house_value != 500001] #a bunch of values are set to this

# Remove rows missing either column used in the fit below; a single NaN would
# otherwise propagate through the regression.
housing_data = housing_data.dropna(subset=['median_house_value', 'total_bedrooms'])

# A fixed random_state makes the 30-row sample (and the resulting fit/plot)
# identical on every Restart & Run All.
housing_data = housing_data.sample(n=30, random_state=1) #smaller sample

In [9]:
# Extract the two columns used in the fit as plain NumPy arrays:
# median house value is the x variable, total bedrooms the y variable.
x_house = housing_data.loc[:, 'median_house_value'].to_numpy()
y_house = housing_data.loc[:, 'total_bedrooms'].to_numpy()

In [10]:
def plotfit(x, theta):
    """Evaluate the straight line given by the last entry of ``theta``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the line is evaluated.
    theta : sequence of sequences
        Sequence of parameter sets; only the final one is used, with element 0
        taken as the intercept and element 1 as the slope.

    Returns
    -------
    ``theta[-1][0] + x * theta[-1][1]``, with the same shape as ``x``.
    """
    final = theta[-1]
    intercept = final[0]
    slope = final[1]
    return intercept + x * slope

def plotfit2(x, theta):
    """Evaluate the line at several parameter sets taken from ``theta``.

    Ten evenly spaced positions between 0 and ``len(theta)`` are truncated to
    integer indices. The last position equals ``len(theta)`` and would be out
    of range, so only the first nine are used.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which each line is evaluated.
    theta : sequence of sequences
        Parameter sets; element 0 of each is the intercept, element 1 the slope.

    Returns
    -------
    list
        Nine evaluations of ``theta[k][0] + x * theta[k][1]``, in index order.
    """
    positions = [int(p) for p in np.linspace(0, len(theta), 10)]
    # Drop the final position (== len(theta)), which is not a valid index.
    return [theta[k][0] + x * theta[k][1] for k in positions[:-1]]

In [11]:
# Fit the synthetic data with the normal-equation solver and plot the result.
ifig = 0
plt.close(ifig)  # close any existing figure with this number before recreating it
plt.figure(ifig, figsize=(7, 6), dpi=120)

# l=1 is forwarded to the OStats solver; presumably a regularisation
# strength -- TODO confirm against ostats.ML.
theta = ML.Normal_Linear_Regression(x=fakex, y=fakey, l=1)

# theta[0] is used as the intercept and theta[1] as the slope of the fitted line.
ynormal = np.array(theta[0]) + fakex * np.array(theta[1])

plt.scatter(fakex, fakey, s=15, c='darkred')
plt.plot(fakex, ynormal)
plt.ylabel('fake y data')
plt.xlabel('fake x data')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'fake x data')

In [12]:
# Fit total bedrooms against median house value and plot the result.
ifig = 1
plt.close(ifig)  # close any existing figure with this number before recreating it
plt.figure(ifig, figsize=(7, 6), dpi=120)

# l=1 is forwarded to the OStats solver; presumably a regularisation
# strength -- TODO confirm against ostats.ML.
theta_house = ML.Normal_Linear_Regression(x=x_house, y=y_house, l=1)

# theta_house[0] is used as the intercept and theta_house[1] as the slope.
y_house_normal = np.array(theta_house[0]) + x_house * np.array(theta_house[1])

plt.scatter(x_house, y_house, s=15, c='darkred')
plt.plot(x_house, y_house_normal)
plt.ylabel('total bedrooms')
plt.xlabel('median house value')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'median house value')