In [1]:
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
# Set cufflinks' global configuration: offline mode on, sharing off, 'ggplot' theme.
cf.set_config_file(offline=True, sharing=False, theme='ggplot')

In [2]:
# Load the diamonds data; index_col=0 makes the first CSV column the row index.
df = pd.read_csv('diamonds.csv', index_col=0)
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Restrict the frame to the three predictor columns plus the target column.
df = df.loc[:, ['carat', 'depth', 'table', 'price']]
df.head()

Unnamed: 0,carat,depth,table,price
1,0.23,61.5,55.0,326
2,0.21,59.8,61.0,326
3,0.23,56.9,65.0,327
4,0.29,62.4,58.0,334
5,0.31,63.3,58.0,335


In [4]:
# Target column and the predictor columns used to model it.
to_predict = 'price'
features = ['carat', 'depth', 'table']

# n = number of rows, d = number of predictor columns.
(n, d) = df.loc[:, features].shape

In [5]:
# Display the (rows, columns) pair unpacked from df[features].shape.
n, d

(53940, 3)

In [6]:
# Feature matrix: one column per entry in `features`.
X = df.loc[:, features].to_numpy()
# Target kept two-dimensional (single column) by selecting with a one-element list.
y = df.loc[:, [to_predict]].to_numpy()

(X, y)

(array([[ 0.23, 61.5 , 55.  ],
        [ 0.21, 59.8 , 61.  ],
        [ 0.23, 56.9 , 65.  ],
        ...,
        [ 0.7 , 62.8 , 60.  ],
        [ 0.86, 61.  , 58.  ],
        [ 0.75, 62.2 , 55.  ]]),
 array([[ 326],
        [ 326],
        [ 327],
        ...,
        [2757],
        [2757],
        [2757]]))

In [7]:
def add_ones_column(df):
    """Prepend a column of ones (the intercept term) to a 2-D array.

    Parameters
    ----------
    df : array-like with rows along the first axis
        Feature matrix. Despite the parameter name, the notebook passes a
        NumPy array here.

    Returns
    -------
    numpy.ndarray
        The input with one extra leading column filled with 1.0.
    """
    # Size the ones column from the argument itself instead of the notebook
    # global `n`, so the helper works for any input and on a fresh kernel.
    n_rows = len(df)
    return np.hstack([np.ones((n_rows, 1)), df])

# Build the design matrix from the raw feature columns rather than from `X`
# itself: `X = add_ones_column(X)` adds another ones column every time the
# cell is re-run, whereas this form gives the same result on every run.
X = add_ones_column(df[features].values)
X



array([[ 1.  ,  0.23, 61.5 , 55.  ],
       [ 1.  ,  0.21, 59.8 , 61.  ],
       [ 1.  ,  0.23, 56.9 , 65.  ],
       ...,
       [ 1.  ,  0.7 , 62.8 , 60.  ],
       [ 1.  ,  0.86, 61.  , 58.  ],
       [ 1.  ,  0.75, 62.2 , 55.  ]])

In [8]:
def linear_model(X, y):
    """Fit ordinary least squares by solving the normal equations.

    Solves ``(X.T @ X) theta = X.T @ y`` for ``theta``.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Design matrix (the notebook passes it with a leading ones column).
    y : numpy.ndarray
        Target values; the notebook passes a single-column 2-D array.

    Returns
    -------
    numpy.ndarray
        Coefficients, shaped like ``X.T @ y``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T @ X`` is singular.
    """
    # Solve the linear system directly instead of forming an explicit
    # inverse: it is cheaper and numerically more stable.
    return np.linalg.solve(X.T @ X, X.T @ y)

def dot_product(mat_A, mat_B):
    """Return the matrix product of ``mat_A`` and ``mat_B`` (``mat_A @ mat_B``)."""
    product = mat_A @ mat_B
    return product

In [9]:
# Closed-form least-squares coefficients for the design matrix X and target y.
theta_hat = linear_model(X, y)
theta_hat

array([[13003.44052424],
       [ 7858.77050994],
       [ -151.23634689],
       [ -104.47278016]])

In [10]:
# Fix the global NumPy seed so the random starting point is reproducible.
np.random.seed(42)

# One standard-normal value per design-matrix column (d features + intercept).
theta_guess = np.random.standard_normal(size=(d + 1, 1))
theta_guess

array([[ 0.49671415],
       [-0.1382643 ],
       [ 0.64768854],
       [ 1.52302986]])

In [11]:
# Product of the design matrix with the random coefficient guess.
dot_product(X, theta_guess)

array([[124.06440056],
       [132.10427447],
       [136.31533185],
       ...,
       [132.45656072],
       [128.22253935],
       [124.4458851 ]])

In [12]:
def compute_loss(X, y, theta):
    """Mean squared error of ``X @ theta`` against ``y`` as a Python scalar.

    Computed in matrix form as ``(1/n) * r.T @ r`` with ``r = X @ theta - y``
    and ``n = y.shape[0]``. The final ``.item()`` needs the product to hold
    exactly one element, so ``r`` is expected to be a single column.
    """
    residuals = X @ theta - y
    n_samples = y.shape[0]
    # Scale the transposed residuals first, then multiply: this is the same
    # left-to-right evaluation order as `(1/n) * r.T @ r`.
    scaled_residuals_t = (1 / n_samples) * residuals.T
    return (scaled_residuals_t @ residuals).item()

In [13]:
def squarred_loss(X, y, theta):
    """Mean of the squared residuals ``(X @ theta - y) ** 2`` over all elements."""
    residuals = X @ theta - y
    squared_errors = residuals ** 2
    return np.mean(squared_errors)

In [21]:
# Loss of the random coefficient guess on the full data.
compute_loss(X, y, theta_guess)

30389793.20618835

In [22]:
# Re-display the fitted coefficients.
theta_hat

array([[13003.44052424],
       [ 7858.77050994],
       [ -151.23634689],
       [ -104.47278016]])

In [20]:
# Predictions for the first ten rows, one small array per row.
[X[row_idx] @ theta_hat for row_idx in range(10)]

[array([-236.08050071]),
 array([-762.99080215]),
 array([-585.12110662]),
 array([-214.08532278]),
 array([-193.02262478]),
 array([-563.04560688]),
 array([-487.42743343]),
 array([-60.81192416]),
 array([-1485.95573555]),
 array([-545.3208532])]

In [23]:
from numpy.linalg import solve

In [24]:
# Solve the normal equations (X.T @ X) theta = X.T @ y as a linear system.
theta_hat_solve = solve(X.T @ X, X.T @ y)
theta_hat_solve

array([[13003.44052424],
       [ 7858.77050994],
       [ -151.23634689],
       [ -104.47278016]])

In [30]:
# Compare the two coefficient estimates element-wise within floating-point
# tolerance. `a.all() == b.all()` only compares two booleans (whether each
# array is entirely non-zero), so it passes even when the values differ.
assert np.allclose(theta_hat, theta_hat_solve)

In [31]:
# Fitted values: design matrix times the fitted coefficients.
y_hat = X @ theta_hat  

In [32]:
# Display the fitted values.
y_hat

array([[-236.08050071],
       [-762.99080215],
       [-585.12110662],
       ...,
       [2738.57048722],
       [4477.14475352],
       [3744.61472164]])

In [35]:
# Scatter of fitted values against actual targets, with a reference line
# from (0, 0) to (20000, 20000) marking where y_hat equals y.
fig = (
    px.scatter(x=y.flatten(), y=y_hat.flatten(), opacity=0.2)
    .add_trace(go.Scatter(x=[0, 20000], y=[0, 20000], name="y_hat = y"))
    .update_xaxes(title_text="y")
    .update_yaxes(title_text="y_hat")
)
fig.show()