In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the startup dataset. By position, the last column is the regression
# target and every column before it is a feature.
df = pd.read_csv('50_startups.csv')
feature_frame, target_series = df.iloc[:, :-1], df.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the column at index 3 and pass every other column through
# unchanged. sparse_threshold=0 forces a dense result, so np.array() below
# always receives a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Rebuild from `df` instead of re-assigning X from itself: re-running this cell
# would otherwise try to one-hot encode whatever sits at index 3 of the
# already-encoded matrix.
# The float cast matters because the passthrough columns presumably inherit the
# object dtype of the mixed-type input, which makes later numeric code slow.
X = np.array(ct.fit_transform(df.iloc[:, :-1].values), dtype=float)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random  # no longer used in this cell; kept in case other cells rely on it

# Single fixed seed that is actually passed to the splitter, so the split and
# every number derived from it are reproducible.
# NOTE(review): 708 was chosen because it "gives the best result". Selecting the
# seed on the test score makes the reported metric optimistic; prefer
# cross-validation for an unbiased estimate.
seed = 708
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Standardise columns 3 onward (presumably the numeric features that follow the
# one-hot columns). The scaler is fitted on the training split only and then
# re-used on the test split, so no test statistics leak into training.
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [5]:
def comp_cost(x, y, w, b):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((x @ w + b - y) ** 2) / (2 * m)`` where ``m`` is the
    number of rows of ``x``.

    Parameters
    ----------
    x : array-like, shape (m, n)
        Feature matrix; must be 2-D. Coerced to float, so object-dtype
        arrays holding numbers are accepted.
    y : array-like, shape (m,)
        Target values.
    w : array-like, shape (n,)
        Weight vector.
    b : float
        Bias term.

    Returns
    -------
    float
        The cost value.

    Raises
    ------
    ZeroDivisionError
        If ``x`` has no rows.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    m, _ = x.shape  # unpacking also enforces that x is 2-D
    # Residuals for all rows at once instead of a per-row Python loop.
    err = x @ w + b - y
    # float() keeps the division in plain Python so m == 0 raises rather than
    # silently yielding nan.
    return float(err @ err) / (2 * m)

In [6]:
def compute_gradient(x, y, w, b):
    """Return the gradient of the halved-MSE cost of a linear model.

    With residuals ``err = x @ w + b - y`` and ``m`` rows in ``x``:

    * ``dj_dw = x.T @ err / m``
    * ``dj_db = sum(err) / m``

    Parameters
    ----------
    x : array-like, shape (m, n)
        Feature matrix; must be 2-D. Coerced to float, so object-dtype
        arrays holding numbers are accepted.
    y : array-like, shape (m,)
        Target values.
    w : array-like, shape (n,)
        Weight vector.
    b : float
        Bias term.

    Returns
    -------
    dj_dw : numpy.ndarray, shape (n,)
        Partial derivatives of the cost with respect to each weight.
    dj_db : float
        Partial derivative of the cost with respect to the bias.

    Raises
    ------
    ZeroDivisionError
        If ``x`` has no rows.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    m, _ = x.shape  # unpacking also enforces that x is 2-D
    # One matrix product replaces the nested row/feature Python loops.
    err = x @ w + b - y
    # float() keeps the bias division in plain Python so m == 0 raises rather
    # than silently yielding nan.
    dj_db = float(err.sum()) / m
    dj_dw = (x.T @ err) / m
    return dj_dw, dj_db

In [7]:
def gradient_descent(niter, lr, x, y, w_ini, b_ini):
    """Fit a linear model with batch gradient descent.

    Starting from ``(w_ini, b_ini)``, performs ``niter`` updates of the form
    ``w -= lr * dj_dw`` and ``b -= lr * dj_db``, with the gradient supplied by
    ``compute_gradient(x, y, w, b)``.

    Parameters
    ----------
    niter : int
        Number of update steps.
    lr : float
        Learning rate (step size).
    x : array-like
        Feature matrix, forwarded to ``compute_gradient``.
    y : array-like
        Target values, forwarded to ``compute_gradient``.
    w_ini : array-like, shape (n,)
        Initial weights. Never modified and never returned by reference.
    b_ini : float
        Initial bias.

    Returns
    -------
    w : numpy.ndarray
        Final weights.
    b : float
        Final bias.
    """
    # Real copy: plain assignment would only alias the caller's array, and with
    # niter == 0 that very object would be handed back.
    w = np.array(w_ini, dtype=float)
    b = b_ini
    for _ in range(niter):
        dj_dw, dj_db = compute_gradient(x, y, w, b)
        w = w - lr * dj_dw
        b = b - lr * dj_db
    return w, b

In [8]:
# Size the weight vector from the matrix that is actually used for training.
# After one-hot encoding there are 6 feature columns (6 weights are printed
# below), not 3; the row count is presumably 40 for a 50-row dataset with a
# 20% test split.
m, n = X_train.shape
w_in = np.zeros((n, ))
b_in = 0.0
# Hyperparameters for batch gradient descent.
niter = 30000
lr = 0.0005
wf, bf = gradient_descent(niter, lr, X_train, y_train, w_in, b_in)
# Final weights, bias and training cost.
print(wf, bf, comp_cost(X_train, y_train, wf, bf))

[30064.36104296 27863.84785658 27581.77095006 35618.48956977
  -719.62687243  4156.4097714 ] 85509.97984960195 26325174.36124923


In [21]:
from sklearn.metrics import r2_score

# Predict every test row at once. The explicit float cast does two things:
# predictions no longer inherit the dtype of y_test (integer targets would
# truncate them), and an object-dtype X_test is handled.
y_pred = np.asarray(X_test, dtype=float) @ wf + bf
test_size = len(y_test)
# Coefficient of determination on the held-out split.
print(r2_score(y_test, y_pred))

0.9749228596012881
