In [414]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [415]:
df = pd.read_csv("ex1data2.txt" , header=None, names=['Size', 'Bedrooms', 'Price'])
df.head(5)

Unnamed: 0,Size,Bedrooms,Price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


In [416]:
df = (df - df.mean()) / df.std()    # standardizing the data
df

Unnamed: 0,Size,Bedrooms,Price
0,0.13001,-0.223675,0.475747
1,-0.50419,-0.223675,-0.084074
2,0.502476,-0.223675,0.228626
3,-0.735723,-1.537767,-0.867025
4,1.257476,1.090417,1.595389
5,-0.019732,1.090417,-0.323998
6,-0.58724,-0.223675,-0.204036
7,-0.721881,-0.223675,-1.130948
8,-0.781023,-0.223675,-1.026973
9,-0.637573,-0.223675,-0.783051


## Train-Test Splitting

In [417]:
def split_train_test(data,test_ratio):
    np.random.seed(42)
    shuffle=np.random.permutation(len(data)) 
    # print(shuffle)
    test_data_size=int(len(data)*test_ratio)
    test_indices=shuffle[:test_data_size]
    train_indices=shuffle[test_data_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [418]:
# add ones column
df.insert(0, 'Ones', 1)

train_data, test_data=split_train_test(df,0.2)
# separate x (independent variable) from y (dependent variable)
cols = train_data.shape[1]
x = train_data.iloc[:,0:cols-1]
y = train_data.iloc[:,cols-1:cols]

In [419]:
# convert to matrices and initialize theta
x = np.matrix(x.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0,0,0]))
print(len(x))

38


In [420]:
# Computing Cost
def calculateCost(x, y, theta):
    a = np.power(((x * theta.T) - y), 2)
    return np.sum(a) / (2 * len(x))


In [421]:
def gradientDescent(x, y, theta, alpha, n):
    temp = np.matrix(np.zeros(theta.shape))
    params = int(theta.ravel().shape[1])
    cost = np.zeros(n)
    
    for i in range(n):
        error = (x * theta.T) - y
        
        for j in range(params):
            term = np.multiply(error, x[:,j])
            temp[0,j] = theta[0,j] - ((alpha / len(x)) * np.sum(term))
            
        theta = temp
        cost[i] = calculateCost(x, y, theta)
        
    return theta, cost

## 10-fold cross-validation

In [422]:
from sklearn.model_selection import KFold

# Perform 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
mse_scores = []

# set variables for learning rate and number of iterations
alpha = 0.1
n = 100

for train, test in kf.split(x):
    # perform linear regression on the data set where p is theta
    p, cost = gradientDescent(x[train], y[train], theta, alpha, n)

    # calculate the cost (error) of the model
    thiscost = calculateCost(x[test], y[test], p)
    mse_scores.append(thiscost)

# calculate the mean of mean squared error scores
mean_mse = np.mean(mse_scores)

print(f'10-fold Cross Validation Mean Squared Error: {mean_mse}')

10-fold Cross Validation Mean Squared Error: 0.1199408539523398


## Model Testing

In [423]:
cols = test_data.shape[1]
x = test_data.iloc[:,0:cols-1]
y = test_data.iloc[:,cols-1:cols]
x = np.matrix(x.values)
y = np.matrix(y.values)

# get the cost (error) of the model
thiscost = calculateCost(x, y, p)
print(f"Mean Squared Error: {thiscost}")

Mean Squared Error: 0.2801937590772049
