### Lab Task 1 - Dataset Preparation, Feature Scaling

In [33]:
import sklearn.datasets
import pandas as pd
import numpy as np

In [3]:
# Fetch the California housing dataset through scikit-learn and print the returned object.
# NOTE(review): print(data) dumps every field, including the full arrays' repr and the
# long DESCR text; inspecting data.keys() or individual shapes is usually enough.
data = sklearn.datasets.fetch_california_housing()
print(data)


{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8)), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset

In [8]:
# List the fields exposed by the loaded dataset object.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [16]:
# Names of the feature columns.
data['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [17]:
# Name(s) of the target variable.
data['target_names']

['MedHouseVal']

In [20]:
# Reduced dataset: keep only feature columns 0, 1 and 2 and their matching names.
new_data={
    'data': data['data'][:,[0,1,2]],
    'feature_names': data['feature_names'][:3],
}
new_data['feature_names']

['MedInc', 'HouseAge', 'AveRooms']

In [22]:
# Carry the target values and target names over unchanged from the full dataset.
new_data.update(
    target=data['target'],
    target_names=data['target_names'],
)
new_data['target_names']

['MedHouseVal']

In [29]:
# Splitting dataset randomly
# First check the full dataset's (rows, columns) shape.
data['data'].shape

(20640, 8)

In [37]:

# create and shuffle a list of indices
# A locally seeded generator makes the shuffle -- and therefore the train/val
# split -- identical on every "Restart & Run All", without touching NumPy's
# global random state.
SPLIT_SEED=42
split_rng=np.random.default_rng(SPLIT_SEED)
idx_list=list(range(len(new_data['data'])))
shuffled_idx_list=split_rng.permutation(idx_list)

# split the indices list into train and val indices (first 80% -> train, rest -> val)
_80thidx=(0.8*len(new_data['data']))
train_indices_list=shuffled_idx_list[:int(_80thidx)]
val_indices_list=shuffled_idx_list[int(_80thidx):]

# every row must land in exactly one of the two splits
assert len(train_indices_list)+len(val_indices_list)==len(new_data['data'])
print(len(train_indices_list)+len(val_indices_list))

[0, 1, 2, 3, 4]
[18872  2691  8354 20278  4355]


In [50]:
# Training split: rows selected by the shuffled training indices; the feature
# and target names are shared with new_data.
train_data={
    'data': new_data['data'][train_indices_list],
    'target': new_data['target'][train_indices_list],
    'feature_names': new_data['feature_names'],
    'target_names': new_data['target_names'],
}
train_data['data'].shape

In [62]:
# Validation split: rows selected by the remaining shuffled indices; the
# feature and target names are shared with new_data.
val_data={
    'data': new_data['data'][val_indices_list],
    'target': new_data['target'][val_indices_list],
    'feature_names': new_data['feature_names'],
    'target_names': new_data['target_names'],
}
val_data['data'].shape

(4128, 3)

In [63]:
# Turn each split into a DataFrame (feature columns plus a trailing 'target'
# column), preview it, then write it to CSV.

# training split
train_df=pd.DataFrame(
    train_data['data'],
    columns=train_data['feature_names'],
).assign(target=train_data['target'])
print(train_df.head(3))


# validation split, same layout
val_df=pd.DataFrame(
    val_data['data'],
    columns=val_data['feature_names'],
).assign(target=val_data['target'])
print(val_df.head(3))


# index=False keeps the row index out of the files
train_df.to_csv('train_data.csv',index=False)
val_df.to_csv('val_data.csv',index=False)

print("data saved in train_data.csv and val_data.csv")

   MedInc  HouseAge  AveRooms  target
0  5.2957      18.0  6.483932   1.719
1  1.9911      10.0  5.575668   0.675
2  5.0912      44.0  6.457576   1.698


### Lab Task 2 - Cost Function without and with Regularization

In [3]:
import pandas as pd
import numpy as np


In [8]:
# Read the training split back from disk.
train_df=pd.read_csv("train_data.csv")

# Every column except the last is a feature; the last column is the label.
train_features,train_labels=train_df.iloc[:,:-1],train_df.iloc[:,-1]

# NumPy versions of both.
X,y=train_features.to_numpy(),train_labels.to_numpy()

In [10]:
# Shapes of the feature matrix and the label vector, one per line.
print(X.shape,y.shape,sep="\n")

(16512, 3)
(16512,)


In [50]:
import random
def cost_function(features,target,lambd,w_bias=None,verbose=True):
    """Squared-error cost of a linear model, with optional L2 regularization.

    A column of ones is prepended to ``features`` so that the first weight
    acts as the bias. The returned value is

        sum((X_b @ w - y)**2) / (2*m)  +  lambd * sum(w[1:]**2) / (2*m)

    where ``m`` is the number of rows. The bias weight ``w[0]`` is excluded
    from the penalty. Pass ``lambd=0`` for the unregularized cost.

    Parameters
    ----------
    features : array-like of shape (m, n)
        Feature matrix without the bias column.
    target : array-like of shape (m,) or (m, 1)
        Target values.
    lambd : float
        Regularization strength.
    w_bias : array-like of shape (n+1,) or (n+1, 1), optional
        Weights with the bias first. When omitted, weights are drawn
        uniformly from [0, 1) with ``np.random.rand``.
    verbose : bool, default True
        Print the intermediate shapes, the first five predictions/labels
        and the cost.

    Returns
    -------
    float
        The total cost.

    Raises
    ------
    ValueError
        If ``w_bias`` does not contain exactly n+1 values.
    """
    features=np.asarray(features,dtype=float)
    m,n=features.shape
    target=np.asarray(target,dtype=float).reshape(-1,1)

    # Only fall back to random weights when the caller supplied none;
    # otherwise honour the weights that were passed in.
    if w_bias is None:
        w_bias=np.random.rand(n+1,1)
    else:
        w_bias=np.asarray(w_bias,dtype=float).reshape(-1,1)
        if w_bias.shape[0]!=n+1:
            raise ValueError(
                f"w_bias must hold {n+1} values (bias + {n} features), got {w_bias.shape[0]}"
            )

    # add bias to features (a column of ones)
    features_bias=np.c_[np.ones((m,1)),features]
    # find hypothesis
    h_x=np.dot(features_bias,w_bias)
    # find cost
    MSE=np.sum(np.square(h_x-target))/(2*m)
    # Parenthesize (2*m): "/2*m" would divide by 2 and then multiply by m.
    reg_term=(lambd*np.sum(np.square(w_bias[1:])))/(2*m)
    cost=MSE+reg_term

    if verbose:
        print(f"features shape: {features_bias.shape}")
        print(f"weights shape: {w_bias.shape}")
        print(f"hypothesis shape: {h_x.shape}")
        print(f"target shape: {target.shape}")
        print(f"top 5 predictions: {h_x[:5]}")
        print(f"top 5 labels: {target[:5]}")
        print(f"cost: {cost}")

    return cost

In [51]:
# Evaluate the cost on the training data with lambd=0.5; no weights are passed.
cost_function(X,y,0.5)
# y keeps its 1-D shape here: the reshape inside cost_function rebinds a local name only.
print(y.shape)

features shape: (16512, 4)
weights shape: (4, 1)
hypothesis shape: (16512, 1)
target shape: (16512, 1)
top 5 predictions: [[13.87445997]
 [ 8.43555387]
 [27.08657708]
 [13.86963806]
 [25.05490842]]
top 5 labels: [[1.719  ]
 [0.675  ]
 [1.698  ]
 [2.185  ]
 [5.00001]]
cost: 152.83171501442507
(16512,)


### Lab Task 3 – Gradient Descent without and with Regularization

In [None]:
def compute_gradient(X,y,W):
    # NOTE(review): this looks unfinished -- in the lines visible here the residual
    # is computed but never stored, dj_dw_sum is never updated, and nothing is returned.
    # Presumably X already contains the bias column and W has one row per column of X
    # -- TODO confirm once the function is completed.
    dj_dw_sum=0  # accumulator, not used in the visible code

    m=X.shape[0]  # number of rows in X

    h_x=np.dot(X,W)  # hypothesis: X @ W
    h_x-y  # residual; the result of this expression is discarded

