In [1]:
#importing the libraries
import numpy as np
import pandas as pd

In [2]:
#reading the data
df = pd.read_csv("FoDS-Assignment-2.csv")

# shuffle the DataFrame rows
df = df.sample(frac = 1)

df.isnull().sum()

bedrooms          0
bathrooms         0
sqft_living      14
sqft_lot          0
floors           13
waterfront        0
view              0
condition         0
grade             0
sqft_above       14
sqft_basement     0
sqft_living15     0
sqft_lot15        0
price             0
dtype: int64

In [3]:
df['sqft_living'] = df.fillna(value = df['sqft_living'].mean())
df['floors'] = df.fillna(value = df['floors'].mean())
df['sqft_above'] = df.fillna(value = df['sqft_above'].mean())

df.isnull().sum()

bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
sqft_living15    0
sqft_lot15       0
price            0
dtype: int64

In [4]:
Features_list = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "waterfront", "view", "condition", "grade", "sqft_above", "sqft_basement", "sqft_living15", "sqft_lot15"]

In [5]:
#Feature Scaling
#normalising the data values
df = (df - df.min()) / (df.max() - df.min())

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [6]:
#splitting the dataset into training data and testing data
splitData = int(0.7*len(X))
train_X, test_X, train_y, test_y = X[:splitData], X[splitData:], y[:splitData], y[splitData:]

In [7]:
WeightV = np.zeros(14)
for n in range(14):
    WeightV[n] = np.random.randn()

In [8]:
#finding the weights based on the training data
def fit(X, Y, iters, learning_rate, F_selected, F_trial):
    for n in range(14):
        WeightV[n] = np.random.randn()
    for itr in range(iters):
        sumItrError = 0    
        for z in range(len(X)):        # each row in input data
            dataP_error = 0            # calculating error in each data point
            
            for m in range(13):
                dataP_error += (F_selected[m] + F_trial[m]) * WeightV[m+1] * X[z][m]    # summation of (w1*x1 + w2*x2 + w3*x3 + w4*x4 ...)
                
            dataP_error += WeightV[0]
            dataP_error -= Y[z]      # (w0 + w1*x1 + w2*x1^2 + w3*x1*x2 + w4*x2^2 ...) - yn

            # for each parameter(w0, w1, w2,...)    
            for m in range(14):                   
                if(m == 0):
                    WeightV[m] -= (learning_rate/len(X)) * dataP_error
                else:
                    WeightV[m] -= (learning_rate/len(X)) * dataP_error * X[z][m-1]   # calculating w0, w1, w2,... for each iteration
        
            dataP_error = (dataP_error**2)
            sumItrError += dataP_error/(2*len(Y))
            
        sumItrError = (sumItrError)**0.5
    return sumItrError    

In [9]:
# def predict(X, Y, F_selected, F_trial):
#     sumItrError = 0
#     for z in range(len(X)):        # each row in input data
#         dataP_error = 0            # calculating error in each data point
            
#         for m in range(13):
#             dataP_error += (F_selected[m] + F_trial[m]) * WeightV[m+1] * X[z][m]    # summation of (w1*x1 + w2*x2 + w3*x3 + w4*x4 ...)
#         dataP_error += WeightV[0]
        
#         dataP_error -= Y[z]      # (w0 + w1*x1 + w2*x1^2 + w3*x1*x2 + w4*x2^2 ...) - yn
#         dataP_error = (dataP_error**2)
#         sumItrError += dataP_error/(2*len(X))
            
#     sumItrError = (sumItrError)**0.5
#     return sumItrError 

In [10]:
F_selected = np.zeros(13)
F_trial = np.zeros(13)
finalFeatures = np.zeros(13)
finalMinE = float('inf')

for i in range(13):
    minErrorIn_i = float('inf')
    for j in range(13):
        if(F_selected[j]==1):
            continue
        F_trial[j] = 1
        error_j = fit(train_X, train_y, 10, 1, F_selected, F_trial)
        if(error_j < minErrorIn_i):
            minErrorIn_i = error_j
            minIndex = j
        F_trial[j] = 0
    F_selected[minIndex] = 1     
    if(minErrorIn_i < finalMinE):
        finalMinE = minErrorIn_i
        finalIndex = i
        finalFeatures = F_selected
        print(finalFeatures)
        for k in range(13):
            if(finalFeatures[k]==1):
                print(Features_list[k])
    print("Minimum error for", i+1, "feature(s) is", minErrorIn_i)

print("-----------------------------------------------------------------------------------------------------------------------")    
print("Minimum training error is ", finalMinE) 
print("Minimum features needed are ", finalIndex+1)   
print("List of features giving minimum training error - ")
for k in range(13):
    if(finalFeatures[k]==1):
        print(Features_list[k])


[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
view
Minimum error for 1 feature(s) is 0.04659271235352309
Minimum error for 2 feature(s) is 0.051209864880621486
Minimum error for 3 feature(s) is 0.08062831060329456
Minimum error for 4 feature(s) is 0.0970131628707308
Minimum error for 5 feature(s) is 0.057381552095481576
Minimum error for 6 feature(s) is 0.10531701238859435
Minimum error for 7 feature(s) is 0.1115069421776948
Minimum error for 8 feature(s) is 0.11595484822058405
Minimum error for 9 feature(s) is 0.13723600488611745
Minimum error for 10 feature(s) is 0.1739090326484805
Minimum error for 11 feature(s) is 0.16340110347108155
Minimum error for 12 feature(s) is 0.2120204392815617
Minimum error for 13 feature(s) is 0.2883313623190377
-----------------------------------------------------------------------------------------------------------------------
Minimum training error is  0.04659271235352309
Minimum features needed are  1
List of features giving minimum training error - 
bed

In [11]:
finalFeatures

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [12]:
# fit(train_X, train_y, 500, 1, finalFeatures, F_trial)

# #making predictions on test data
# testingE = predict(test_X, test_y, finalFeatures, F_trial)
# print("Minimum testing error considering these", finalIndex+1, "features is", testingE)
 