<a href="https://colab.research.google.com/github/TreeLiquid/Intro-to-ML/blob/main/4105_HW2_Shigapov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Pre-Processing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#Obtain file data
# read_csv already returns a DataFrame, so it is used directly
# (no extra pd.DataFrame(...) wrapper).
filepath = '/content/drive/My Drive/Fall-2023/Intro-to-ML/Datasets/Housing.csv'
HD = pd.read_csv(filepath)
HD.head()

In [None]:
# (rows, columns) of the loaded dataset
HD.shape

In [None]:
# Null check: percentage of missing entries in every column
HD.isna().sum() * 100 / len(HD)

In [None]:
#Converting yes/no columns to 1/0
# These are the columns that get passed through binary_map, whose mapping keys
# are the strings 'yes' and 'no' (not booleans).
varlist = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

#Map Func (furnishing status is left unconverted because the HW doesn't require it)
def binary_map(x):
  """Return the Series `x` with 'yes' mapped to 1 and 'no' mapped to 0."""
  yes_no_codes = {'yes': 1, 'no': 0}
  return x.map(yes_no_codes)

# Replace every column listed in varlist with its 1/0 encoding, then preview.
# NOTE: this cell is not safe to run twice. After the first run the columns
# hold 1/0, which are not keys of the yes/no mapping, so Series.map would turn
# them into NaN.
HD[varlist] = HD[varlist].apply(binary_map)
HD.head()

Problem 1A

In [None]:
#Training & Test Set Setup + Feature scaling
np.random.seed(0)
HD_train, HD_test = train_test_split(HD, train_size = 0.8, test_size = 0.2, random_state = 100)

# Work on independent copies: the split returns row subsets of HD, and assigning
# columns into them can trigger pandas' SettingWithCopyWarning.
HD_train = HD_train.copy()
HD_test = HD_test.copy()

scaler = MinMaxScaler()
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
# Fit the scaler on the training rows only, then apply the SAME transform to the
# test rows so training and validation losses are measured on the same scale.
HD_train[num_vars] = scaler.fit_transform(HD_train[num_vars])
HD_test[num_vars] = scaler.transform(HD_test[num_vars])
HD_train.head()


In [None]:
# Inputs for Problem 1A: the five numeric columns that are scaled together with
# price. The remaining columns (including the text column furnishingstatus,
# which cannot take part in the matrix products further down) are left out.
# NOTE(review): feature choice assumed from num_vars - confirm against the HW.
features_1A = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

#Training Set
# Columns are selected instead of pop()-ed so HD_train / HD_test are not mutated
# and this cell can be re-run without a KeyError on 'price'.
y_train = HD_train['price']
x_train = HD_train[features_1A]
print(x_train.head())

#Validation Sets
y_val = HD_test['price']
x_val = HD_test[features_1A]
x_val.head()

In [45]:
# @title
#Functions (Adjusted function names to fit HW requirements better :))
def compute_price(x, y, theta):
    """
    Compute the squared-error cost J for linear regression.

    The function name is kept for the HW, but the value returned is the cost
    J = 1/(2m) * sum((x . theta - y)^2), not a predicted price.

    Input Parameters
    ----------------
    x : 2D array (m x n). Each row is a training example, each column a feature
        (including the X_0 column of ones).
        m = number of training examples
        n = number of features
    y : 1D array of m target values.
    theta : 1D array of n fitting parameters (weights).

    Output Parameters
    -----------------
    J : Scalar value.
    """
    # Take m from the data itself rather than from a notebook-level global, so
    # the function works no matter which cells have been run before it.
    m = len(y)
    predictions = x.dot(theta)
    errors = np.subtract(predictions, y)
    sqrErrors = np.square(errors)
    J = 1 / (2 * m) * np.sum(sqrErrors)
    return J

def gradient_descent(x, y, theta, alpha, iterations):
    """
    Fit linear-regression weights with batch gradient descent.

    Input Parameters
    ----------------
    x : 2D array (m x n). Each row is a training example, each column a feature
        (including the X_0 column of ones).
        m = number of training examples
        n = number of features
    y : 1D array of m target values.
    theta : 1D array of n initial fitting parameters (weights).
    alpha : Learning rate. Scalar value.
    iterations : Number of iterations. Scalar value.

    Output Parameters
    -----------------
    theta : Final weights. 1D array of length n.
    price_estimate : Cost after each update, as returned by compute_price.
        1D array of length `iterations`.
    """
    # Take m from the data itself rather than from a notebook-level global.
    m = len(y)
    price_estimate = np.zeros(iterations)
    for i in range(iterations):
        predictions = x.dot(theta)
        errors = np.subtract(predictions, y)
        sum_delta = (alpha / m) * x.transpose().dot(errors)
        # Rebind instead of updating in place so the caller's array is untouched.
        theta = theta - sum_delta
        price_estimate[i] = compute_price(x, y, theta)
    return theta, price_estimate

In [None]:
# (rows, columns) of the training inputs
x_train.shape

In [None]:
#Parameters
m = len(y_train)
# One weight per input column plus one for the bias term. A float vector is
# required: the gradient-descent update subtracts float values from it.
theta = np.zeros(x_train.shape[1] + 1)
print(theta)

#Graph
# Sum of the five numeric inputs, used only to get a single x-axis value per row
X_train1 = x_train['area'] + x_train['bedrooms'] + x_train['bathrooms'] + x_train['stories'] + x_train['parking']

plt.scatter(X_train1, y_train, color='red',marker= '+')
plt.grid()
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter plot of training data')
plt.show()

In [81]:
#Gradient Descent settings: iteration count and learning rate
itr, a = 150, 0.01

# Disabled alternative that would run gradient_descent() on the summed feature:
#theta, price_estimate = gradient_descent(X_train1, y_train, theta, a, itr)
#print('Final value of Theta =', theta)
#print('Price estimate  =', price_estimate)


In [None]:
#Training and Validation Losses for 1A
# Prepend a column of ones so theta[0] acts as the bias term
x_train_1A = np.c_[np.ones((len(x_train), 1)), x_train]
x_val_1A = np.c_[np.ones((len(x_val), 1)), x_val]

m = len(y_train)
# Start from float zeros sized to the design matrix. This keeps the cell
# re-runnable and guarantees the shape/dtype needed by the update below.
theta = np.zeros(x_train_1A.shape[1])

train_losses_1A = []
val_losses_1A = []

for iteration in range(itr):
    gradients = 2/m * x_train_1A.T.dot(x_train_1A.dot(theta) - y_train)
    theta -= a * gradients

    train_loss = (1/m) * np.sum(np.square(x_train_1A.dot(theta) - y_train))
    # The validation loss must use the matrix WITH the bias column (x_val_1A)
    # so its width matches theta.
    val_loss = (1/len(x_val)) * np.sum(np.square(x_val_1A.dot(theta) - y_val))

    train_losses_1A.append(train_loss)
    val_losses_1A.append(val_loss)

In [None]:
# Loss curves for Problem 1A: one line per recorded loss history
for _losses, _label in ((train_losses_1A, "Training Loss"), (val_losses_1A, "Validation Loss")):
    plt.plot(_losses, label=_label)
plt.title("Training and Validation Loss Over Iterations (For 1A)")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Problem 1B

In [None]:
#Organization
HD_train_1b, HD_test_1b = train_test_split(HD, train_size = 0.8, test_size = 0.2, random_state = 100)

# furnishingstatus is a text column and is not used as an input. drop() also
# returns new frames, so the scaling below does not write into slices of HD.
HD_train_1b = HD_train_1b.drop(columns = ['furnishingstatus'])
HD_test_1b = HD_test_1b.drop(columns = ['furnishingstatus'])

#mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'price','mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea']
# Fit on the training rows only, then apply the SAME transform to the test rows
# so both losses are measured on the same scale.
HD_train_1b[num_vars] = scaler.fit_transform(HD_train_1b[num_vars])
HD_test_1b[num_vars] = scaler.transform(HD_test_1b[num_vars])
HD_train_1b.head()

In [None]:
#Training Set
# price is selected/dropped instead of pop()-ed so HD_train_1b / HD_test_1b are
# not mutated and this cell can be re-run without a KeyError.
y_train_1b = HD_train_1b['price']
x_train_1b = HD_train_1b.drop(columns = ['price'])
print(x_train_1b.shape)
print(y_train_1b.shape)

#Validation Sets
y_val_1b = HD_test_1b['price']
x_val_1b = HD_test_1b.drop(columns = ['price'])
print(x_val_1b.shape)
print(y_val_1b.shape)
x_val_1b.head()

In [None]:
#Training and Validation Losses for 1B
m = len(x_train_1b)
itr_1B = 150
a_1B = 0.01

# Prepend a column of ones so theta_1B[0] acts as the bias term
x_train_1B = np.c_[np.ones((len(x_train_1b), 1)), x_train_1b]
x_val_1B = np.c_[np.ones((len(x_val_1b), 1)), x_val_1b]

# One weight per column of the design matrix (inputs + bias), as a 1-D vector
theta_1B = np.zeros(x_train_1B.shape[1])

train_losses_1B = []
val_losses_1B = []

for iteration in range(itr_1B):
    gradients_1B = 2/m * x_train_1B.T.dot(x_train_1B.dot(theta_1B) - y_train_1b)
    # Update this problem's own weights (theta_1B), not the 1A vector `theta`
    theta_1B -= a_1B * gradients_1B

    # Both losses use the matrices WITH the bias column and the 1B weights
    train_loss_1b = (1/m) * np.sum(np.square(x_train_1B.dot(theta_1B) - y_train_1b))
    val_loss_1b = (1/len(x_val_1B)) * np.sum(np.square(x_val_1B.dot(theta_1B) - y_val_1b))

    train_losses_1B.append(train_loss_1b)
    val_losses_1B.append(val_loss_1b)

In [None]:
# Loss curves for Problem 1B: one line per recorded loss history
for _losses, _label in ((train_losses_1B, "Training Loss"), (val_losses_1B, "Validation Loss")):
    plt.plot(_losses, label=_label)
plt.title("Training and Validation Loss Over Iterations (For 1B)")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Problem 2A

In [181]:
#Preprocessing: Normalization (0-1) and Standardization (mean removal)
scaler = MinMaxScaler()                     # normalizer
mScaler = preprocessing.StandardScaler()    # standardizer

# The split is drawn twice with identical arguments: one train/test pair for
# the normalized run (_N) and one for the standardized run (_S).
(HD_train_2A_N, HD_test_2A_N), (HD_train_2A_S, HD_test_2A_S) = (
    train_test_split(HD, train_size = 0.8, test_size = 0.2, random_state = 100)
    for _ in range(2)
)

Q2A - Normalization

In [None]:
#Norming
num_vars_2A_N = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
# Work on independent copies: the split returns row subsets of HD, and assigning
# columns into them can trigger pandas' SettingWithCopyWarning.
HD_train_2A_N = HD_train_2A_N.copy()
HD_test_2A_N = HD_test_2A_N.copy()
# Fit on the training rows only, then apply the SAME transform to the test rows
HD_train_2A_N[num_vars_2A_N] = scaler.fit_transform(HD_train_2A_N[num_vars_2A_N])
HD_test_2A_N[num_vars_2A_N] = scaler.transform(HD_test_2A_N[num_vars_2A_N])
HD_train_2A_N.head()

In [None]:
#Declaring Sets & Removing "extra" Data
#Training Set [Normalizing]
# Inputs are the scaled numeric columns other than price. The text column
# furnishingstatus in particular cannot take part in the matrix products later.
# NOTE(review): feature choice assumed from num_vars_2A_N - confirm against the HW.
features_2A_N = [col for col in num_vars_2A_N if col != 'price']
# Selecting instead of pop() leaves HD_train_2A_N intact, so the cell is re-runnable
y_train_2A_N = HD_train_2A_N['price']
x_train_2A_N = HD_train_2A_N[features_2A_N]
print(x_train_2A_N.shape)
print(y_train_2A_N.shape)

x_train_2A_N.head()

In [None]:
#Validation Sets [Normalizing]
# Same input columns as the training set: scaled numeric columns other than price.
# NOTE(review): feature choice assumed from num_vars_2A_N - confirm against the HW.
features_2A_N = [col for col in num_vars_2A_N if col != 'price']
# Selecting instead of pop() leaves HD_test_2A_N intact, so the cell is re-runnable
y_val_2A_N = HD_test_2A_N['price']
x_val_2A_N = HD_test_2A_N[features_2A_N]
print(x_val_2A_N.shape)
print(y_val_2A_N.shape)

# Preview the validation inputs (this cell previously showed the training frame)
x_val_2A_N.head()

In [None]:
# Gradient-descent settings for 2A [Norm]
# Zero-initialised float weights: one per input column plus one for the bias
# term. x_train_2A_N is expected to still be the feature frame (no bias column).
theta_2A_N = np.zeros(x_train_2A_N.shape[1] + 1)
a_2A_N = 0.01     # learning rate (same value as Problem 1)
itr_2A_N = 150    # number of iterations (same value as Problem 1)

In [None]:
#Training and Validation Losses for 2A [Norm]
# The design matrices (inputs + leading column of ones for the bias) get their
# own names, so x_train_2A_N / x_val_2A_N are not overwritten and re-running
# this cell cannot stack additional bias columns.
X_bias_train_2A_N = np.c_[np.ones((len(x_train_2A_N), 1)), x_train_2A_N]
X_bias_val_2A_N = np.c_[np.ones((len(x_val_2A_N), 1)), x_val_2A_N]
# Use this section's own sample count rather than the global m set for Problem 1
m_2A_N = len(y_train_2A_N)

train_losses_2A_N = []
val_losses_2A_N = []

for iteration in range(itr_2A_N):
    gradients_2A_N = 2/m_2A_N * X_bias_train_2A_N.T.dot(X_bias_train_2A_N.dot(theta_2A_N) - y_train_2A_N)
    theta_2A_N = theta_2A_N - a_2A_N * gradients_2A_N

    train_loss_2A_N = (1/m_2A_N) * np.sum(np.square(X_bias_train_2A_N.dot(theta_2A_N) - y_train_2A_N))
    val_loss_2A_N = (1/len(X_bias_val_2A_N)) * np.sum(np.square(X_bias_val_2A_N.dot(theta_2A_N) - y_val_2A_N))

    train_losses_2A_N.append(train_loss_2A_N)
    val_losses_2A_N.append(val_loss_2A_N)

In [None]:
# Loss curves for 2A with normalized inputs: one line per recorded loss history
for _losses, _label in ((train_losses_2A_N, "Training Loss"), (val_losses_2A_N, "Validation Loss")):
    plt.plot(_losses, label=_label)
plt.title("Training and Validation Loss Over Iterations (For 2A)")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Q2A - Standardization

In [None]:
#Standardizing
num_vars_2A_S = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
# Work on independent copies: the split returns row subsets of HD, and assigning
# columns into them can trigger pandas' SettingWithCopyWarning.
HD_train_2A_S = HD_train_2A_S.copy()
HD_test_2A_S = HD_test_2A_S.copy()
# Fit on the training rows only, then apply the SAME transform to the test rows
HD_train_2A_S[num_vars_2A_S] = mScaler.fit_transform(HD_train_2A_S[num_vars_2A_S])
HD_test_2A_S[num_vars_2A_S] = mScaler.transform(HD_test_2A_S[num_vars_2A_S])
HD_train_2A_S.head()

In [None]:
#Declaring Sets & Removing "extra" Data
#Training Set [Standardizing]
# Inputs are the scaled numeric columns other than price. The text column
# furnishingstatus in particular cannot take part in the matrix products later.
# NOTE(review): feature choice assumed from num_vars_2A_S - confirm against the HW.
features_2A_S = [col for col in num_vars_2A_S if col != 'price']
# Selecting instead of pop() leaves HD_train_2A_S intact, so the cell is re-runnable
y_train_2A_S = HD_train_2A_S['price']
x_train_2A_S = HD_train_2A_S[features_2A_S]
print(x_train_2A_S.shape)
print(y_train_2A_S.shape)

x_train_2A_S.head()

In [None]:
#Validation Set [Standardizing]
# Same input columns as the training set: scaled numeric columns other than price.
# NOTE(review): feature choice assumed from num_vars_2A_S - confirm against the HW.
features_2A_S = [col for col in num_vars_2A_S if col != 'price']
# Selecting instead of pop() leaves HD_test_2A_S intact, so the cell is re-runnable
y_val_2A_S = HD_test_2A_S['price']
x_val_2A_S = HD_test_2A_S[features_2A_S]
print(x_val_2A_S.shape)
print(y_val_2A_S.shape)

# Preview the validation inputs (this cell previously showed the training frame)
x_val_2A_S.head()

In [189]:
# Gradient-descent settings for 2A [Standardizing]
# Zero-initialised float weights: one per input column plus one for the bias
# term. x_train_2A_S is expected to still be the feature frame (no bias column).
theta_2A_S = np.zeros(x_train_2A_S.shape[1] + 1)
# The two values below were swapped: the iteration count is passed to range()
# and must be an integer, while 100 is far too large for a learning rate.
a_2A_S = 0.1      # learning rate
itr_2A_S = 100    # number of iterations

In [None]:
#Training and Validation Losses for 2A [Standardizing]
# The design matrices (inputs + leading column of ones for the bias) get their
# own names, so x_train_2A_S / x_val_2A_S are not overwritten and re-running
# this cell cannot stack additional bias columns.
X_bias_train_2A_S = np.c_[np.ones((len(x_train_2A_S), 1)), x_train_2A_S]
X_bias_val_2A_S = np.c_[np.ones((len(x_val_2A_S), 1)), x_val_2A_S]
# Use this section's own sample count rather than the global m set for Problem 1
m_2A_S = len(y_train_2A_S)

train_losses_2A_S = []
val_losses_2A_S = []

for iteration in range(itr_2A_S):
    gradients_2A_S = 2/m_2A_S * X_bias_train_2A_S.T.dot(X_bias_train_2A_S.dot(theta_2A_S) - y_train_2A_S)
    theta_2A_S = theta_2A_S - a_2A_S * gradients_2A_S

    train_loss_2A_S = (1/m_2A_S) * np.sum(np.square(X_bias_train_2A_S.dot(theta_2A_S) - y_train_2A_S))
    val_loss_2A_S = (1/len(X_bias_val_2A_S)) * np.sum(np.square(X_bias_val_2A_S.dot(theta_2A_S) - y_val_2A_S))

    train_losses_2A_S.append(train_loss_2A_S)
    val_losses_2A_S.append(val_loss_2A_S)

In [None]:
# Loss curves for 2A with standardized inputs: one line per recorded loss history
for _losses, _label in ((train_losses_2A_S, "Training Loss"), (val_losses_2A_S, "Validation Loss")):
    plt.plot(_losses, label=_label)
plt.title("Training and Validation Loss Over Iterations (For 2A)")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Problem 2B

In [192]:
# The split is drawn twice with identical arguments: one train/test pair for
# the normalized run (_N) and one for the standardized run (_S).
(HD_train_2B_N, HD_test_2B_N), (HD_train_2B_S, HD_test_2B_S) = (
    train_test_split(HD, train_size = 0.8, test_size = 0.2, random_state = 100)
    for _ in range(2)
)

Q2B - Normalization

In [None]:
#Norming
num_vars_2B_N = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
# Work on independent copies: the split returns row subsets of HD, and assigning
# columns into them can trigger pandas' SettingWithCopyWarning.
HD_train_2B_N = HD_train_2B_N.copy()
HD_test_2B_N = HD_test_2B_N.copy()
# Fit on the training rows only, then apply the SAME transform to the test rows
HD_train_2B_N[num_vars_2B_N] = scaler.fit_transform(HD_train_2B_N[num_vars_2B_N])
HD_test_2B_N[num_vars_2B_N] = scaler.transform(HD_test_2B_N[num_vars_2B_N])
HD_train_2B_N.head()

In [None]:
#Declaring Sets & Removing "extra" Data
#Training Set [Normalizing]
# Inputs are every column except the target and the text column
# furnishingstatus, which cannot take part in the matrix products later.
# NOTE(review): assumed to mirror the Problem 1b inputs - confirm against the HW.
# drop() returns a new frame, so HD_train_2B_N stays intact and the cell is re-runnable.
y_train_2B_N = HD_train_2B_N['price']
x_train_2B_N = HD_train_2B_N.drop(columns = ['price', 'furnishingstatus'])
print(x_train_2B_N.shape)
print(y_train_2B_N.shape)

x_train_2B_N.head()

In [None]:
#Validation Sets [Normalizing]
# Inputs are every column except the target and the text column furnishingstatus.
# NOTE(review): assumed to mirror the Problem 1b inputs - confirm against the HW.
# drop() returns a new frame, so HD_test_2B_N stays intact and the cell is re-runnable.
y_val_2B_N = HD_test_2B_N['price']
x_val_2B_N = HD_test_2B_N.drop(columns = ['price', 'furnishingstatus'])
print(x_val_2B_N.shape)
print(y_val_2B_N.shape)

# Preview the validation inputs (this cell previously showed the training frame)
x_val_2B_N.head()

In [None]:
# Gradient-descent settings for 2B [Norm]
# Zero-initialised float weights: one per input column plus one for the bias
# term. x_train_2B_N is expected to still be the feature frame (no bias column).
theta_2B_N = np.zeros(x_train_2B_N.shape[1] + 1)
a_2B_N = 0.01     # learning rate (same value as Problem 1)
itr_2B_N = 150    # number of iterations (same value as Problem 1)

In [None]:
#Training and Validation Losses for 2B [Norm]
# The design matrices (inputs + leading column of ones for the bias) get their
# own names, so x_train_2B_N / x_val_2B_N are not overwritten and re-running
# this cell cannot stack additional bias columns.
X_bias_train_2B_N = np.c_[np.ones((len(x_train_2B_N), 1)), x_train_2B_N]
X_bias_val_2B_N = np.c_[np.ones((len(x_val_2B_N), 1)), x_val_2B_N]
# Use this section's own sample count rather than the global m set for Problem 1
m_2B_N = len(y_train_2B_N)

train_losses_2B_N = []
val_losses_2B_N = []

for iteration in range(itr_2B_N):
    gradients_2B_N = 2/m_2B_N * X_bias_train_2B_N.T.dot(X_bias_train_2B_N.dot(theta_2B_N) - y_train_2B_N)
    theta_2B_N = theta_2B_N - a_2B_N * gradients_2B_N

    train_loss_2B_N = (1/m_2B_N) * np.sum(np.square(X_bias_train_2B_N.dot(theta_2B_N) - y_train_2B_N))
    val_loss_2B_N = (1/len(X_bias_val_2B_N)) * np.sum(np.square(X_bias_val_2B_N.dot(theta_2B_N) - y_val_2B_N))

    train_losses_2B_N.append(train_loss_2B_N)
    val_losses_2B_N.append(val_loss_2B_N)

In [None]:
# Loss curves for 2B with normalized inputs
plt.plot(train_losses_2B_N, label="Training Loss")
plt.plot(val_losses_2B_N, label="Validation Loss")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
# The title previously said "1B" (copy-paste leftover); these are the 2B curves.
plt.title("Training and Validation Loss Over Iterations (For 2B, Normalization)")
plt.show()

Q2B - Standardization

In [None]:
#Standardizing
num_vars_2B_S = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
# Work on independent copies: the split returns row subsets of HD, and assigning
# columns into them can trigger pandas' SettingWithCopyWarning.
HD_train_2B_S = HD_train_2B_S.copy()
HD_test_2B_S = HD_test_2B_S.copy()
# Use the StandardScaler (mScaler) here; the MinMax `scaler` would normalize
# instead of standardize. Fit on the training rows only, then apply the SAME
# transform to the test rows.
HD_train_2B_S[num_vars_2B_S] = mScaler.fit_transform(HD_train_2B_S[num_vars_2B_S])
HD_test_2B_S[num_vars_2B_S] = mScaler.transform(HD_test_2B_S[num_vars_2B_S])
HD_train_2B_S.head()

In [None]:
#Declaring Sets & Removing "extra" Data
#Training Set [Standardizing]
# Inputs are every column except the target and the text column
# furnishingstatus, which cannot take part in the matrix products later.
# NOTE(review): assumed to mirror the Problem 1b inputs - confirm against the HW.
# drop() returns a new frame, so HD_train_2B_S stays intact and the cell is re-runnable.
y_train_2B_S = HD_train_2B_S['price']
x_train_2B_S = HD_train_2B_S.drop(columns = ['price', 'furnishingstatus'])
print(x_train_2B_S.shape)
print(y_train_2B_S.shape)

x_train_2B_S.head()

In [None]:
#Validation Sets [Standardizing]
# Inputs are every column except the target and the text column furnishingstatus.
# NOTE(review): assumed to mirror the Problem 1b inputs - confirm against the HW.
# drop() returns a new frame, so HD_test_2B_S stays intact and the cell is re-runnable.
y_val_2B_S = HD_test_2B_S['price']
x_val_2B_S = HD_test_2B_S.drop(columns = ['price', 'furnishingstatus'])
print(x_val_2B_S.shape)
print(y_val_2B_S.shape)

# Preview the validation inputs (this cell previously showed the training frame)
x_val_2B_S.head()

In [None]:
# Gradient-descent settings for 2B [Standardizing]
# Zero-initialised float weights: one per input column plus one for the bias
# term. x_train_2B_S is expected to still be the feature frame (no bias column).
theta_2B_S = np.zeros(x_train_2B_S.shape[1] + 1)
a_2B_S = 0.01     # learning rate (same value as Problem 1)
itr_2B_S = 150    # number of iterations (same value as Problem 1)

In [None]:
#Training and Validation Losses for 2B [Standardizing]
# The design matrices (inputs + leading column of ones for the bias) get their
# own names, so x_train_2B_S / x_val_2B_S are not overwritten and re-running
# this cell cannot stack additional bias columns.
X_bias_train_2B_S = np.c_[np.ones((len(x_train_2B_S), 1)), x_train_2B_S]
X_bias_val_2B_S = np.c_[np.ones((len(x_val_2B_S), 1)), x_val_2B_S]
# Use this section's own sample count rather than the global m set for Problem 1
m_2B_S = len(y_train_2B_S)

train_losses_2B_S = []
val_losses_2B_S = []

for iteration in range(itr_2B_S):
    # The gradient uses this section's training matrix on both sides (the
    # original referenced an undefined name, x_train_2S_N, in the inner product).
    gradients_2B_S = 2/m_2B_S * X_bias_train_2B_S.T.dot(X_bias_train_2B_S.dot(theta_2B_S) - y_train_2B_S)
    theta_2B_S = theta_2B_S - a_2B_S * gradients_2B_S

    train_loss_2B_S = (1/m_2B_S) * np.sum(np.square(X_bias_train_2B_S.dot(theta_2B_S) - y_train_2B_S))
    val_loss_2B_S = (1/len(X_bias_val_2B_S)) * np.sum(np.square(X_bias_val_2B_S.dot(theta_2B_S) - y_val_2B_S))

    train_losses_2B_S.append(train_loss_2B_S)
    val_losses_2B_S.append(val_loss_2B_S)

In [None]:
# Loss curves for 2B with standardized inputs
plt.plot(train_losses_2B_S, label="Training Loss")
plt.plot(val_losses_2B_S, label="Validation Loss")
plt.xlabel("Iteration")
plt.ylabel("Mean Squared Error")
plt.legend()
# The title previously said "1B" (copy-paste leftover); these are the 2B curves.
plt.title("Training and Validation Loss Over Iterations (For 2B, Standardization)")
plt.show()