<a href="https://colab.research.google.com/github/SelenaNahra/MachineLearning/blob/main/HW2PROB3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Mount Google Drive and load the dataset.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is Colab-specific — confirm before running elsewhere.
from google.colab import drive
# The CSV path below lives under this mount point, so mount first.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/Housing.csv'
# `housing` is the DataFrame every later step of the notebook works on.
housing = pd.read_csv(file_path)

# Columns that get converted from 'yes'/'no' strings to 1/0 integers
varlist = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]

# Helper that turns a yes/no column into a 1/0 column
def binary_map(x):
    """Return *x* with 'yes' replaced by 1 and 'no' replaced by 0.

    *x* is anything exposing a pandas-style ``.map`` (here: one DataFrame
    column). The lookup is a plain dict passed to ``.map``, so values other
    than 'yes' / 'no' are not translated by this table.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    return x.map(yes_no_codes)

# Encode every column named in varlist with binary_map, then remove the
# 'furnishingstatus' column from the frame.
housing = (
    housing
    .assign(**{column: binary_map(housing[column]) for column in varlist})
    .drop(columns=['furnishingstatus'])
)

# Hold out 20% of the rows for validation. random_state pins the split so the
# train and validation sets contain the same rows on every run; the global
# NumPy seed is set as well so any later np.random use is reproducible.
np.random.seed(0)
df_training, df_validation = train_test_split(housing, train_size=0.8, test_size=0.2, random_state=100)

# Normalize the data (min-max scaling of every column, 'price' included).
# The scaler is fitted on the training split only and that same fit is then
# applied to the validation split: re-fitting on validation would put the two
# splits on different scales and leak validation statistics into
# preprocessing. Validation values may therefore fall slightly outside [0, 1].
# The original row index is kept so the scaled frames stay label-aligned with
# y_train / y_valid.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # not referenced by the visible code; kept in case other cells use the name
normalize = MinMaxScaler()
normalized_training = pd.DataFrame(
    normalize.fit_transform(df_training),
    columns=df_training.columns,
    index=df_training.index,
)
normalized_validation = pd.DataFrame(
    normalize.transform(df_validation),
    columns=df_validation.columns,
    index=df_validation.index,
)

# Standardize the data (zero mean / unit variance per column), again fitting
# on the training split only and reusing that fit for validation.
from sklearn.preprocessing import StandardScaler
standardize = StandardScaler()
standardized_training = pd.DataFrame(
    standardize.fit_transform(df_training),
    columns=df_training.columns,
    index=df_training.index,
)
standardized_validation = pd.DataFrame(
    standardize.transform(df_validation),
    columns=df_validation.columns,
    index=df_validation.index,
)

# Extract output columns. pop() removes 'price' from each frame in place, so
# from here on the frames hold input features only. This must run after the
# scaled frames above are built, because those are built from frames that
# still contain 'price'.
y_train = df_training.pop('price')
y_valid = df_validation.pop('price')

# Normalized outputs
normalized_y_train = normalized_training.pop('price')
normalized_y_valid = normalized_validation.pop('price')

# Standardized outputs
standardized_y_train = standardized_training.pop('price')
standardized_y_valid = standardized_validation.pop('price')

def train(X, df_training, df_validation, y_train, y_valid, learning_rate, lambd, iterations):
    """Fit L2-regularized linear regression with batch gradient descent.

    A column of ones is prepended to the selected features, so ``theta[0]``
    is the intercept. Parameters start at zero and are updated ``iterations``
    times using the full training set.

    Args:
        X: List of column names to use as input features.
        df_training: DataFrame holding the training inputs.
        df_validation: DataFrame holding the validation inputs.
        y_train: Training targets, one per row of ``df_training``.
        y_valid: Validation targets, one per row of ``df_validation``.
        learning_rate: Gradient-descent step size.
        lambd: L2 penalty strength; 0 disables regularization.
        iterations: Number of gradient-descent updates to perform.

    Returns:
        A ``(train_losses, val_losses)`` tuple of lists, each of length
        ``iterations``. Entry ``i`` is the unpenalized half mean squared
        error, ``1 / (2 * m) * sum((x @ theta - y) ** 2)``, on the respective
        split, evaluated with the parameters obtained after update ``i``.
    """
    x_train = np.c_[np.ones((len(df_training), 1)), df_training[X]]
    x_val = np.c_[np.ones((len(df_validation), 1)), df_validation[X]]

    # Plain float arrays keep the arithmetic positional; a pandas Series
    # target would otherwise carry index labels that need not match the
    # rows of the input frames.
    y_train = np.asarray(y_train, dtype=float)
    y_valid = np.asarray(y_valid, dtype=float)

    m_train = len(x_train)
    m_valid = len(x_val)
    theta = np.zeros(x_train.shape[1])

    # Per-parameter shrink factor of the L2 penalty. The intercept is left
    # unpenalized (factor 1); shrinking it would pull every prediction toward
    # zero regardless of the features.
    decay = np.full(theta.shape, 1 - learning_rate * (lambd / m_train))
    decay[0] = 1.0

    train_losses = []
    val_losses = []

    for _ in range(iterations):
        error = x_train.dot(theta) - y_train
        gradient = (1 / m_train) * x_train.T.dot(error)
        theta = theta * decay - learning_rate * gradient

        # Evaluate both splits with the same, freshly updated parameters so
        # the two curves are directly comparable at every iteration.
        train_residual = x_train.dot(theta) - y_train
        val_residual = x_val.dot(theta) - y_valid
        train_losses.append(1 / (2 * m_train) * np.sum(np.square(train_residual)))
        val_losses.append(1 / (2 * m_valid) * np.sum(np.square(val_residual)))

    return train_losses, val_losses

def _plot_losses(train_losses, val_losses, title):
    """Draw training and validation loss per iteration on one figure and show it."""
    plt.plot(train_losses, label="Training Loss")
    plt.plot(val_losses, label="Validation Loss")
    plt.xlabel("Iteration")
    plt.ylabel("Loss")
    plt.legend()
    plt.title(title)
    plt.show()


# Hyperparameters shared by parts 3.a and 3.b
learning_rate = 0.01
lambd = 5
iterations = 1000

# NOTE(review): every run below is trained against the raw prices
# (y_train / y_valid); the scaled price columns popped during preprocessing
# (e.g. normalized_y_train) are not used. Confirm that scaling only the
# inputs is intended.

# Create data for 3.a
X = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Training with normalized inputs
normalized_train_losses_a, normalized_valid_losses_a = train(X, normalized_training, normalized_validation, y_train, y_valid, learning_rate, lambd, iterations)

# Training with standardized inputs
standardized_train_losses_a, standardized_valid_losses_a = train(X, standardized_training, standardized_validation, y_train, y_valid, learning_rate, lambd, iterations)

_plot_losses(normalized_train_losses_a, normalized_valid_losses_a, "3a. Loss using Normalized Inputs")
_plot_losses(standardized_train_losses_a, standardized_valid_losses_a, "3a. Loss using Standardized Inputs")

# Create data for 3.b
X = ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea']

# Training with normalized inputs
normalized_train_losses_b, normalized_valid_losses_b = train(X, normalized_training, normalized_validation, y_train, y_valid, learning_rate, lambd, iterations)

# Training with standardized inputs
standardized_train_losses_b, standardized_valid_losses_b = train(X, standardized_training, standardized_validation, y_train, y_valid, learning_rate, lambd, iterations)

_plot_losses(normalized_train_losses_b, normalized_valid_losses_b, "3b. Loss using Normalized Inputs")
_plot_losses(standardized_train_losses_b, standardized_valid_losses_b, "3b. Loss using Standardized Inputs")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
