In [None]:
#Loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import wget

In [None]:
#downloading dataset
#data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
#wget.download(data)

# Problem statement
Creating a model that would predict the house prices "Median_house_value"

# Exploratory Data Analysis

In [None]:
#viewing the dataframe
df = pd.read_csv('housing.csv')
df.info()

In [None]:
#visualiding the distribution of the median_house_value

sns.displot(df['median_house_value'])

The plot has some skewness in it, resulting in a longtail

In [None]:
df['ocean_proximity'].unique()

In [None]:
df_filtered = df[df['ocean_proximity'].isin(['INLAND', '<1H OCEAN'])].reset_index()
df_filtered['ocean_proximity'].unique()

In [None]:
base = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',]

df_filtered[base]

In [None]:
#chacking the sum of the missing value in each column
df_filtered.isnull().sum()

In [None]:
#Meadian of the population
df_filtered['population'].median()

In [None]:
df_filtered

## Shuffling the index


In [None]:
n = len(df_filtered)

In [None]:
idx = np.arange(n)
idx

In [None]:
#setting the seed to 42 for reproducibility
np.random.seed(42)

#shuffle the idx array
np.random.shuffle(idx)
idx

splitting the data to 60% train, 20% validation, and 20% test

In [None]:
#we split the number before we apply them to the dataframe
n_test = int(n *0.2)
n_val = int(n*0.2)
n_train = n - n_test - n_val

n, n_test+n_val+n_train

In [None]:
#splitting the dataframe
df_train = df_filtered.iloc[idx[:n_train]]
df_val = df_filtered.iloc[idx[n_train:n_train + n_val]]
df_test = df_filtered.iloc[idx[n_train+n_val:]]

In [None]:
#trainsforming the target y, "house_median_value" to log + 1 function
y_train = np.log1p(df_train['median_house_value']).values #getting the values in a vector form/ array
y_test = np.log1p(df_test['median_house_value']).values
y_val = np.log1p(df_val['median_house_value']).values

In [None]:
#deleting the y variables from the train, test, val datafram to avoid accidentaly using them
del df_train['median_house_value']
del df_test['median_house_value']
del df_val['median_house_value']

- We need to deal with missing values for the column from 'total_bedrooms'.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization.
- For computing the mean, we'll use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE

In [None]:
#filling in the missing values using the mean in the train set
df_train_mean = df_train.fillna(df_train['total_bedrooms'].mean())
df_test_mean = df_test.fillna(df_test['total_bedrooms'].mean())
df_val_mean = df_val.fillna(df_val['total_bedrooms'].mean())

In [None]:
#filling in the missing values using the 0 value 
df_train_0 = df_train.fillna(0)
df_test_0 = df_test.fillna(0)
df_val_0 = df_val.fillna(0)

df_train_0['total_bedrooms'].mean()

In [None]:
#dropping and reseting index
df_train_mean = df_train_mean.copy()
df_train_mean.reset_index(drop=True)

In [None]:
categorical_columns = ['ocean_proximity']

In [None]:
categories = {}

for c in categorical_columns:
    categories[c] = list(df[c].value_counts().head().index)
categories

In [None]:
def prepare_X(df):
    df = df.copy()
    features = base.copy()  # Assuming base is defined somewhere in your code
    
    # Get the dummy columns.
    dummies = pd.get_dummies(df['ocean_proximity'], prefix='ocean_proximity')
    
    # Add the dummy columns to the dataframe.
    df = pd.concat([df, dummies], axis=1)
    
    # Drop the original categorical column
    df = df.drop('ocean_proximity', axis=1)
    
    # Update features list
    features.extend(dummies.columns.tolist())
    
    df_nums = df[features]
    X = df_nums.values
    return X 


In [None]:
def train_linear_regression(X, y):
    '''
    args:
          X - m*n matrix
          y - m*1 matrix
          
    return:
            weights
    '''
    ones = np.ones(X.shape[0]) #generate the ones to add in a matrix
    X = np.column_stack([ones, X]) #adding the ones in X matrix
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:] # returning the intercept/bias wieght and the feature weight(s)
    
    

In [None]:
def rmse(y, y_pred):
    error = y_pred - y # A
    mse = (error ** 2).mean() # B
    return np.sqrt(mse)

In [None]:
#training and getting the weights for the dataframe filled with mean
X_train_mean = prepare_X(df_train_mean)
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)


#fill in the missing data with the mean
df_val_mean = df_val.fillna(df_train['total_bedrooms'].mean())
X_val_mean = prepare_X(df_val_mean)

#taking the weights obtained in the training set to predict the validation set
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)


#calculating the mean root square error (rmse)
rmse(y_pred_mean, y_val)

In [None]:
#training and getting the weights for the dataframe filled the 0 value
X_train_0 = prepare_X(df_train_0)
w0_0, w_0 = train_linear_regression(X_train_0, y_train)

#fill in the missing data with 0 value
df_val_0 = df_val.fillna(0)
X_val_0 = prepare_X(df_val_0)

#taking the weights obtain in the training set to predict the validation set
y_val_pred_0 = w0_0 + X_val_0.dot(w_0)

#calculating the mean root square error(rmse)
round(rmse(y_val_pred_0, y_val),2)

### Training function with regularisation

In [None]:
def train_linear_regression_reg(X, y, r=0.001): #added the r to tune
    '''
    args:
          X - m*n matrix
          y - m*1 matrix
          
    return:
            weights
    '''
    ones = np.ones(X.shape[0]) #generate the ones to add in a matrix
    X = np.column_stack([ones, X]) #adding the ones in X matrix
    
    XTX = X.T.dot(X)
    XTX = XTX + r*np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:] # returning the intercept/bias wieght and the feature weight(s)
    
    

In [None]:
errors = []

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train_0, y_train, r)
    
    #prediction to validation set
    y_pred_val = w0 + X_val_0.dot(w)
    error_term = rmse(y_pred_val, y_val)
    
    print('r                   error_term')
    print(r,'               ', error_term)
    errors.append(error_term)
    print('------------------------------')
    print()
    
    

In [None]:
#getting the smallest error term
min(errors)

In [None]:
df_filtered_1 = df_filtered.copy()

- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Trying different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [None]:
# Fill missing values with 0
df_filtered_1 = df_filtered_1.fillna(0)

# List of seed values
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Initialize an empty list to collect RMSE scores
rmse_scores = []

# Iterate over seed values
for seed in seed_values:

    # Set the seed and shuffle the data
    np.random.seed(seed)
    n = len(df_filtered_1)
    idx = np.arange(n)
    np.random.shuffle(idx)

    # Split the data into train, validation, and test sets
    n_test = int(n * 0.2)
    n_val = int(n * 0.2)
    n_train = n - n_test - n_val

    df_train = df_filtered_1.iloc[idx[:n_train]]
    df_val = df_filtered_1.iloc[idx[n_train:n_train + n_val]]
    df_test = df_filtered_1.iloc[idx[n_train + n_val:]]

    # Get the target values for each set
    y_train = np.log1p(df_train['median_house_value'].values)
    y_val = np.log1p(df_val['median_house_value'].values)

    # Delete the target variable from the DataFrames
    del df_train['median_house_value']
    del df_val['median_house_value']
    
    # Prepare X for train and validation sets
    X_train = prepare_X(df_train)
    X_val = prepare_X(df_val)

    # Train a model without regularization and get weights
    w0, w = train_linear_regression_reg(X_train, y_train)

    # Predict on validation set
    y_pred_val = w0 + X_val.dot(w)

    # Calculate RMSE
    rmse_1 = rmse(y_pred_val, y_val)
    rmse_scores.append(rmse_1)

# Calculate standard deviation of RMSE scores
std_dev = np.std(rmse_scores)

# Round the result to 3 decimal digits
std_dev = round(std_dev, 3)

print(f"The standard deviation of RMSE scores is: {std_dev}")


In [42]:
# Fill missing values with 0
df2= df.fillna(0)

# Set the seed and shuffle the data
seed = 9
np.random.seed(seed)
n = len(df2)
idx = np.arange(n)
np.random.shuffle(idx)

# Split the data into train, validation, and test sets
n_test = int(n * 0.2)
n_val = int(n * 0.2)
n_train = n - n_test - n_val

df_train_2 = df2.iloc[idx[:n_train]]
df_val_2 = df2.iloc[idx[n_train:n_train + n_val]]
df_test_2 = df2.iloc[idx[n_train + n_val:]]

# Combine train and validation datasets
df_train_val = pd.concat([df_train_2, df_val_2], axis=0)

# Get the target values for each set
y_train_val = np.log1p(df_train_val['median_house_value'].values)
y_test = np.log1p(df_test_2['median_house_value'].values)

In [None]:
# Delete the target variable from the DataFrames
del df_train_val['median_house_value']
del df_test['median_house_value']

In [43]:
# Prepare X for train and validation sets
X_train_val = prepare_X(df_train_val)
X_test = prepare_X(df_test_2)

# Train a model with regularization parameter r=0.001
r = 0.001
w0, w = train_linear_regression_reg(X_train_val, y_train_val, r)

# Predict on test set
y_pred_test = w0 + X_test.dot(w)

# Calculate RMSE on test set
rmse_test = rmse(y_pred_test, y_test)
print(f"The RMSE on the test dataset is: {rmse_test}")

The RMSE on the test dataset is: 0.33278932696231767
