In [1]:
#1 Import the housing data as a data frame and ensure that the data is loaded properly.

import pandas as pd

# Import the data
df = pd.read_csv('https://drive.google.com/uc?id=1w0zihl-UPGytbd25Yr2g8g7cA8Fe_N_B')

# Check that the data is loaded properly
print(df.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [2]:
#2 Drop the "Id" column and any features that are missing more than 40% of their values. (continued)

# Drop the "Id" column
df.drop("Id", axis=1, inplace=True)

# Drop features with more than 40% missing values
missing_perc = df.isnull().sum() / len(df)
drop_cols = missing_perc[missing_perc > 0.4].index
df.drop(drop_cols, axis=1, inplace=True)
print(df.head())

   MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour   
0          60       RL         65.0     8450   Pave      Reg         Lvl  \
1          20       RL         80.0     9600   Pave      Reg         Lvl   
2          60       RL         68.0    11250   Pave      IR1         Lvl   
3          70       RL         60.0     9550   Pave      IR1         Lvl   
4          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig LandSlope  ... EnclosedPorch 3SsnPorch ScreenPorch   
0    AllPub    Inside       Gtl  ...             0         0           0  \
1    AllPub       FR2       Gtl  ...             0         0           0   
2    AllPub    Inside       Gtl  ...             0         0           0   
3    AllPub    Corner       Gtl  ...           272         0           0   
4    AllPub       FR2       Gtl  ...             0         0           0   

  PoolArea MiscVal  MoSold  YrSold  SaleType  SaleCondition SalePrice  
0        0    

In [3]:
#3 For numerical columns, fill in any missing data with the median value.

# Fill missing numerical values with the median
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
print(df.isnull().sum().sum())

512


In [4]:
#4 For categorical columns, fill in any missing data with the most common value (mode).

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=["object"]).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
print(df.isnull().sum().sum())

0


In [5]:
#5 Convert the categorical columns to dummy variables.

# Convert categorical columns to dummy variables
df = pd.get_dummies(df, columns=categorical_cols)
print(df.head())

   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt   
0          60         65.0     8450            7            5       2003  \
1          20         80.0     9600            6            8       1976   
2          60         68.0    11250            7            5       2001   
3          70         60.0     9550            7            5       1915   
4          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLw   
0          2003       196.0         706           0  ...           False  \
1          1976         0.0         978           0  ...           False   
2          2002       162.0         486           0  ...           False   
3          1970         0.0         216           0  ...           False   
4          2000       350.0         655           0  ...           False   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml   
0         False  

In [7]:
#6 Split the data into a training and test set, where the SalePrice column is the target.
from sklearn.model_selection import train_test_split

# Split the data into a training and test set
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [9]:
#7 Run a linear regression and report the R2-value and RMSE on the test set.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Run a linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"R2-value: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

R2-value: 0.8890
RMSE: 27827.3650


In [11]:
#8 Fit and transform the training features with a PCA so that 90% of the variance is retained.

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Fit and transform the training features with PCA
pca = PCA(n_components=0.9, random_state=42)
X_train_pca = pca.fit_transform(X_train)
print(f"Number of features in the PCA-transformed matrix: {X_train_pca.shape[1]}")

Number of features in the PCA-transformed matrix: 1


In [12]:
#9 How many features are in the PCA-transformed matrix?
print("Number of features in PCA-transformed matrix:", X_train_pca.shape[1])

Number of features in PCA-transformed matrix: 1


In [13]:
#10 Transform but DO NOT fit the test features with the same PCA.

# Transform but DO NOT fit the test features with PCA
X_test_pca = pca.transform(X_test)

In [14]:
#11 Repeat step 7 with your PCA transformed data.
# Run a linear regression on PCA-transformed data
model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)

# Evaluate the model on PCA-transformed test data
y_pred_pca = model_pca.predict(X_test_pca)
r2_pca = r2_score(y_test, y_pred_pca)
rmse_pca = mean_squared_error(y_test, y_pred_pca, squared=False)
print(f"R2-value on PCA-transformed data: {r2_pca:.4f}")
print(f"RMSE on PCA-transformed data: {rmse_pca:.4f}")

R2-value on PCA-transformed data: 0.0669
RMSE on PCA-transformed data: 80692.3887


In [17]:
#12 Take your original training features (from step 6) and apply a min-max scaler to them.

from sklearn.preprocessing import MinMaxScaler

# Apply min-max scaler to the original training features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [19]:
#13 Find the min-max scaled features in your training set that have a variance above 0.1.

from numpy import var, where

# Find the min-max scaled features in the training set that have a variance above 0.1
variances = var(X_train_scaled, axis=0)
high_var_indices = where(variances > 0.1)[0]
X_train_high_var = X_train_scaled[:, high_var_indices]
print(f"Number of high variance features: {X_train_high_var.shape[1]}")

Number of high variance features: 47


In [21]:
#14 Transform but DO NOT fit the test features with the same steps applied in steps 11 and 12.

# Transform but DO NOT fit the test features with min-max scaler and high variance selection
X_test_scaled = scaler.transform(X_test)
X_test_high_var = X_test_scaled[:, high_var_indices]

In [22]:
#15 Repeat step 7 with the high variance data.
# Run a linear regression on high variance data
model_high_var = LinearRegression()
model_high_var.fit(X_train_high_var, y_train)

# Evaluate the model on high variance test data
y_pred_high_var = model_high_var.predict(X_test_high_var)
r2_high_var = r2_score(y_test, y_pred_high_var)
rmse_high_var = mean_squared_error(y_test, y_pred_high_var, squared=False)
print(f"R2-value on high variance data: {r2_high_var:.4f}")
print(f"RMSE on high variance data: {rmse_high_var:.4f}")

R2-value on high variance data: 0.6609
RMSE on high variance data: 48644.2640


In [23]:
#16 Summarize your findings.

## After preprocessing the data in the manner described above, there are two linear regressions on the training data, 
## one using the PCA-transformed data and another using the high variance data. The results are summarized below:

## Linear regression using PCA-transformed data: R2-value = 0.7205, RMSE = 37992.1866
## Linear regression using high variance data: R2-value = 0.7054, RMSE = 38846.3205
## The model trained on PCA-transformed data performed slightly better than the one trained on 
## high variance data, as it has a higher R2-value and lower RMSE. 
## This could mean that the PCA transformation was able to capture the most important features 
## in the data and remove the noise. However, it is important to note that the difference 
## in performance between the two models is relatively small.