In [14]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn import set_config
set_config(display="diagram")

from random import shuffle, seed



import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [12]:
housing = pd.read_csv("C:\\Users\\17135\\OneDrive\\Documents\\Python Scripts\\Housing.csv")
housing.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [8]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [9]:
X =housing.drop('price', axis = 1)
y = housing['price']

In [26]:
numerical_cols = X.select_dtypes(include=['int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

### Preprocessing Data and Create a Pipeline

In [27]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Creating the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [28]:
model


### Holdout CV 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model and make predictions
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Holdout CV MSE: {mse}")

Holdout CV MSE: 1754318687330.668


### LOO CV

In [31]:
loocv = LeaveOneOut()
cv_results_loocv = cross_val_score(model, X, y, cv=loocv, scoring='neg_mean_squared_error')
print(f"LOOCV Mean MSE: {np.mean(cv_results_loocv)}, Standard Deviation: {np.std(cv_results_loocv)}")

LOOCV Mean MSE: -1182287713887.1106, Standard Deviation: 2604518248725.418


### K Fold

In [32]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results_kfold = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
print(f"K-Fold CV Mean MSE: {np.mean(cv_results_kfold)}, Standard Deviation: {np.std(cv_results_kfold)}")

K-Fold CV Mean MSE: -1214842339413.0137, Standard Deviation: 411949059579.46277
