In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt

# Step 1: Import dataset
df = pd.read_csv("ames_house.csv")
print("Head:\n", df.head())
print("\nShape:", df.shape)
print("\nColumns:", df.columns)
print("\nData Types:\n", df.dtypes)
print("\nInfo:\n", df.info())
print("\nValue Counts:\n", df["SalePrice"].value_counts())

# Step 2: Predict Sale Price without Categorical features
X = df.drop(['BldgType', 'CentralAir', 'SalePrice'], axis=1)
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error (without Categorical features):", mse)

# Step 3: Create Scatter Plot
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Sale Price")
plt.ylabel("Predicted Sale Price")
plt.title("Scatter Plot between y_test and y_pred")
plt.show()

# Step 4: Encode Categorical columns
df_encoded = pd.get_dummies(df, columns=['BldgType', 'CentralAir'])

# Step 5: Predict Sale Price with Categorical features
X_encoded = df_encoded.drop('SalePrice', axis=1)
y_encoded = df_encoded['SalePrice']
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size=0.25, random_state=42)

model_encoded = LinearRegression()
model_encoded.fit(X_train_encoded, y_train_encoded)
y_pred_encoded = model_encoded.predict(X_test_encoded)
mse_encoded = mean_squared_error(y_test_encoded, y_pred_encoded)
print("\nMean Squared Error (with Categorical features):", mse_encoded)

# Step 6: Normalize using Standard Scaler and Predict Sale Price
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train_encoded)
y_pred_scaled = model_scaled.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test_encoded, y_pred_scaled)
print("\nMean Squared Error (Standard Scaler):", mse_scaled)

# Step 7: Normalize using MinMaxScaler and Predict Sale Price
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train_encoded)
X_test_minmax = minmax_scaler.transform(X_test_encoded)

model_minmax = LinearRegression()
model_minmax.fit(X_train_minmax, y_train_encoded)
y_pred_minmax = model_minmax.predict(X_test_minmax)
mse_minmax = mean_squared_error(y_test_encoded, y_pred_minmax)
print("\nMean Squared Error (MinMax Scaler):", mse_minmax)

# Step 8: Predict using SGD Regressor
sgd_scaler = StandardScaler()
X_train_sgd = sgd_scaler.fit_transform(X_train_encoded)
X_test_sgd = sgd_scaler.transform(X_test_encoded)

sgd_model = SGDRegressor()
sgd_model.fit(X_train_sgd, y_train_encoded)
y_pred_sgd = sgd_model.predict(X_test_sgd)
mse_sgd = mean_squared_error(y_test_encoded, y_pred_sgd)
print("\nMean Squared Error (SGD Regressor):", mse_sgd)

# Step 9: Predict using Ridge Regression
ridge_model = RidgeCV()
ridge_model.fit(X_train_sgd, y_train_encoded)
y_pred_ridge = ridge_model.predict(X_test_sgd)
mse_ridge = mean_squared_error(y_test_encoded, y_pred_ridge)
print("\nMean Squared Error (Ridge Regression):", mse_ridge)

# Step 10: Predict using Lasso Regression
lasso_model = LassoCV()
lasso_model.fit(X_train_sgd, y_train_encoded)
y_pred_lasso = lasso_model.predict(X_test_sgd)
mse_lasso = mean_squared_error(y_test_encoded, y_pred_lasso)
print("\nMean Squared Error (Lasso Regression):", mse_lasso)

# Step 11: RMSE
rmse_no_encoding = np.sqrt(mse)
rmse_encoding = np.sqrt(mse_encoded)
rmse_standard_scaled = np.sqrt(mse_scaled)
rmse_minmax_scaled = np.sqrt(mse_minmax)
rmse_sgd = np.sqrt(mse_sgd)
rmse_ridge = np.sqrt(mse_ridge)
rmse_lasso = np.sqrt(mse_lasso)

print("\nRMSE without one hot encoding:", rmse_no_encoding)
print("RMSE with One hot encoding:", rmse_encoding)
print("RMSE with OHE and Standard Scaling:", rmse_standard_scaled)
print("RMSE with OHE and MinMax Scaling:", rmse_minmax_scaled)
print("RMSE of SGDRegressor with OHE and Standard Scaler:", rmse_sgd)
print("RMSE of RidgeCV with OHE and Standard Scaler:", rmse_ridge)
print("RMSE of LassoCV with OHE and Standard Scaler:", rmse_lasso)


Head:
   BldgType CentralAir  SalePrice
0     1Fam          Y     215000
1     1Fam          Y     105000
2     1Fam          Y     172000
3     1Fam          Y     244000
4     1Fam          Y     189900

Shape: (2930, 3)

Columns: Index(['BldgType', 'CentralAir', 'SalePrice'], dtype='object')

Data Types:
 BldgType      object
CentralAir    object
SalePrice      int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   BldgType    2930 non-null   object
 1   CentralAir  2930 non-null   object
 2   SalePrice   2930 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 68.8+ KB

Info:
 None

Value Counts:
 SalePrice
135000    34
140000    33
130000    29
155000    28
145000    26
          ..
219990     1
159895     1
187687     1
217300     1
150900     1
Name: count, Length: 1032, dtype: int64


ValueError: at least one array or dtype is required