In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
file_path = "C:/Users/Windows 11/Desktop/New folder (5)/housePricepredict.csv"
data = pd.read_csv(file_path)

# Displaying the first few rows of the dataset
print("Dataset Overview:")
print(data.head())

Dataset Overview:
   Id  MSSubClass MSZoning  LotArea LotConfig BldgType  OverallCond  \
0   0          60       RL     8450    Inside     1Fam            5   
1   1          20       RL     9600       FR2     1Fam            8   
2   2          60       RL    11250    Inside     1Fam            5   
3   3          70       RL     9550    Corner     1Fam            5   
4   4          60       RL    14260       FR2     1Fam            5   

   YearBuilt  YearRemodAdd Exterior1st  BsmtFinSF2  TotalBsmtSF  SalePrice  
0       2003          2003     VinylSd         0.0        856.0   208500.0  
1       1976          1976     MetalSd         0.0       1262.0   181500.0  
2       2001          2002     VinylSd         0.0        920.0   223500.0  
3       1915          1970     Wd Sdng         0.0        756.0   140000.0  
4       2000          2000     VinylSd         0.0       1145.0   250000.0  


In [3]:
# Handle missing values in X (features)
X = data.drop(columns=['SalePrice'])  # Features
X_numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
X_categorical_features = X.select_dtypes(include=['object']).columns

X_preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X_numeric_features),
        ('cat', SimpleImputer(strategy='constant', fill_value='missing'), X_categorical_features)
    ])

# Handle missing values in y (target)
y = data['SalePrice']  # Target
y_preprocessor = SimpleImputer(strategy='mean')

# Apply preprocessing to fill missing values
X = X_preprocessor.fit_transform(X)
y = y_preprocessor.fit_transform(np.array(y).reshape(-1, 1)).ravel()

# Convert X back to DataFrame for convenience (optional)
X = pd.DataFrame(X, columns=X_numeric_features.tolist() + X_categorical_features.tolist())

# Display the first few rows of preprocessed X and y
print("\nPreprocessed Features (X):")
print(X.head())
print("\nPreprocessed Target (y):")
print(y[:5])  # Displaying first 5 elements of y




Preprocessed Features (X):
    Id MSSubClass  LotArea OverallCond YearBuilt YearRemodAdd BsmtFinSF2  \
0  0.0       60.0   8450.0         5.0    2003.0       2003.0        0.0   
1  1.0       20.0   9600.0         8.0    1976.0       1976.0        0.0   
2  2.0       60.0  11250.0         5.0    2001.0       2002.0        0.0   
3  3.0       70.0   9550.0         5.0    1915.0       1970.0        0.0   
4  4.0       60.0  14260.0         5.0    2000.0       2000.0        0.0   

  TotalBsmtSF MSZoning LotConfig BldgType Exterior1st  
0       856.0       RL    Inside     1Fam     VinylSd  
1      1262.0       RL       FR2     1Fam     MetalSd  
2       920.0       RL    Inside     1Fam     VinylSd  
3       756.0       RL    Corner     1Fam     Wd Sdng  
4      1145.0       RL       FR2     1Fam     VinylSd  

Preprocessed Target (y):
[208500. 181500. 223500. 140000. 250000.]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of training and test sets
print("\nTraining set shape:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("\nTest set shape:")
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)


Training set shape:
X_train: (2335, 12)
y_train: (2335,)

Test set shape:
X_test: (584, 12)
y_test: (584,)


In [5]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X_numeric_features),
        ('cat', categorical_transformer, X_categorical_features)
    ])
print("\nNumeric features:")
print(X_numeric_features)
print("\nCategorical features:")
print(X_categorical_features)


Numeric features:
Index(['Id', 'MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'BsmtFinSF2', 'TotalBsmtSF'],
      dtype='object')

Categorical features:
Index(['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st'], dtype='object')


In [6]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])
# Display the defined model pipeline
print("\nModel Pipeline:")
print(model)


Model Pipeline:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Id', 'MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'BsmtFinSF2', 'TotalBsmtSF'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st'], dtype='object'))])),
                ('regressor', ElasticNet())])


In [7]:
model.fit(X_train, y_train)

print("\nModel fitted successfully.")


Model fitted successfully.


In [8]:
y_train_pred = model.predict(X_train)

# Display the predicted values on training set
print("\nPredictions on Training Set:")
print(y_train_pred)


Predictions on Training Set:
[165682.36379193 202683.07585379 187980.43673878 ... 156040.87615389
 173902.30069549 171063.65054082]


In [9]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Display the evaluation metrics for training set
print(f"\nTraining Set Performance:")
print(f"RMSE = {train_rmse}, R^2 = {train_r2}")


Training Set Performance:
RMSE = 47568.92651822236, R^2 = 0.2581225777812258


In [10]:
y_test_pred = model.predict(X_test)

# Display the predicted values on test set
print("\nPredictions on Test Set:")
print(y_test_pred)
print(y_test_pred.shape)


Predictions on Test Set:
[163428.84652816 181921.15598911 169787.81132705 184407.59024905
 191877.96661685 212867.63291935 144979.7527681  198648.58003773
 142856.7822485  192126.49687113 180237.4623223  190830.50638906
 172124.80080768 196552.95748526 177458.84237582 188428.88899899
 167395.13588316 189158.09959123 146338.03534985 162322.18466269
 194263.45537772 144498.40817536 147930.86417039 195519.06974176
 197898.49191495 191433.17782656 186481.47287    154472.95467181
 195541.23902371 164602.37527575 213795.01325452 166319.52592857
 190769.60483164 190502.65081227 198935.22328478 205753.59145057
 175012.13791424 169416.19176179 164441.89002305 167391.71725401
 144327.82906989 191555.90934535 213624.43370822 180122.37854495
 173883.17160631 182611.46852138 198092.7079779  181358.94409723
 176062.05101959 224724.7889     154569.75855037 180390.78918841
 148098.79691687 199124.47921518 174433.24516674 145197.13528658
 159422.05871391 175947.70897103 207196.61247349 171357.62756788

In [11]:
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

# Display the evaluation metrics for test set
print(f"\nTest Set Performance:")
print(f"RMSE = {test_rmse}, R^2 = {test_r2}")



Test Set Performance:
RMSE = 50355.19583064155, R^2 = 0.2890799497342048


In [13]:
print(f"\nTraining Set: RMSE = {train_rmse}, R^2 = {train_r2}")
print(f"Test Set: RMSE = {test_rmse}, R^2 = {test_r2}")



Training Set: RMSE = 47568.92651822236, R^2 = 0.2581225777812258
Test Set: RMSE = 50355.19583064155, R^2 = 0.2890799497342048
