In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [26]:
# Step 1: Load the dataset
data = pd.read_csv('/content/house_price_data_1000.csv')
data.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,AgeOfHouse,DistanceToCityCenter,Garage,Pool,SalePrice
0,1360,2,1,32,14.253712,1,0,173745
1,4272,3,3,67,14.177796,0,0,474041
2,3592,1,2,67,21.884937,0,1,78885
3,966,1,2,28,49.028253,1,0,221128
4,4926,2,1,44,3.841376,0,1,446419


In [27]:
# Step 2: Data Preprocessing
# Handle missing values (you can adjust the method based on your dataset)
data.fillna(method='ffill', inplace=True)
# One-hot encode categorical variables
data = pd.get_dummies(data, drop_first=True)
data.head()

  data.fillna(method='ffill', inplace=True)


Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,AgeOfHouse,DistanceToCityCenter,Garage,Pool,SalePrice
0,1360,2,1,32,14.253712,1,0,173745
1,4272,3,3,67,14.177796,0,0,474041
2,3592,1,2,67,21.884937,0,1,78885
3,966,1,2,28,49.028253,1,0,221128
4,4926,2,1,44,3.841376,0,1,446419


In [28]:
# Feature-target split
X = data.drop('SalePrice', axis=1)
X.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,AgeOfHouse,DistanceToCityCenter,Garage,Pool
0,1360,2,1,32,14.253712,1,0
1,4272,3,3,67,14.177796,0,0
2,3592,1,2,67,21.884937,0,1
3,966,1,2,28,49.028253,1,0
4,4926,2,1,44,3.841376,0,1


In [29]:
y = data['SalePrice']
y.head()

Unnamed: 0,SalePrice
0,173745
1,474041
2,78885
3,221128
4,446419


In [30]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,AgeOfHouse,DistanceToCityCenter,Garage,Pool
29,2028,2,3,29,31.370615,1,0
535,3519,5,3,29,43.179047,1,1
695,4507,2,3,67,29.963035,1,0
557,3371,4,2,17,9.186836,0,1
836,2871,5,1,47,20.065336,0,1


In [31]:
X_test.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,AgeOfHouse,DistanceToCityCenter,Garage,Pool
521,4012,3,1,73,31.387925,1,1
737,2310,3,1,65,45.169445,1,1
740,4708,1,3,98,33.103737,0,0
660,4932,2,1,13,17.581645,1,1
411,3646,1,1,42,32.650692,1,0


In [32]:
y_train.head()

Unnamed: 0,SalePrice
29,190399
535,123606
695,348162
557,219957
836,484044


In [33]:
y_test.head()

Unnamed: 0,SalePrice
521,74502
737,378634
740,306323
660,399880
411,249665


In [36]:
# Step 3: Model 1 - Linear Regression using Scikit-learn
lr_model = LinearRegression()

In [40]:
lr_model.fit(X_train, y_train)
lr_model

In [41]:
# Evaluation - Mean Squared Error
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f"Linear Regression Mean Squared Error: {mse_lr}")

Linear Regression Mean Squared Error: 14406209423.28844


In [42]:
# Predictions
y_pred_lr = lr_model.predict(X_test)
y_pred_lr

array([280413.02749981, 270990.63442486, 274394.23865373, 293725.13396183,
       292682.79137476, 273762.48558369, 272539.73310166, 274187.16896923,
       294419.52903833, 284296.09658955, 281631.58045091, 272695.75910573,
       285294.07236065, 283961.394622  , 297308.1708301 , 281986.56076139,
       279417.33862292, 283145.64840142, 270019.89025443, 259459.03594403,
       282066.56840199, 275093.45348346, 274768.71453487, 264827.62759516,
       255166.35866017, 257401.02634534, 267693.59110513, 275772.76646879,
       259902.18806202, 254825.87913976, 272185.81189639, 290379.1352762 ,
       279638.38354311, 282280.01461231, 275307.64487661, 292955.64193843,
       269145.39475605, 274801.82465979, 274874.92058228, 271445.63604563,
       263431.08223286, 259729.62888102, 266837.44972243, 269455.76913184,
       287456.15661649, 258739.40500821, 274562.5949542 , 282356.26030968,
       267368.65072236, 288605.78268968, 272980.01212534, 259025.86623415,
       281161.16646038, 2

In [43]:
lr_model.score(X_train, y_train)

0.008706554838243785