In [1]:
import pandas as pd

# Read the housing data from a CSV file located relative to the working directory
csv_path = "Housing.csv"
df = pd.read_csv(csv_path)

# Preview the leading rows (five, which is also head()'s default)
df.head(n=5)


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# The target is the price column; every other column is used as a predictor
y = df["price"]
X = df.drop(columns="price")

# Columns to one-hot encode; anything not listed here is treated as numeric
categorical_cols = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Encode the categorical columns (dropping the first level of each) and
# forward all remaining columns untouched via remainder='passthrough'
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough',
)

# Bundle preprocessing and the linear regression into a single estimator so
# the encoder is fitted only on whatever data the pipeline is fitted on
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training portion, then predict prices for the held-out rows
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred


array([5164653.90033953, 7224722.29802166, 3109863.24240344,
       4612075.32722562, 3294646.25725962, 3532275.09556553,
       5611774.56836464, 6368145.98732723, 2722856.95689983,
       2629405.61585783, 9617039.5031557 , 2798087.30447893,
       3171096.76847059, 3394639.09125525, 3681088.65424279,
       5263187.74621481, 3035963.47612398, 4786122.80040048,
       4349551.92005717, 3572362.09930449, 5774875.21395654,
       5886993.5791988 , 2730836.19518465, 4727316.47323643,
       5244847.52716788, 7555324.21605586, 3220790.84680277,
       5191898.79934216, 8143726.91009756, 3398814.09825045,
       6490693.05027933, 3315105.90747819, 6708457.36761325,
       4201738.21071671, 3557571.06735188, 5836974.5047863 ,
       4808660.67448478, 4362878.73613269, 3191242.95701504,
       4596554.9322525 , 4566042.86048401, 3517779.52374155,
       7205844.79365835, 3983597.27861108, 3749338.70271051,
       4274731.09125893, 6757442.10783739, 4037320.43665854,
       3769334.90397115,

In [7]:
# R^2 of the predictions against the held-out targets, scaled to a percentage
100 * r2_score(y_test, y_pred)

65.29242642153143