# Practical-5
Build a machine learning pipeline that predicts the
sale price of a house from features such as size,
number of bedrooms and bathrooms, location, and construction quality.

## Step-1: Importing Required Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## Step-2: Data Collection

In [4]:
# Read the house-sales CSV into a DataFrame. on_bad_lines='skip' makes pandas
# drop malformed rows instead of raising, so the loaded row count can be
# smaller than the number of lines in the file.
df = pd.read_csv(
    '/content/kc_house_data.csv',
    on_bad_lines='skip',
)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
# (rows, columns) of the loaded frame.
df.shape

(21613, 21)

In [9]:
# List the column labels present in the loaded frame.
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

## Step-3: Data Cleaning and Preprocessing

In [10]:
# Drop the 'id' and 'date' columns, which are not used as model features.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it is a no-op instead of a KeyError for
# labels that have already been removed.
df = df.drop(columns=['id', 'date'], errors='ignore')

In [12]:
# Count the null entries in every column (isna is the alias of isnull).
df.isna().sum()

Unnamed: 0,0
price,0
bedrooms,0
bathrooms,0
sqft_living,0
sqft_lot,0
floors,0
waterfront,0
view,0
condition,0
grade,0


## Step-4: Defining Features (X) and Target (y)

In [14]:
# Target is the 'price' column; every remaining column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

## Step-5: Splitting Data

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step-6: Building ML Pipeline

In [18]:
# Two-stage estimator: standardise the features, then fit a random forest
# regressor with a fixed seed so training is repeatable.
model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        (
            'regressor',
            RandomForestRegressor(
                n_estimators=150,
                max_depth=20,
                min_samples_split=5,
                random_state=42,
            ),
        ),
    ]
)

## Step-7: Model Training

In [19]:
# Fit every pipeline step (scaler, then regressor) on the training split only.
model.fit(X_train, y_train)

## Step-8: Making Predictions

In [21]:
# Predict sale prices for the held-out test rows.
y_pred = model.predict(X_test)

## Step-9: Model Evaluation

In [22]:
# Score the held-out predictions: mean squared error, its square root
# (RMSE, in the same units as the target), and the R² coefficient.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Labels use the intended Unicode characters (chart emoji, superscript two).
print("\n📊 Model Performance:")
print("Mean Squared Error (MSE):", round(mse, 2))
print("Root Mean Squared Error (RMSE):", round(rmse, 2))
print("R² Score:", round(r2, 3))


📊 Model Performance:
Mean Squared Error (MSE): 21810448494.7
Root Mean Squared Error (RMSE): 147683.61
R² Score: 0.856


## Step-10: Predicting New House Price (Example)

In [24]:
# Feature values for one hypothetical listing. The keys are written in the
# same order as the training feature columns.
house_features = {
    'bedrooms': 3,
    'bathrooms': 2,
    'sqft_living': 2000,
    'sqft_lot': 6000,
    'floors': 1,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'grade': 7,
    'sqft_above': 2000,
    'sqft_basement': 0,
    'yr_built': 2005,
    'yr_renovated': 0,
    'zipcode': 98105,
    'lat': 47.65,
    'long': -122.30,
    'sqft_living15': 2100,
    'sqft_lot15': 5000,
}

# A list holding a single record yields a one-row frame.
new_house = pd.DataFrame([house_features])

pred_price = model.predict(new_house)
print("Predicted Sale Price for new house: $", round(pred_price[0], 2))

Predicted Sale Price for new house: $ 702624.22
