<a href="https://colab.research.google.com/github/Vishwanathamrish/House-Price-Prediction/blob/main/Amitysoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
train_data = pd.read_csv("/content/train.csv")
avg_rent_data = pd.read_csv("/content/avg_rent.csv")
distance_data = pd.read_csv("/content/distfrom city centre.csv")

In [None]:
train_data

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...,...
10651,10651,Plot Area,Ready To Move,Parappana Agrahara,1 Bedroom,,1200,1.0,0.0,45.00
10652,10652,Super built-up Area,Ready To Move,Bhoganhalli,2 BHK,Srhemay,1260,2.0,1.0,69.90
10653,10653,Super built-up Area,Ready To Move,Doddanakundi Industrial Area 2,2 BHK,,1092,2.0,1.0,49.00
10654,10654,Super built-up Area,Ready To Move,Kachanayakanahalli,2 BHK,ICoolon,827,2.0,0.0,34.00


In [None]:
avg_rent_data

Unnamed: 0,location,avg_2bhk_rent
0,Krishnarajapura,11954
1,Sarjapur,45000
2,Whitefield Hope Farm Junction,26370
3,Devanahalli,17302
4,Whitefield,14981
...,...,...
152,Bagaluru,8500
153,seegehalli,12000
154,Rayasandra,11000
155,JP Nagar Phase 6,21500


In [None]:
distance_data

Unnamed: 0,location,dist_from_city
0,Whitefield,17.3
1,Sarjapur Road,17.2
2,Electronic City,18.1
3,Kanakpura Road,26.5
4,Thanisandra,11.5
...,...,...
495,Kirloskar Layout,19.0
496,Sai Gardens,24.7
497,Raja Rajeshwari Nagar 5th Stage,16.4
498,4th Phase JP Nagar,10.5


In [None]:
train_data = train_data.merge(avg_rent_data, on='location', how='left')
train_data = train_data.merge(distance_data, on='location', how='left')


In [None]:
train_data

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price,avg_2bhk_rent,dist_from_city
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,11500.0,19.3
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00,,34.6
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00,19750.0,12.9
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00,,21.4
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00,16375.0,11.8
...,...,...,...,...,...,...,...,...,...,...,...,...
10651,10651,Plot Area,Ready To Move,Parappana Agrahara,1 Bedroom,,1200,1.0,0.0,45.00,,14.6
10652,10652,Super built-up Area,Ready To Move,Bhoganhalli,2 BHK,Srhemay,1260,2.0,1.0,69.90,,15.3
10653,10653,Super built-up Area,Ready To Move,Doddanakundi Industrial Area 2,2 BHK,,1092,2.0,1.0,49.00,,13.4
10654,10654,Super built-up Area,Ready To Move,Kachanayakanahalli,2 BHK,ICoolon,827,2.0,0.0,34.00,,


In [None]:
train_data.fillna({
    'avg_2bhk_rent': train_data['avg_2bhk_rent'].median(),
    'dist_from_city': train_data['dist_from_city'].median(),
    'balcony': train_data['balcony'].median(),
    'bath': train_data['bath'].median()
}, inplace=True)


In [None]:
train_data['BHK'] = train_data['size'].str.extract('(\\d+)').astype(float)


In [None]:
train_data['BHK']

Unnamed: 0,BHK
0,2.0
1,4.0
2,3.0
3,3.0
4,2.0
...,...
10651,1.0
10652,2.0
10653,2.0
10654,2.0


In [None]:
train_data['availability'] = train_data['availability'].apply(lambda x: 1 if x == 'Ready To Move' else 0)

In [None]:
train_data['availability']

Unnamed: 0,availability
0,0
1,1
2,1
3,1
4,1
...,...
10651,1
10652,1
10653,1
10654,1


In [None]:
train_data['total_sqft'] = train_data['total_sqft'].apply(lambda x: np.mean([float(i) for i in str(x).split('-')]) if '-' in str(x) else x)
train_data['total_sqft'] = pd.to_numeric(train_data['total_sqft'], errors='coerce')


In [None]:
train_data['total_sqft']

Unnamed: 0,total_sqft
0,1056.0
1,2600.0
2,1440.0
3,1521.0
4,1200.0
...,...
10651,1200.0
10652,1260.0
10653,1092.0
10654,827.0


In [None]:
X = train_data.drop(columns=['price', 'size', 'society', 'location'])
X = pd.get_dummies(X, columns=['area_type'], drop_first=True)
y = train_data['price']

In [None]:
X

Unnamed: 0,ID,availability,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,BHK,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,0,0,1056.0,2.0,1.0,11500.0,19.3,2.0,False,False,True
1,1,1,2600.0,5.0,3.0,14981.0,34.6,4.0,False,True,False
2,2,1,1440.0,2.0,3.0,19750.0,12.9,3.0,False,False,False
3,3,1,1521.0,3.0,1.0,14981.0,21.4,3.0,False,False,True
4,4,1,1200.0,2.0,1.0,16375.0,11.8,2.0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
10651,10651,1,1200.0,1.0,0.0,14981.0,14.6,1.0,False,True,False
10652,10652,1,1260.0,2.0,1.0,14981.0,15.3,2.0,False,False,True
10653,10653,1,1092.0,2.0,1.0,14981.0,13.4,2.0,False,False,True
10654,10654,1,827.0,2.0,0.0,14981.0,14.4,2.0,False,False,True


In [None]:
y

Unnamed: 0,price
0,39.07
1,120.00
2,62.00
3,95.00
4,51.00
...,...
10651,45.00
10652,69.90
10653,49.00
10654,34.00


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
numerical_features = ['total_sqft', 'bath', 'balcony', 'avg_2bhk_rent', 'dist_from_city', 'BHK']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ], remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

In [None]:
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
predictions = pipeline.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, predictions))
print(f"Validation RMSE: {rmse}")


Validation RMSE: 85.74195793421049


In [None]:
predictions

array([ 44.63595, 100.9184 ,  67.5719 , ..., 116.00265,  68.99   ,
        49.5335 ])

In [None]:
rmse

85.74195793421049

In [None]:
test_data = pd.read_csv("/content/test.csv")
test_data = test_data.merge(avg_rent_data, on='location', how='left')
test_data = test_data.merge(distance_data, on='location', how='left')
test_data.fillna({
    'avg_2bhk_rent': test_data['avg_2bhk_rent'].median(),
    'dist_from_city': test_data['dist_from_city'].median(),
    'balcony': test_data['balcony'].median(),
    'bath': test_data['bath'].median()
}, inplace=True)
test_data['BHK'] = test_data['size'].str.extract('(\\d+)').astype(float)
test_data['availability'] = test_data['availability'].apply(lambda x: 1 if x == 'Ready To Move' else 0)
test_data['total_sqft'] = test_data['total_sqft'].apply(lambda x: np.mean([float(i) for i in str(x).split('-')]) if '-' in str(x) else x)
test_data['total_sqft'] = pd.to_numeric(test_data['total_sqft'], errors='coerce')
test_data = pd.get_dummies(test_data, columns=['area_type'], drop_first=True)

In [None]:
final_predictions = pipeline.predict(test_data)

In [None]:
final_predictions


array([ 49.7745 ,  69.8168 , 120.7215 , ...,  53.5458 , 542.915  ,
        23.77315])

In [None]:
submission = pd.DataFrame({"Id": test_data.index, "Predicted Price": final_predictions})
submission.to_csv("predicted_prices.csv", index=False)

In [None]:
submission

Unnamed: 0,Id,Predicted Price
0,0,49.77450
1,1,69.81680
2,2,120.72150
3,3,50.66325
4,4,68.81185
...,...,...
2659,2659,203.56000
2660,2660,439.47000
2661,2661,53.54580
2662,2662,542.91500


In [None]:
submission.iloc[[20]]

Unnamed: 0,Id,Predicted Price
20,20,50.3559
