<a href="https://colab.research.google.com/github/Vishvesh-Bhardwaj/House_Price_Prediction/blob/main/House_Price_Prediction_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (15,10)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [16]:
data = 'https://raw.githubusercontent.com/Vishvesh-Bhardwaj/House_Price_Prediction/main/Bengaluru%20House%20price%20data.csv'
df = pd.read_csv(data)

Describing about my DATASET:

The Bengaluru House Price dataset contains information about the sale prices of houses in Bengaluru, India. The dataset includes information on a variety of factors that can impact the sale price of a house, including the total area, number of bedrooms and bathrooms, location, age, and amenities such as swimming pool, garden, etc.

In [17]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [19]:
df.shape

(13320, 9)

In [26]:
# Preprocess the total_sqft column
def convert_to_sqft(x):
    tokens = x.split(' ')
    if len(tokens) == 2:
        value = float(tokens[0])
        unit = tokens[1]
        if unit == 'Sq. Meter':
            value *= 10.7639
        return value
    try:
        return float(x)
    except:
        return np.nan

In [34]:
df['total_sqft'].unique()


array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [35]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [36]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
30,Super built-up Area,19-Dec,Yelahanka,4 BHK,LedorSa,2100 - 2850,4.0,0.0,186.0
56,Built-up Area,20-Feb,Devanahalli,4 Bedroom,BrereAt,3010 - 3410,,,192.0
81,Built-up Area,18-Oct,Hennur Road,4 Bedroom,Gollela,2957 - 3450,,,224.5
122,Super built-up Area,18-Mar,Hebbal,4 BHK,SNontle,3067 - 8156,4.0,0.0,477.0
137,Super built-up Area,19-Mar,8th Phase JP Nagar,2 BHK,Vaarech,1042 - 1105,2.0,0.0,54.005
165,Super built-up Area,18-Dec,Sarjapur,2 BHK,Kinuerg,1145 - 1340,2.0,0.0,43.49
188,Super built-up Area,Ready To Move,KR Puram,2 BHK,MCvarar,1015 - 1540,2.0,0.0,56.8
224,Super built-up Area,19-Dec,Devanahalli,3 BHK,Jurdsig,1520 - 1740,,,74.82
410,Super built-up Area,Ready To Move,Kengeri,1 BHK,,34.46Sq. Meter,1.0,0.0,18.5
549,Super built-up Area,18-Sep,Hennur Road,2 BHK,Shxorm,1195 - 1440,2.0,0.0,63.77


In [37]:
def convert(x):
    tokens=x.split('-')
    if len(tokens)==2:
        return (float(tokens[0])+float(tokens[1]))//2
    try:
        return float(x)
    except:
        return None

In [38]:
df1=df.copy()
df1['total_sqft']=df1['total_sqft'].apply(convert)
df1=df1[df1['total_sqft'].notnull()]
df1.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440.0,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200.0,2.0,1.0,51.0


In [42]:
#Data Cleaning
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5472
total_sqft         0
bath              73
balcony          605
price              0
dtype: int64

In [43]:
#Removing null values
df2=df1.dropna()
df2.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [45]:
# Split the data into training and testing sets
X = df2[['total_sqft', 'bath', 'balcony']]
y = df2['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
# Train the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [47]:
# Make predictions on the testing set and compute the evaluation metrics
y_pred = lr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [48]:
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

MAE: 31.112726783676763
MSE: 6655.512831497066
RMSE: 81.58132648772676


NORMALIZATION

In [50]:
from sklearn.preprocessing import StandardScaler


In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [52]:
# Train the linear regression model
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

LinearRegression()

In [53]:
# Make predictions on the testing set and compute the evaluation metrics
y_pred = lr.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [54]:
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)


MAE: 31.11272678367713
MSE: 6655.512831497305
RMSE: 81.58132648772822
