In [236]:
import numpy as np 
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [237]:
df = pd.read_csv('bengaluru_house_prices.csv')
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [238]:
df.shape

(13320, 9)

In [239]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [240]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [241]:
# Drop Society column 
df.drop(columns=['society' , 'balcony','availability'],inplace=True)
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [242]:
cat_col = df.select_dtypes(include='O')
cat_col.head()

Unnamed: 0,area_type,location,size,total_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600
2,Built-up Area,Uttarahalli,3 BHK,1440
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521
4,Super built-up Area,Kothanur,2 BHK,1200


In [243]:
num_col = df.select_dtypes(exclude='O')
num_col

Unnamed: 0,bath,price
0,2.0,39.07
1,5.0,120.00
2,2.0,62.00
3,3.0,95.00
4,2.0,51.00
...,...,...
13315,4.0,231.00
13316,5.0,400.00
13317,2.0,60.00
13318,4.0,488.00


In [244]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [245]:
df['area_type'].value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

In [246]:
#  Data Transformation
area_type_dict = {'Super built-up  Area ': 1, 'Built-up  Area': 2, 'Plot  Area': 3, 'Carpet  Area': 4}



In [247]:
# Identify unmapped values
unmapped_values = df[~df['area_type'].isin(area_type_dict.keys())]['area_type'].unique()
print("Unmapped values:", unmapped_values)

Unmapped values: ['Super built-up  Area']


In [248]:

# Update dictionary or DataFrame values based on findings
# For example, if updating dictionary:
area_type_dict = {'Super built-up  Area': 1, 'Built-up  Area': 2, 'Plot  Area': 3, 'Carpet  Area': 4}

In [249]:
#apply mapping
df['area_type'] = df['area_type'].map(area_type_dict)
df['area_type']

0        1
1        3
2        2
3        1
4        1
        ..
13315    2
13316    1
13317    2
13318    1
13319    1
Name: area_type, Length: 13320, dtype: int64

In [250]:
df.isna().sum()

area_type      0
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [251]:
df['size'] = df['size'].fillna('2 BHK')

In [252]:
df['bath'] = df['bath'].fillna(df['bath'].median())

In [253]:
df['location'] = df['location'].fillna('Sarjapr Road')

In [254]:
df.isna().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [255]:
df['BHK'] = df['size'].str.split().str.get(0).astype(int)

In [256]:
df[df.BHK > 20 ]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,BHK
1718,1,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,3,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [257]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [258]:
def convertRange(x):
    temp = x.split('-')
    if len(temp)  == 2:
        return(float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None
    

In [259]:
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [260]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,BHK
0,1,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,3,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,1,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,1,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [261]:
df['price_per_sqft'] = df['price'] *100000 / df['total_sqft']

In [262]:
df['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [263]:
df.describe()

Unnamed: 0,area_type,total_sqft,bath,price,BHK,price_per_sqft
count,13320.0,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1.50518,1559.626694,2.688814,112.565627,2.802778,7907.501
std,0.770234,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,1.0,8.0,1.0,267.8298
25%,1.0,1100.0,2.0,50.0,2.0,4266.865
50%,1.0,1276.0,2.0,72.0,3.0,5434.306
75%,2.0,1680.0,3.0,120.0,3.0,7311.746
max,4.0,52272.0,40.0,3600.0,43.0,12000000.0


In [264]:
df['location'].value_counts()

location
Whitefield                                      540
Sarjapur  Road                                  399
Electronic City                                 302
Kanakpura Road                                  273
Thanisandra                                     234
                                               ... 
Maragondana Halli, kr puram, old madras road      1
Chikkajala                                        1
Udayagiri                                         1
pavitra paradise                                  1
Chikbasavanapura                                  1
Name: count, Length: 1306, dtype: int64

In [265]:
df['location'] = df['location'].apply(lambda x: x.strip())
location_count = df['location'].value_counts()

In [266]:
location_count

location
Whitefield                541
Sarjapur  Road            399
Electronic City           304
Kanakpura Road            273
Thanisandra               237
                         ... 
Xavier Layout               1
Ramanagara Channapatna      1
Maheswari Nagar             1
Hsr layout sector3          1
Thyagraj Nagar              1
Name: count, Length: 1295, dtype: int64

In [267]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
1st Block Koramangala     10
Dairy Circle              10
Nagadevanahalli           10
Sadashiva Nagar           10
Naganathapura             10
                          ..
Xavier Layout              1
Ramanagara Channapatna     1
Maheswari Nagar            1
Hsr layout sector3         1
Thyagraj Nagar             1
Name: count, Length: 1054, dtype: int64

In [268]:
df['location'] = df['location'].apply(lambda x : 'Other' if x in location_count_less_10 else x)


In [269]:
df['location'].value_counts()

location
Other                        2886
Whitefield                    541
Sarjapur  Road                399
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Marsur                         11
2nd Phase Judicial Layout      11
Thyagaraja Nagar               11
HAL 2nd Stage                  11
Name: count, Length: 242, dtype: int64

In [270]:
df.describe()

Unnamed: 0,area_type,total_sqft,bath,price,BHK,price_per_sqft
count,13320.0,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1.50518,1559.626694,2.688814,112.565627,2.802778,7907.501
std,0.770234,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,1.0,8.0,1.0,267.8298
25%,1.0,1100.0,2.0,50.0,2.0,4266.865
50%,1.0,1276.0,2.0,72.0,3.0,5434.306
75%,2.0,1680.0,3.0,120.0,3.0,7311.746
max,4.0,52272.0,40.0,3600.0,43.0,12000000.0


In [271]:
(df['total_sqft']/df['BHK']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [272]:
df = df[((df['total_sqft']/df['BHK']) >= 300)]

In [273]:
df.describe()

Unnamed: 0,area_type,total_sqft,bath,price,BHK,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1.426895,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,0.710543,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,1.0,300.0,1.0,8.44,1.0,267.829813
25%,1.0,1116.0,2.0,49.0,2.0,4210.526316
50%,1.0,1300.0,2.0,70.0,3.0,5294.117647
75%,2.0,1700.0,3.0,115.0,3.0,6916.666667
max,4.0,52272.0,16.0,3600.0,16.0,176470.588235


In [274]:
df.shape

(12530, 8)

In [275]:
df.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [276]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)

        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output,gen_df],ignore_index=True)
    return df_output
df = remove_outliers_sqft(df)
df.describe()

Unnamed: 0,area_type,total_sqft,bath,price,BHK,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1.355985,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,0.642326,880.694214,0.979449,86.342786,0.897649,2265.774749
min,1.0,300.0,1.0,10.0,1.0,1250.0
25%,1.0,1110.0,2.0,49.0,2.0,4244.897959
50%,1.0,1286.0,2.0,67.0,2.0,5175.600739
75%,2.0,1650.0,3.0,100.0,3.0,6428.571429
max,4.0,30400.0,16.0,2200.0,16.0,24509.803922


In [277]:
def BHK_outlier_remover(df):
    exclude_indices = np.array([])
    for location , location_df in df.groupby('location'):
        BHK_stats ={}
        for BHK,BHK_df in location_df.groupby('BHK'):
            BHK_stats[BHK] = {
            'mean': np.mean(BHK_df.price_per_sqft),
            'std' : np.std(BHK_df.price_per_sqft),
            'count': BHK_df.shape[0]
            }
        #print(location,BHK_stats)
        for BHK ,BHK_df in location_df.groupby('BHK'):
             stats = BHK_stats.get(BHK-1)
             if stats and stats['count']>5:
                 exclude_indices = np.append(exclude_indices,BHK_df[BHK_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

In [278]:
df =BHK_outlier_remover(df)


In [279]:
df.shape

(7361, 8)

In [280]:
df

Unnamed: 0,area_type,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,1,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.543860
1,1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,2,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...,...
10290,1,Yeshwanthpur,2 BHK,1195.0,2.0,100.0,2,8368.200837
10291,1,Yeshwanthpur,3 BHK,1692.0,3.0,108.0,3,6382.978723
10293,2,Yeshwanthpur,6 Bedroom,2500.0,5.0,185.0,6,7400.000000
10298,1,Yeshwanthpur,3 BHK,1855.0,3.0,135.0,3,7277.628032


In [281]:
df.drop(columns=['size','price_per_sqft'],inplace = True)

In [282]:
df

Unnamed: 0,area_type,location,total_sqft,bath,price,BHK
0,1,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1,1st Block Jayanagar,1875.0,2.0,235.0,3
3,2,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1,1st Block Jayanagar,1235.0,2.0,148.0,2
...,...,...,...,...,...,...
10290,1,Yeshwanthpur,1195.0,2.0,100.0,2
10291,1,Yeshwanthpur,1692.0,3.0,108.0,3
10293,2,Yeshwanthpur,2500.0,5.0,185.0,6
10298,1,Yeshwanthpur,1855.0,3.0,135.0,3


In [283]:
df.to_csv("Cleaned_data.csv")


In [284]:
x =df.drop(columns=['price'])
y = df[['price']]

In [285]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [286]:
# Split in training and testing
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [287]:
x_train.shape ,x_test.shape

((5888, 5), (1473, 5))

In [None]:
#APPLY Linear Regression

In [294]:
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False), ['location']),  # Apply OneHotEncoder to the 'location' column
    remainder='passthrough'  # Leave all other columns unchanged
)

In [296]:
scaler = StandardScaler()

In [299]:
lnr = LinearRegression()

In [301]:
pipe = make_pipeline(column_trans,scaler,lnr)

In [303]:
pipe.fit(x_train,y_train)

In [304]:
y_pred_lnr = pipe.predict(x_test)



In [305]:
r2_score(y_test ,y_pred_lnr)

0.8651198904989619

In [None]:
#APPLYING LASSO

In [308]:
lasso = Lasso()

In [309]:
pipe = make_pipeline(column_trans,scaler,lasso)

In [310]:
pipe.fit(x_train,y_train)

In [313]:
y_pred_lasso = pipe.predict(x_test)



In [314]:
r2_score(y_test ,y_pred_lasso)

0.8523619616146046

APPLYING RIDGE 

In [315]:
ridge = Ridge()

In [316]:
pipe = make_pipeline(column_trans,scaler,ridge)

In [317]:
pipe.fit(x_train,y_train)

In [318]:
y_pred_ridge = pipe.predict(x_test)



In [319]:
r2_score(y_test ,y_pred_ridge)

0.8651924527170949

In [321]:
print("LinearRegression :" , r2_score(y_test,y_pred_lnr)*100)
print("Lasso :" , r2_score(y_test,y_pred_lasso)*100)
print("Ridge:" , r2_score(y_test,y_pred_ridge)*100)

LinearRegression : 85.23619616146047
Lasso : 85.23619616146047
Ridge: 86.5192452717095


In [322]:
# Saving models
import os,joblib

In [323]:
os.makedirs('models',exist_ok=True)

In [324]:
lnr ,ridge , lasso

(LinearRegression(), Ridge(), Lasso())

In [325]:
joblib.dump(lnr,'./models/lineraRegression.lb')
joblib.dump(lasso,'./models/LassoRegression.lb')
joblib.dump(ridge,'./models/RidgeRegression.lb')

print("MODELS ARE SUCCESSFULLY SAVED")

MODELS ARE SUCCESSFULLY SAVED
