In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.info

<bound method DataFrame.info of                   area_type   availability                  location  \
0      Super built-up  Area         19-Dec  Electronic City Phase II   
1                Plot  Area  Ready To Move          Chikka Tirupathi   
2            Built-up  Area  Ready To Move               Uttarahalli   
3      Super built-up  Area  Ready To Move        Lingadheeranahalli   
4      Super built-up  Area  Ready To Move                  Kothanur   
...                     ...            ...                       ...   
13315        Built-up  Area  Ready To Move                Whitefield   
13316  Super built-up  Area  Ready To Move             Richards Town   
13317        Built-up  Area  Ready To Move     Raja Rajeshwari Nagar   
13318  Super built-up  Area         18-Jun           Padmanabhanagar   
13319  Super built-up  Area  Ready To Move              Doddathoguru   

            size  society total_sqft  bath  balcony   price  
0          2 BHK  Coomee        1056   2.

In [5]:
for column in df.columns:
    print(df[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [6]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

## Handling missing data 
* We do not require area type "area_type","availability","society","balcony"to predict the price 
* There are some missing variables but the missing values are not many so instead of filling them , we will just drop them 

In [7]:
df.drop(columns=["area_type","availability","society","balcony"],inplace = True)

In [8]:
df = df.dropna(subset=['location','size','bath'])

In [9]:
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Richards Town,4 BHK,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [10]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

#### Range of data 
There is range of data **'1133 - 1384'** in the data set so we need to convet this range into a fixed number

In [11]:
def convert_range(x):
    temp = x.split('-')
    if len('temp')==2:
        return (float(temp[0])+float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [12]:
df.loc[:, 'total_sqft'] = df['total_sqft'].apply(convert_range)

In [13]:
df.loc[:,'bhk']=df['size'].str.split().str.get(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'bhk']=df['size'].str.split().str.get(0).astype(int)


In [14]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3
4,Kothanur,2 BHK,1200.0,2.0,51.00,2
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5
13316,Richards Town,4 BHK,3600.0,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4


In [15]:
df.isnull().sum()

location        0
size            0
total_sqft    190
bath            0
price           0
bhk             0
dtype: int64

In [16]:
df['total_sqft'] = df['total_sqft'].fillna(df['total_sqft'].mean())

  df['total_sqft'] = df['total_sqft'].fillna(df['total_sqft'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_sqft'] = df['total_sqft'].fillna(df['total_sqft'].mean())


In [17]:
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']


In [18]:
df['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13246, dtype: float64

In [19]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,Richards Town,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [20]:
 df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13246.0,13246.0,13246.0,13246.0,13246.0
mean,1554.458192,2.692586,112.389392,2.801902,7934.705
std,1229.564729,1.341506,149.076587,1.295758,106543.9
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4250.0
50%,1282.0,2.0,72.0,3.0,5444.165
75%,1660.0,3.0,120.0,3.0,7352.544
max,52272.0,40.0,3600.0,43.0,12000000.0


In [21]:
df.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
bhk               0
price_per_sqft    0
dtype: int64

In [22]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,Richards Town,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [23]:
location_count = df['location'].value_counts()
location_count

location
Whitefield           534
Sarjapur  Road       392
Electronic City      302
Kanakpura Road       266
Thanisandra          233
                    ... 
Vidyapeeta             1
Maruthi Extension      1
Okalipura              1
Old Town               1
Abshot Layout          1
Name: count, Length: 1304, dtype: int64

All the values less than 10 will be considered as **others**

In [24]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
Dairy Circle         10
Kalkere              10
Dodsworth Layout     10
Sadashiva Nagar      10
Naganathapura        10
                     ..
Vidyapeeta            1
Maruthi Extension     1
Okalipura             1
Old Town              1
Abshot Layout         1
Name: count, Length: 1063, dtype: int64

In [25]:
df['location'] =df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)
location_count = df['location'].value_counts()
location_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location'] =df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)


location
other                 2896
Whitefield             534
Sarjapur  Road         392
Electronic City        302
Kanakpura Road         266
                      ... 
Marsur                  11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

In [26]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13246.0,13246.0,13246.0,13246.0,13246.0
mean,1554.458192,2.692586,112.389392,2.801902,7934.705
std,1229.564729,1.341506,149.076587,1.295758,106543.9
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4250.0
50%,1282.0,2.0,72.0,3.0,5444.165
75%,1660.0,3.0,120.0,3.0,7352.544
max,52272.0,40.0,3600.0,43.0,12000000.0


We can see there is 1 sqft column in our dataset we need to filter out these **outliers**

In [27]:
df = df[((df['total_sqft']/df['bhk'])>=300)]

In [28]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12498.0,12498.0,12498.0,12498.0,12498.0
mean,1589.177818,2.56321,111.273449,2.648984,6326.470372
std,1252.476465,1.081187,152.090369,0.976932,4224.302842
min,300.0,1.0,9.0,1.0,267.829813
25%,1125.0,2.0,49.0,2.0,4199.070664
50%,1305.0,2.0,70.0,3.0,5294.736984
75%,1693.0,3.0,115.0,3.0,6949.660121
max,52272.0,16.0,3600.0,16.0,176470.588235


The price per sqft. still have outliers wiht its max value so we need to remove it.

In [29]:
def remove_outliers_sqft(data):
    df_output = pd.DataFrame()
    for key,subdf in data.groupby('location'):
        m=np.mean(subdf.price_per_sqft)
        s=np.std(subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft>(m-s)) & (subdf.price_per_sqft<=(m+s))]
        df_output = pd.concat([df_output,gen_df],ignore_index = True)
    return df_output
df = remove_outliers_sqft(df)
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10354.0,10354.0,10354.0,10354.0,10354.0
mean,1506.694782,2.475565,91.409577,2.57572,5669.926227
std,874.441683,0.974826,86.576969,0.892737,2287.249477
min,300.0,1.0,10.0,1.0,1250.0
25%,1111.25,2.0,49.0,2.0,4229.897746
50%,1290.0,2.0,67.0,2.0,5175.159236
75%,1650.0,3.0,100.0,3.0,6463.878327
max,30400.0,16.0,2200.0,16.0,24509.803922


We now need to remove outlies in bhk column

In [30]:
def bhk_outliers(data):
    exclude_indices = np.array([])
    for location, location_df in data.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):  # Grouping each location based on bhk
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        # if the price of n-bhk is less than mean of (n-1) bhk then we exclude it
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < stats['mean']].index.values)
    return data.drop(exclude_indices, axis='index')

# Assuming df is your dataframe
df = bhk_outliers(df)
print(df.describe())


         total_sqft         bath        price          bhk  price_per_sqft
count   7363.000000  7363.000000  7363.000000  7363.000000     7363.000000
mean    1498.569197     2.457151    99.519108     2.506994     6143.081574
std      860.757264     1.013843    93.740080     0.926268     2438.846096
min      300.000000     1.000000    10.000000     1.000000     1273.755712
25%     1100.000000     2.000000    50.000000     2.000000     4572.780532
50%     1260.000000     2.000000    74.000000     2.000000     5691.854760
75%     1680.000000     3.000000   115.000000     3.000000     6951.566952
max    30000.000000    16.000000  2200.000000    16.000000    24509.803922


In [31]:
df.shape

(7363, 7)

In [32]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
1,Devarachikkanahalli,2 BHK,1250.0,2.0,40.0,2,3200.000000
2,Devarachikkanahalli,2 Bedroom,1200.0,2.0,83.0,2,6916.666667
3,Devarachikkanahalli,2 BHK,1170.0,2.0,40.0,2,3418.803419
4,Devarachikkanahalli,3 BHK,1425.0,2.0,65.0,3,4561.403509
5,Devarachikkanahalli,2 BHK,947.0,2.0,43.0,2,4540.654699
...,...,...,...,...,...,...,...
10345,other,2 BHK,1200.0,2.0,70.0,2,5833.333333
10346,other,1 BHK,1800.0,1.0,200.0,1,11111.111111
10349,other,2 BHK,1353.0,2.0,110.0,2,8130.081301
10350,other,1 Bedroom,812.0,1.0,26.0,1,3201.970443


In [33]:
df = df.drop(columns=['size','price_per_sqft'])
#price_per_sqft was only to remove the outliers 
#no need of size since it is stored in bhk

In [34]:
df.to_csv("cleaned_data.csv")

PermissionError: [Errno 13] Permission denied: 'cleaned_data.csv'

## Data is cleaned 
### X stores input 
### y stores the output

In [None]:
X = df.drop(columns=['price'])
y = df['price']
X

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)

### Applying Linear Regression

In [None]:
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False), ['location']),
    remainder='passthrough'
)

In [None]:
scaler = StandardScaler()
lr = LinearRegression()
pipe = make_pipeline(column_trans,scaler,lr)
pipe.fit(X_train,y_train)

In [None]:
y_pred_lr = pipe.predict(X_test)

In [None]:
r2_score(y_test,y_pred_lr)

## Applying Lasso

In [None]:
lasso = Lasso()
pipe = make_pipeline(column_trans,scaler,lasso)
pipe.fit(X_train,y_train)

In [None]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)

## Applying Ridge

In [None]:
ridge = Ridge()
pipe = make_pipeline(column_trans,scaler,ridge)
pipe.fit(X_train,y_train)

In [None]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [None]:
import pickle

pickle.dump(pipe,open('RidgeModel.pkl','wb'))