In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

### Step 0. Data Load from CSV file (Hyderabad house rent data)

In [3]:
df1 = pd.read_csv("Hyderabad_HouseRent_Data.csv", index_col=0)
df1.head(10)

Unnamed: 0,bedrooms,bathrooms,furnish,tennants,total_sqft,price,location
0,3 BHK Builder Floor,2,Furnished,Bachelors/Family,1800 sqft,34000,"Bhagyalaxmi Nagar, Kavadiguda"
1,3 BHK Apartment,2,Semi-Furnished,Family,2500 sqft,45000,"Gachibowli, Outer Ring Road"
2,1 BHK Builder Floor,Immediately,Furnished,Bachelors/Family,read more,18000,Gachibowli
3,3 BHK Apartment,Immediately,Furnished,Bachelors/Family,2160 sqft,40000,"Moosapet, NH"
4,3 BHK Apartment,2,Semi-Furnished,Family,1580 sqft,23000,Raghavendra Colony kondapur
5,3 BHK Apartment,2,Furnished,Bachelors/Family,2600 sqft,50000,"Banjara Darwaja, Golconda Fort"
6,3 BHK Service Apartment,1,Semi-Furnished,Bachelors/Family,1350 sqft,32000,"Mohans Sai Signature, Nanakramguda"
7,Studio Apartment,1,Furnished,Bachelors/Family,read more,13000,Gowlidoddy
8,3 BHK Apartment,2,Furnished,Bachelors/Family,1700 sqft,35000,"Somajiguda, NH"
9,3 BHK Service Apartment,1,Unfurnished,Family,1150 sqft,11000,Kapra


In [4]:
df1.shape

(1171, 7)

## Data Cleaning

In [5]:
df2 = df1.drop(["bathrooms", "tennants"], axis="columns")

In [6]:
df2.shape

(1171, 5)

In [7]:
df2.head(5)

Unnamed: 0,bedrooms,furnish,total_sqft,price,location
0,3 BHK Builder Floor,Furnished,1800 sqft,34000,"Bhagyalaxmi Nagar, Kavadiguda"
1,3 BHK Apartment,Semi-Furnished,2500 sqft,45000,"Gachibowli, Outer Ring Road"
2,1 BHK Builder Floor,Furnished,read more,18000,Gachibowli
3,3 BHK Apartment,Furnished,2160 sqft,40000,"Moosapet, NH"
4,3 BHK Apartment,Semi-Furnished,1580 sqft,23000,Raghavendra Colony kondapur


In [8]:
df2.isnull().sum()

bedrooms       0
furnish        0
total_sqft    22
price          0
location       0
dtype: int64

In [9]:
df2.dropna(how="any", axis=0, inplace=True)

In [10]:
df2.isnull().sum()

bedrooms      0
furnish       0
total_sqft    0
price         0
location      0
dtype: int64

In [11]:
df2.head(5)

Unnamed: 0,bedrooms,furnish,total_sqft,price,location
0,3 BHK Builder Floor,Furnished,1800 sqft,34000,"Bhagyalaxmi Nagar, Kavadiguda"
1,3 BHK Apartment,Semi-Furnished,2500 sqft,45000,"Gachibowli, Outer Ring Road"
2,1 BHK Builder Floor,Furnished,read more,18000,Gachibowli
3,3 BHK Apartment,Furnished,2160 sqft,40000,"Moosapet, NH"
4,3 BHK Apartment,Semi-Furnished,1580 sqft,23000,Raghavendra Colony kondapur


In [12]:
df3 = df2[~df2["furnish"].str.contains("Family", case=False)]

In [13]:
df3.furnish.unique()

array(['Furnished', 'Semi-Furnished', 'Unfurnished'], dtype=object)

In [14]:
df3.furnish.value_counts()

Semi-Furnished    614
Unfurnished       327
Furnished         181
Name: furnish, dtype: int64

## Feature Engineering (creating another variable here as 'bhk')

In [15]:
df3['bhk']=df3['bedrooms'].apply(lambda x:int(x.split(' ')[0]) if x.split(' ')[0].isnumeric() else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk']=df3['bedrooms'].apply(lambda x:int(x.split(' ')[0]) if x.split(' ')[0].isnumeric() else 0)


In [16]:
df3.bhk.value_counts()

3    518
2    406
0    123
1     54
4     20
5      1
Name: bhk, dtype: int64

In [17]:
df3.shape

(1122, 6)

In [18]:
df4 = df3[~df3["bhk"].isin([0,5])]

In [19]:
df4.bhk.value_counts()

3    518
2    406
1     54
4     20
Name: bhk, dtype: int64

In [20]:
df4.drop("bedrooms", axis="columns", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4.drop("bedrooms", axis="columns", inplace=True)


In [21]:
df4.head()

Unnamed: 0,furnish,total_sqft,price,location,bhk
0,Furnished,1800 sqft,34000,"Bhagyalaxmi Nagar, Kavadiguda",3
1,Semi-Furnished,2500 sqft,45000,"Gachibowli, Outer Ring Road",3
2,Furnished,read more,18000,Gachibowli,1
3,Furnished,2160 sqft,40000,"Moosapet, NH",3
4,Semi-Furnished,1580 sqft,23000,Raghavendra Colony kondapur,3


In [22]:
df4.location.unique()

array([' Bhagyalaxmi Nagar, Kavadiguda ', ' Gachibowli, Outer Ring Road ',
       ' Gachibowli', ' Moosapet, NH ', ' Raghavendra Colony kondapur ',
       ' Banjara Darwaja, Golconda Fort ',
       ' Mohans Sai Signature, Nanakramguda ', ' Somajiguda, NH ',
       ' Kapra ', ' Tilak Nagar, New Nallakunta, NH ',
       ' Neknampur, Manikonda, Outer Ring Road ',
       ' Mayuri Nagar, Miyapur ',
       ' Accurate Wind Chimes, Narsingi, Outer Ring Road ',
       ' Hyder Nagar, NH ',
       ' Vajras Jasmine County, Financial district gucchibowli ',
       ' Tirumala Gardens, Banjara Hills, NH ',
       ' Paragon Venkatadri Apartments, Kachiguda, NH ',
       ' Meenakshi Homes, Miyapur, NH ', ' Papulguda ',
       ' Bachupally, Outer Ring Road ', ' Nanakram Guda ',
       ' L&T Serene County, Gachibowli, Outer Ring Road ',
       ' SM Plaza, ECIL ', ' Chatta Bazaar, Pathar Gatti ',
       ' Lodha Meridian, Kukatpally Housing Board Colony, NH ',
       ' Hitech City ', ' Banjara Hills, NH ',

In [23]:
df4['total_sqft']=df4['total_sqft'].apply(lambda x:int(x.split(' ')[0]) if x.split(' ')[0].isnumeric() else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['total_sqft']=df4['total_sqft'].apply(lambda x:int(x.split(' ')[0]) if x.split(' ')[0].isnumeric() else 0)


In [24]:
df4.head()

Unnamed: 0,furnish,total_sqft,price,location,bhk
0,Furnished,1800,34000,"Bhagyalaxmi Nagar, Kavadiguda",3
1,Semi-Furnished,2500,45000,"Gachibowli, Outer Ring Road",3
2,Furnished,0,18000,Gachibowli,1
3,Furnished,2160,40000,"Moosapet, NH",3
4,Semi-Furnished,1580,23000,Raghavendra Colony kondapur,3


## Outlier removal (deleting some rows based the total sqft values)

In [25]:
df5 = df4[~((df4["total_sqft"]>=0) & (df4["total_sqft"]<=300))]

In [26]:
df5.head()

Unnamed: 0,furnish,total_sqft,price,location,bhk
0,Furnished,1800,34000,"Bhagyalaxmi Nagar, Kavadiguda",3
1,Semi-Furnished,2500,45000,"Gachibowli, Outer Ring Road",3
3,Furnished,2160,40000,"Moosapet, NH",3
4,Semi-Furnished,1580,23000,Raghavendra Colony kondapur,3
5,Furnished,2600,50000,"Banjara Darwaja, Golconda Fort",3


In [27]:
df5.shape

(895, 5)

In [28]:
df5.total_sqft.describe()

count     895.000000
mean     1499.756425
std       480.535504
min       500.000000
25%      1161.000000
50%      1436.000000
75%      1750.000000
max      6000.000000
Name: total_sqft, dtype: float64

In [29]:
len(df5['location'].unique())

473

## Dimensionality Reduction (based on location frequency)

In [30]:
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)

In [31]:
pd.set_option('display.max_rows', None)
print(location_stats)

location
 Attapur                                                                       21
 My Home Avatar, Narsingi, Outer Ring Road                                     17
 Nizampet                                                                      16
 Kondapur                                                                      15
 Gachibowli, Outer Ring Road                                                   13
 Bachupally, Outer Ring Road                                                   12
 Manikonda, Outer Ring Road                                                    10
 Banjara Hills, NH                                                             10
 Aditya Imperial Heights, Hafeezpet, NH                                         9
 Gajularamaram                                                                  9
 Narsingi, Outer Ring Road                                                      9
 Toli Chowki                                                                    9
 Chanda

In [32]:
len(location_stats[location_stats<5])-473

-34

In [33]:
location_stats_less_than_5 = location_stats[location_stats<5]

In [34]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_5 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_5 else x)


In [35]:
len(df5['location'].unique())

35

In [36]:
df5.tail()

Unnamed: 0,furnish,total_sqft,price,location,bhk
1163,Semi-Furnished,1200,20000,other,2
1164,Semi-Furnished,1480,34000,other,3
1166,Unfurnished,900,15000,other,2
1168,Unfurnished,1515,14000,other,3
1170,Semi-Furnished,1100,12000,other,2


In [44]:
df5["furnish2"] = df5["furnish"].map({'Unfurnished':1, 'Semi-Furnished':2, 'Furnished':3})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5["furnish2"] = df5["furnish"].map({'Unfurnished':1, 'Semi-Furnished':2, 'Furnished':3})


In [45]:
df6=df5.drop("furnish", axis="columns")

In [47]:
df6.rename(columns={'furnish2': 'furnish'}, inplace=True)

In [48]:
df6.head()

Unnamed: 0,total_sqft,price,location,bhk,furnish
0,1800,34000,other,3,3
1,2500,45000,"Gachibowli, Outer Ring Road",3,2
3,2160,40000,other,3,3
4,1580,23000,other,3,2
5,2600,50000,other,3,3


## One hot encoding (i.e. converting categorical var to numeric using get_dummies)

In [49]:
dummies = pd.get_dummies(df6["location"])

In [50]:
dummies.head()

Unnamed: 0,"Aditya Imperial Heights, Hafeezpet, NH","Ameerpet, NH",Attapur,"Bachupally, Outer Ring Road",Bandlaguda Jagir,"Banjara Hills, NH",Begumpet,"Chandanagar, NH",Dr A.S. Rao Nagar,"Gachibowli, Outer Ring Road",...,"My Home Avatar, Narsingi, Outer Ring Road","My Home Vihanga, Gachibowli, Outer Ring Road","Narsingi, Outer Ring Road",Nizampet,Puppalaguda,Shaikpet,Suchitra Circle,Toli Chowki,"Upparpally, Hyderabad Expressway",other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
df7 = pd.concat([df6, dummies.drop("other", axis="columns")], axis="columns")

In [52]:
df8 = df7.drop("location", axis="columns")

In [53]:
df8["price"]

0       34,000
1       45,000
3       40,000
4       23,000
5       50,000
6       32,000
8       35,000
9       11,000
10      19,000
11      30,000
12      18,000
13      30,000
14      31,000
15      49,000
16      38,000
17      18,000
18      26,000
19      25,000
20      12,000
21      34,000
22      39,000
23      37,000
24      13,000
25      35,000
26      45,000
27      38,000
28      45,000
29      35,000
30      35,000
31      35,000
32      40,000
33      25,000
34      25,000
36      19,000
37      35,000
38      18,000
39      40,000
40      24,000
41      35,000
42      30,000
44      16,000
45      20,000
46      12,000
47      28,000
48      17,000
49      45,000
50      15,500
51      16,000
52      20,000
53      36,000
54      45,000
56      20,000
57      15,000
58      30,000
59      35,000
60      36,000
61      13,000
62      35,000
63      22,000
64      25,000
65      30,000
66      17,500
67      44,000
68      24,000
69      43,000
70      35,000
71      25

In [54]:
df8['price'] = pd.to_numeric(df8['price'].str.replace(',', ''), errors='coerce')

In [55]:
df8 = df8.reset_index(drop=True)


In [66]:
df8.rename(columns=lambda x: x.strip(), inplace=True)

In [67]:
X = df8.drop("price", axis="columns")
y = df8["price"]

In [68]:
X.head()

Unnamed: 0,total_sqft,bhk,furnish,"Aditya Imperial Heights, Hafeezpet, NH","Ameerpet, NH",Attapur,"Bachupally, Outer Ring Road",Bandlaguda Jagir,"Banjara Hills, NH",Begumpet,...,"Miyapur, NH","My Home Avatar, Narsingi, Outer Ring Road","My Home Vihanga, Gachibowli, Outer Ring Road","Narsingi, Outer Ring Road",Nizampet,Puppalaguda,Shaikpet,Suchitra Circle,Toli Chowki,"Upparpally, Hyderabad Expressway"
0,1800,3,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2500,3,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2160,3,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1580,3,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2600,3,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
y

0      34000
1      45000
2      40000
3      23000
4      50000
5      32000
6      35000
7      11000
8      19000
9      30000
10     18000
11     30000
12     31000
13     49000
14     38000
15     18000
16     26000
17     25000
18     12000
19     34000
20     39000
21     37000
22     13000
23     35000
24     45000
25     38000
26     45000
27     35000
28     35000
29     35000
30     40000
31     25000
32     25000
33     19000
34     35000
35     18000
36     40000
37     24000
38     35000
39     30000
40     16000
41     20000
42     12000
43     28000
44     17000
45     45000
46     15500
47     16000
48     20000
49     36000
50     45000
51     20000
52     15000
53     30000
54     35000
55     36000
56     13000
57     35000
58     22000
59     25000
60     30000
61     17500
62     44000
63     24000
64     43000
65     35000
66     25000
67     25000
68     30000
69     12000
70     50000
71     48000
72     30000
73     19000
74     38000
75     35000
76     10000

In [70]:
X.columns

Index(['total_sqft', 'bhk', 'furnish',
       'Aditya Imperial Heights, Hafeezpet, NH', 'Ameerpet, NH', 'Attapur',
       'Bachupally, Outer Ring Road', 'Bandlaguda Jagir', 'Banjara Hills, NH',
       'Begumpet', 'Chandanagar, NH', 'Dr A.S. Rao Nagar',
       'Gachibowli, Outer Ring Road', 'Gajularamaram', 'Hafeezpet, NH',
       'Hitech City', 'Hyder Nagar, NH',
       'Jains Carlton Creek, Gachibowli, Outer Ring Road',
       'Kokapet, Outer Ring Road', 'Kondapur',
       'Kukatpally Housing Board Colony, NH', 'Kukatpally, NH', 'LB Nagar, NH',
       'Manikonda, Outer Ring Road',
       'Mantri Celestia, Gachibowli, Outer Ring Road', 'Masab Tank',
       'Mehdipatnam', 'Miyapur, NH',
       'My Home Avatar, Narsingi, Outer Ring Road',
       'My Home Vihanga, Gachibowli, Outer Ring Road',
       'Narsingi, Outer Ring Road', 'Nizampet', 'Puppalaguda', 'Shaikpet',
       'Suchitra Circle', 'Toli Chowki', 'Upparpally, Hyderabad Expressway'],
      dtype='object')

## Building regression model by splitting data for training and testing

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [73]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.5631436433321091

## K-Fold cross validation for testing the accuracy of LinearRegression model

In [74]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.49010746, 0.50626682, 0.55858976, 0.48469708, 0.46656977])

In [75]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'copy_X': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [76]:
find_best_model_using_gridsearchcv(X,y)

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 1247, in fit
    super().fit(
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\USER\anaconda3\lib\site-packages

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.501246,{'copy_X': True}
1,lasso,0.5015,"{'alpha': 2, 'selection': 'cyclic'}"
2,decision_tree,0.318569,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


## testing the model for few known properties

In [77]:
def predict_price(location,total_sqft,bhk,furnish):
    loc_index = np.where(X.columns==location)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = total_sqft
    x[1] = bhk
    x[2] = furnish
    if loc_index >= 0:
        x[loc_index]=1
    
    return round(lr_clf.predict([x])[0],0)

In [78]:
X.columns

Index(['total_sqft', 'bhk', 'furnish',
       'Aditya Imperial Heights, Hafeezpet, NH', 'Ameerpet, NH', 'Attapur',
       'Bachupally, Outer Ring Road', 'Bandlaguda Jagir', 'Banjara Hills, NH',
       'Begumpet', 'Chandanagar, NH', 'Dr A.S. Rao Nagar',
       'Gachibowli, Outer Ring Road', 'Gajularamaram', 'Hafeezpet, NH',
       'Hitech City', 'Hyder Nagar, NH',
       'Jains Carlton Creek, Gachibowli, Outer Ring Road',
       'Kokapet, Outer Ring Road', 'Kondapur',
       'Kukatpally Housing Board Colony, NH', 'Kukatpally, NH', 'LB Nagar, NH',
       'Manikonda, Outer Ring Road',
       'Mantri Celestia, Gachibowli, Outer Ring Road', 'Masab Tank',
       'Mehdipatnam', 'Miyapur, NH',
       'My Home Avatar, Narsingi, Outer Ring Road',
       'My Home Vihanga, Gachibowli, Outer Ring Road',
       'Narsingi, Outer Ring Road', 'Nizampet', 'Puppalaguda', 'Shaikpet',
       'Suchitra Circle', 'Toli Chowki', 'Upparpally, Hyderabad Expressway'],
      dtype='object')

In [80]:
predict_price('Ameerpet, NH',1000,2,1)



16072.0

## Export the tested model to a pickle file

In [81]:
import pickle

with open('Hyderabd_house_rent_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

## Export location and column information to a json which will be used during deployment

In [82]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))