In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

### __Data Load: Load banglore home prices into a dataframe__

In [2]:
data=pd.read_csv(r"C:\Users\guptdraj\OneDrive - TietoEVRY\Desktop\Python notebook\Bengaluru_House_Data.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\guptdraj\\OneDrive - TietoEVRY\\Desktop\\Python notebook\\Bengaluru_House_Data.csv'

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.groupby('area_type')['area_type'].agg('count')

__Drop features that are not required to build our model__

In [None]:
data1=data.drop(['area_type','availability','society','balcony'],axis='columns')

In [None]:
data1.head()

__DATA CLEANING PROCESS__

In [None]:
data1.isnull().sum()

Since the dataset has large number of rows and null values are very less we can drop these rows.

In [None]:
data2 = data1.dropna()
data2.isnull().sum()

In [None]:
data2.shape

In [None]:
data2['size'].unique()

### __Feature Engineering__

__Add new feature(integer) for bhk (Bedrooms Hall Kitchen)__

In [None]:
data2['bhk'] = data2['size'].apply(lambda x: int(x.split(' ')[0]))

In [None]:
data2.head()

In [None]:
data2['bhk'].unique()

In [None]:
data2[data2.bhk>20]

__Explore total_sqft feature__

data2.total_sqft.unique()

In [None]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [None]:
data2[~data2['total_sqft'].apply(is_float)].head(10)

__Above shows that total_sqft can be a range (e.g. 2100-2850). For such case we can just take average of min and max value in the range. There are other cases such as 34.46Sq. Meter which one can convert to square ft using unit conversion. I am going to just drop such corner cases to keep things simple__

In [None]:
def convert_sqft_to_num(x):
    tokens=x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

__For below row, it shows total_sqft as 2475 which is an average of the range 2100-2850__

In [None]:
convert_sqft_to_num('2100 - 2850')

In [None]:
data3 = data2.copy()
data3['total_sqft'] = data3['total_sqft'].apply(convert_sqft_to_num)
data3.head(5)

In [None]:
data3.loc[30]

__FEATURE ENGINEERING__

In [None]:
data3.head()

__Add new feature called price per square feet__

In [None]:
data4 = data3.copy()
data4['price_per_sqft'] = data4['price']*100000/data4['total_sqft']
data4.head()

In [None]:
len(data4['location'].unique())

__Examine locations which is a categorical variable. We need to apply dimensionality reduction technique here to reduce number of locations__

In [None]:
data4.location = data4.location.apply(lambda x:x.strip())

location_stats = data4.groupby('location')['location'].agg('count').sort_values(ascending = False)
location_stats

In [None]:
len(location_stats[location_stats<=10])

### __Dimensionality Reduction__

__Any location having less than 10 data points should be tagged as "other" location. This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns__

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
data4.location = data4.location.apply(lambda x: 'other'if x in location_stats_less_than_10 else x)
len(data4.location.unique())

In [None]:
data4.head(10)

### __Outlier Removal Using Business Logic__

__As a data scientist when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft. If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft__

In [None]:
data4[data4.total_sqft/data4.bhk<300].head()

__Check above data points. We have 6 bhk apartment with 1020 sqft. Another one is 8 bhk and total sqft is 600. These are clear data errors that can be removed safely__

In [None]:
data5 = data4[~(data4.total_sqft/data4.bhk<300)]
data5.shape

### __Outlier Removal Using Standard Deviation and Mean__

In [None]:
data5.price_per_sqft.describe()

__Here we find that min price per sqft is 267 rs/sqft whereas max is 176470, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation__

In [None]:
def remove_pps_outliers(df):
    
    df_out=pd.DataFrame()
    for key, subdf in df.groupby ('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft> (m-st))&(subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out, reduced_df],ignore_index=True)
    return df_out

data6=remove_pps_outliers(data5)
data6.shape

__Let's check if for a given location how does the 2 BHK and 3 BHK property prices look like__

In [None]:
def plot_scatter_chart(df, location):
    bhk2=df[(df.location==location)&(df.bhk==2)]
    bhk3=df[(df.location==location)&(df. bhk==3)]
    matplotlib.rcParams['figure.figsize']=(15,18)
    plt.scatter(bhk2.total_sqft, bhk2.price,color='blue',label='2 BHK', s=50)
    plt.scatter(bhk3. total_sqft, bhk3.price, marker ='+', color='green',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.title(location)
    plt.legend()
                            
plot_scatter_chart(data6, "Rajaji Nagar")

In [None]:
plot_scatter_chart(data6, "Hebbal")

__We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.__

{
    '1' : {
        'mean': 4000,
        'std: 2000,
        'count': 34
    },
    '2' : {
        'mean': 4300,
        'std: 2300,
        'count': 22
    },    
}


__Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment__

In [None]:
def remove_bhk_outliers(df):
    exclude_indices=np.array([])
    for location, location_df in df.groupby ('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
             bhk_stats[bhk]={
                  'mean': np.mean (bhk_df.price_per_sqft),
                  'std': np.std (bhk_df.price_per_sqft),
                  'count': bhk_df.shape[0]
             }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                 exclude_indices=np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices, axis='index')
                                                                   
data7=remove_bhk_outliers(data6)
data7. shape

__Plot same scatter chart again to visualize price_per_sqft for 2 BHK and 3 BHK properties__

In [None]:
plot_scatter_chart(data7, "Hebbal")

In [None]:
plot_scatter_chart(data7, "Rajaji Nagar")

__Based on above charts we can see that data points which are outliers are removed due to remove_bhk_outliers function__

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20, 10)
plt.hist(data7.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")                       

### __Outlier Removal Using Bathrooms Feature__

In [None]:
data7.bath.unique()

In [None]:
data7[data7.bath>10]

__It is unusual to have 2 more bathrooms than number of bedrooms in a home__

In [None]:
data7[data7.bath>data7.bhk+2]

__If you have 4 bedroom home and even if you have bathroom in all 4 rooms plus one guest bathroom, you will have total bath = total bed + 1 max. Anything above that is an outlier or a data error and can be removed__

In [None]:
data8 = data7[data7.bath<data7.bhk+2]
data8.shape

__Price_per_sqft was used for outlier detection only, so can be dropped. Also size can be dropped as we have bhk.__

In [None]:
data9 = data8.drop(['size','price_per_sqft'],axis='columns')
data9.head(10)

## __Use One Hot Encoding For Location__

In [None]:
dummies = pd.get_dummies(data9.location)
dummies.head(3)

In [None]:
data10 = pd.concat([data9,dummies.drop('other',axis='columns')],axis='columns')
data10.head()

In [None]:
data11 = data10.drop('location',axis='columns')
data11.head(2)

### __Build a Model Now...__

In [None]:
data11.shape

In [None]:
X = data11.drop(['price'],axis='columns')
X.head(3)

In [None]:
X.shape

In [None]:
y = data11.price
y.head(3)

In [None]:
len(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

### __Use K Fold cross validation to measure accuracy of our LinearRegression model__

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

__We can see that in 5 iterations we get a score above 80% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose__

### __Find best model using GridSearchCV__

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

__Based on above results we can say that LinearRegression gives the best score. Hence we will use that.__

### Test the model for few properties

In [None]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

In [None]:
predict_price('Indira Nagar',1000, 3, 3)

### __Export the tested model to a pickle file__

In [None]:
import pickle
with open('home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

### __Export location and column information to a file that will be useful later on in our prediction application__

In [None]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))