# Chennai Real Estate Price Prediction

### Using Python, Linear Regression Model and Angular

##### Importing necessary libraries

In [4]:
import pandas as pd
import math
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

##### Reading the dataset in CSV format

In [5]:
# Raw string literal: keeps every backslash of the Windows path as-is instead of
# relying on unrecognized escape sequences, which newer Python versions warn about.
df1 = pd.read_csv(r"E:\RealEstatePriceProject\dataset\Chennai houseing sale.csv")
df1.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,...,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE
0,P03210,Karapakkam,1004,04-05-2011,131,1.0,1.0,3,AbNormal,Yes,...,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400,7600000
1,P09411,Anna Nagar,1986,19-12-2006,26,2.0,1.0,5,AbNormal,No,...,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049,21717770
2,P01812,Adyar,909,04-02-2012,70,1.0,1.0,3,AbNormal,Yes,...,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114,13159200
3,P05346,Velachery,1855,13-03-2010,14,3.0,2.0,5,Family,No,...,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042,9630290
4,P06210,Karapakkam,1226,05-10-2009,84,1.0,1.0,3,AbNormal,Yes,...,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063,7406250


In [None]:
df1.shape

In [None]:
df1.columns

In [None]:
df1['AREA'].unique()

In [None]:
df1['AREA'].value_counts()

In [None]:
# Columns removed before modelling (each label listed exactly once).
unused_columns = [
    'PRT_ID', 'DIST_MAINROAD', 'N_ROOM', 'DATE_SALE',
    'SALE_COND', 'PARK_FACIL', 'DATE_BUILD',
    'BUILDTYPE', 'UTILITY_AVAIL', 'STREET', 'MZZONE', 'QS_ROOMS',
    'QS_BATHROOM', 'QS_BEDROOM', 'QS_OVERALL', 'REG_FEE', 'COMMIS',
]
df2 = df1.drop(columns=unused_columns)

# Rescale the sale price to units of 100,000 with a single vectorized division
# rather than a Python-level call per row.
df2['SALES_PRICE'] = df2['SALES_PRICE'] / 100000
df2.head()


## Data Cleaning

In [None]:
df2.isnull().sum()

In [None]:
df3 = df2.dropna()
df3.isnull().sum()

## Feature Engineering

In [None]:
# Build the new column with assign() so a fresh frame is produced: writing
# df3['bhk'] = ... directly into the result of dropna() can trigger pandas'
# SettingWithCopyWarning. np.floor(...).astype(int) is the vectorized
# equivalent of int(math.floor(x)) applied row by row.
df3 = df3.assign(bhk=np.floor(df3['N_BEDROOM']).astype(int))
df3.bhk.unique()


In [None]:
# New frame with price per square foot; SALES_PRICE is multiplied back by
# 100000 before dividing by the interior area.
df5 = df3.assign(
    price_per_sqft=lambda frame: frame['SALES_PRICE'] * 100000 / frame['INT_SQFT']
)
df5.head()

In [None]:
df5_stats = df5['price_per_sqft'].describe()
df5_stats

In [None]:
df5.to_csv("bhp.csv",index=False)

In [None]:
# Strip surrounding whitespace from the area names and store the result in the
# AREA column itself. Assigning to `df5.location` only sets a Python attribute
# on the frame: no column is created or updated, so the cleaned names were
# discarded and every later step that reads AREA saw the raw values.
df5['AREA'] = df5['AREA'].str.strip()
location_stats = df5['AREA'].value_counts(ascending=False)
location_stats

### Outlier Reduction

In [None]:
# Preview rows with less than 600 sqft per bedroom, then show the frame's shape.
# display() is required for the preview: a cell only renders its last
# expression, so the bare .head() result was computed and silently dropped.
display(df5[df5.INT_SQFT/df5.bhk<600].head())
df5.shape

In [None]:
df6 = df5[~(df5.INT_SQFT/df5.bhk<600)]
df6.shape

In [None]:
df6.price_per_sqft.describe()


In [None]:
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately for every AREA.

    For each AREA group, a row is kept only when its ``price_per_sqft`` lies in
    the half-open interval ``(mean - std, mean + std]`` of that group, where
    ``std`` is the population standard deviation (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``AREA`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped in AREA order, with a fresh 0..n-1 index.
        If nothing survives (including an empty input) an empty frame with the
        same columns as ``df`` is returned.
    """
    kept_groups = []
    for _, subdf in df.groupby('AREA'):
        prices = subdf.price_per_sqft
        m = prices.mean()
        st = prices.std(ddof=0)  # population std, the same value np.std yields
        reduced_df = subdf[(prices > (m - st)) & (prices <= (m + st))]
        # Skip empty pieces so the final concat never has to reconcile dtypes
        # with empty frames.
        if not reduced_df.empty:
            kept_groups.append(reduced_df)

    if not kept_groups:
        return df.iloc[0:0].reset_index(drop=True)
    # One concat at the end instead of re-copying the accumulated frame on
    # every loop iteration.
    return pd.concat(kept_groups, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape

In [None]:
def plot_scatter_chart(df,location):
    """Scatter INT_SQFT against SALES_PRICE for the 2- and 3-BHK rows of one AREA.

    Rows of ``df`` whose AREA equals ``location`` are split by ``bhk`` (2 and 3)
    and drawn as two series on the current matplotlib figure. Also sets the
    global default figure size to (15, 10).
    """
    in_area = df[df.AREA == location]
    two_bhk = in_area[in_area.bhk == 2]
    three_bhk = in_area[in_area.bhk == 3]

    matplotlib.rcParams['figure.figsize'] = (15,10)

    plt.scatter(two_bhk.INT_SQFT, two_bhk.SALES_PRICE,
                color='blue', label='2 BHK', s=50)
    plt.scatter(three_bhk.INT_SQFT, three_bhk.SALES_PRICE,
                marker='+', color='green', label='3 BHK', s=50)

    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df7,"KK Nagar")

In [None]:
def remove_bhk_outliers(df):
    """Drop homes that are cheaper per sqft than the typical next-smaller home.

    Within each AREA, a row with ``bhk == k`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the ``k - 1``
    bhk homes in the same AREA, provided that smaller group has more than
    5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``AREA``, ``bhk`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the original index labels
        are preserved.
    """
    # Plain list of labels: collecting them in a float np.array would coerce
    # integer index labels to float64 and corrupt labels above 2**53.
    exclude_indices = []
    for _, location_df in df.groupby('AREA'):
        bhk_groups = list(location_df.groupby('bhk'))  # group once, reuse twice
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > 5:
                below_smaller_mean = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                exclude_indices.extend(below_smaller_mean.index)
    return df.drop(index=exclude_indices)
# Remove rows priced per sqft below the mean of the next-smaller bhk group
# in the same AREA (see remove_bhk_outliers), then check how many rows remain.
df8 = remove_bhk_outliers(df7)
df8.shape

In [None]:
plot_scatter_chart(df8,"T Nagar")


In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
df8.N_BATHROOM.unique()

In [None]:
df9 = df8[df8.N_BATHROOM<df8.bhk+2]
df9.shape

In [None]:
df10 = df9.drop(['N_BEDROOM','price_per_sqft'],axis='columns')
df10.head(3)

In [None]:
dummies = pd.get_dummies(df10.AREA)
dummies.head(3)

In [None]:
df11 = pd.concat([df10,dummies],axis='columns')
df11.head()


In [None]:
df12 = df11.drop('AREA',axis='columns')
df12.head(2)

## Model Building 

In [None]:
X = df12.drop(['SALES_PRICE'],axis='columns')
X.head(3)

In [None]:
y = df12.SALES_PRICE
y.head(3)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

In [None]:

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best configuration of each.

    Runs ``GridSearchCV`` (5 shuffled splits, 20% held out, fixed seed) for
    linear regression, lasso and a decision tree, using each estimator's
    default scorer.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``model``, ``best_score`` and
        ``best_params``.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # `normalize` was removed from LinearRegression in
                # scikit-learn 1.2; search over `fit_intercept` instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # 'mse' was renamed to 'squared_error' (scikit-learn 1.0) and
                # the old spelling was removed in 1.2.
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

In [None]:
def predict_price(location, sqft, bhk, bath=None):
    """Predict the sale price of a home with the fitted ``lr_clf`` model.

    The feature row is filled by column *name* rather than by position. The
    training matrix ``X`` is ordered INT_SQFT, N_BATHROOM, bhk, <area dummies>,
    so the earlier positional fill (``x[1] = bhk``) wrote the bedroom count into
    the bathroom column and left ``bhk`` at zero.

    Parameters
    ----------
    location : str
        Area name; must be one of the one-hot area columns of ``X``.
    sqft : float
        Interior area in square feet (``INT_SQFT``).
    bhk : int
        Number of bedrooms (``bhk``).
    bath : int, optional
        Number of bathrooms (``N_BATHROOM``). When omitted it falls back to
        ``bhk``, which is the value the positional version placed in that column.

    Returns
    -------
    float
        Predicted price in the units of the training target
        (``SALES_PRICE`` divided by 100000).

    Raises
    ------
    ValueError
        If ``location`` is not a column of ``X``.
    """
    if location not in X.columns:
        raise ValueError("Unknown location: {!r}".format(location))
    if bath is None:
        bath = bhk

    # Single-row frame carrying the training column names, so the model
    # receives the features in exactly the layout it was fitted on.
    features = pd.DataFrame(0.0, index=[0], columns=X.columns)
    features.loc[0, 'INT_SQFT'] = sqft
    features.loc[0, 'N_BATHROOM'] = bath
    features.loc[0, 'bhk'] = bhk
    features.loc[0, location] = 1

    return lr_clf.predict(features)[0]

In [None]:

# predict_price takes (location, sqft, bhk); the extra fourth positional
# argument raised a TypeError. Leaving the call as the cell's last expression
# also displays the prediction instead of hiding it in a throwaway variable.
predict_price('KK Nagar', 1000, 2)

In [3]:
predict_price('Chrompet',500,1)


NameError: name 'predict_price' is not defined

In [None]:
predict_price('T Nagar',1500, 4)


In [None]:
predict_price('Anna Nagar',500,2)


In [None]:
predict_price('Karapakkam',1000,2)

In [None]:
import pickle
with open('chennai_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [None]:
import json
# Persist the lower-cased feature names, in training order, next to the model.
columns = {}
columns['data_columns'] = [name.lower() for name in X.columns]

with open("columns.json","w") as out_file:
    json.dump(columns, out_file)