## Importing Libraries

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore')

: 

## Loading Dataset

In [None]:
data = pd.read_csv("House Pridiction Data.csv")

: 

## Analyzing Data

In [None]:
data.info()

: 

In [None]:
data.head()

: 

In [None]:
data.describe()

: 

## Data Cleaning

In [None]:
# Drop the columns that are not used further in this notebook.
# `errors='ignore'` keeps the cell re-runnable: `data` is overwritten here, so a
# second execution would otherwise raise KeyError for the already-removed columns.
data=data.drop(columns=['area_type','availability','balcony','society'],errors='ignore')
# Show a preview instead of dumping the whole frame.
data.head()

: 

In [None]:
data.isna().sum()

: 

In [None]:
data=data.dropna()

: 

In [None]:
data.isna().sum()

: 

In [None]:
data.shape

: 

In [None]:
data['size'].unique()

: 

## Feature Engineering

In [None]:
# Derive the integer `BHK` column from the first space-separated token of `size`
# (a value such as "2 BHK" would give 2).
# `assign` returns a new frame, so no column is set on the object returned by
# `dropna()`, which is the pattern that triggers pandas' SettingWithCopyWarning.
data=data.assign(BHK=data['size'].apply(lambda x: int(x.split(' ')[0])))

: 

In [None]:
data.head()

: 

In [None]:
data['BHK'].unique()

: 

In [None]:
data[data.BHK>20]

: 

In [None]:
data.total_sqft.unique()

: 

In [None]:
def isfloat(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only conversion failures are treated as "not a float": ``ValueError``
    (unparsable string), ``TypeError`` (unsupported type such as ``None``) and
    ``OverflowError`` (integer too large). Unrelated exceptions such as
    ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

: 

In [None]:
data[~data['total_sqft'].apply(isfloat)].head(10)

: 

In [None]:
def convert_sqft_tonum(x):
    """Convert a square-footage value to a float.

    * A string of the form ``"a-b"`` where both sides parse as numbers is
      treated as a range and the midpoint ``(a + b) / 2`` is returned.
    * Anything else is passed to ``float()``.
    * Values that cannot be converted return ``None``.

    Unlike a plain ``x.split('-')`` followed by ``float()`` on each half, this
    does not raise for strings that contain one dash but are not a numeric
    range (``"-5"``, ``"1e-3"``, ``"a-b"``) or for non-string input.
    """
    if isinstance(x, str):
        token = x.split('-')
        if len(token) == 2:
            try:
                return (float(token[0]) + float(token[1])) / 2
            except ValueError:
                # Not a numeric range; fall through to the plain conversion.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

: 

In [None]:
data=data.copy()
data['total_sqft']=data['total_sqft'].apply(convert_sqft_tonum)

: 

In [None]:
data.head(10)

: 

In [None]:
data.loc[30]

: 

In [None]:
# Work on a copy so `data` keeps its current columns.
data1=data.copy()
# price_per_sqft = price * 1,000,000 / total_sqft.
# NOTE(review): the multiplier assumes `price` is expressed in millions; the
# dataset's price unit is not visible in this notebook, so confirm it. The later
# outlier filters compare this column only against per-group mean/std, and the
# column is dropped before modelling, so a constant scale factor changes the
# displayed values but not which rows are kept.
data1['price_per_sqft']=data1['price']*1000000/data1['total_sqft']
data1.head()

: 

In [None]:
len(data1.location.unique())

: 

In [None]:
data1.location=data1.location.apply(lambda x: x.strip())
location_stats=data1.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

: 

In [None]:
len(location_stats[location_stats<=10])

: 

In [None]:
locationlessthan10=location_stats[location_stats<=10]
locationlessthan10

: 

In [None]:
len(data1.location.unique())

: 

In [None]:
# Replace every location listed in `locationlessthan10` (10 or fewer rows) with
# the single label 'other'. The location names are the index labels of that
# Series, so membership is tested against its index.
data1['location']=data1['location'].apply(
    lambda loc: loc if loc not in locationlessthan10.index else 'other'
)
# Number of distinct locations after bucketing.
len(data1['location'].unique())

: 

In [None]:
data1.head(10)

: 

In [None]:
data1[data1.total_sqft/data1.BHK<300].head()

: 

In [None]:
data2=data1[~(data1.total_sqft/data1.BHK<300)]
data2.head(10)

: 

In [None]:
data2.shape

: 

## Removing Outliers

In [None]:
data2["price_per_sqft"].describe().apply(lambda x:format(x,'f'))

: 

In [None]:
def remove_pps_outliers(df):
    """Keep, per location, only rows whose price_per_sqft is within one std of the mean.

    For each ``location`` group the mean ``m`` and standard deviation ``st`` of
    ``price_per_sqft`` are computed with ``np.mean`` / ``np.std``; rows with
    ``m - st < price_per_sqft < m + st`` (strict on both sides) are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all groups, concatenated in group order with a
        fresh 0..n-1 index. If nothing survives, an empty frame with the same
        columns as ``df`` is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft < (m + st))
        reduced_df = subdf[within]
        if not reduced_df.empty:
            kept.append(reduced_df)
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)
data3=remove_pps_outliers(data2)
data3.shape

: 

In [None]:
def plot_scatter_chart(df,location):
    """Scatter price against total_sqft for the 2-BHK and 3-BHK rows of one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``BHK``, ``total_sqft`` and ``price``.
    location : str
        Value of ``location`` to plot; also used as the chart title.

    The chart is drawn on its own 15x10 figure, so the global
    ``plt.rcParams['figure.figsize']`` is not modified by calling this function.
    """
    at_location = df.location == location
    bhk2 = df[at_location & (df.BHK == 2)]
    bhk3 = df[at_location & (df.BHK == 3)]
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.scatter(bhk2.total_sqft, bhk2.price, color='Blue', label='2 BHK', s=50)
    ax.scatter(bhk3.total_sqft, bhk3.price, color='green', marker='+', label='3 BHK', s=50)
    ax.set_xlabel('Total Square Foot')
    ax.set_ylabel('Price')
    ax.set_title(location)
    ax.legend()
plot_scatter_chart(data3,"Rajaji Nagar")

: 

In [None]:
def remove_bhk_outliers(df, min_count=5):
    """Drop rows priced per sqft below the mean of the next-smaller BHK in the same location.

    Within each ``location``, for every ``BHK`` value the mean ``price_per_sqft``
    and row count are recorded. A row with ``BHK == n`` is dropped when the
    ``BHK == n - 1`` group of the same location has more than ``min_count`` rows
    and the row's ``price_per_sqft`` is below that group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``BHK`` and ``price_per_sqft``.
    min_count : int, default 5
        The ``BHK - 1`` group is used as a reference only if it has strictly
        more rows than this.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded index labels; the original index is kept.
    """
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse the groups for both the stats and the filtering pass.
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            reference = bhk_stats.get(bhk - 1)
            if reference is not None and reference['count'] > min_count:
                below_reference = bhk_df.price_per_sqft < reference['mean']
                # Collect labels in a list so they keep their original type
                # (no coercion to float as with np.append on a float array).
                exclude_labels.extend(bhk_df.index[below_reference])
    return df.drop(index=exclude_labels)

data4=remove_bhk_outliers(data3)
data4.shape

: 

In [None]:
plot_scatter_chart(data4,"Rajaji Nagar")

: 

In [None]:
# Histogram of price_per_sqft in data4.
plt.rcParams['figure.figsize']=(20,15)
plt.hist(data4.price_per_sqft,rwidth=0.6)
plt.xlabel("Price Per Square Foot")
plt.ylabel("Count");  # trailing ';' hides the Text(...) repr in the cell output

: 

In [None]:
data4.bath.unique()

: 

In [None]:
data4[data4.bath>10]

: 

In [None]:
# Histogram of the `bath` column in data4.
plt.rcParams['figure.figsize']=(20,15)
plt.hist(data4.bath,rwidth=0.6)
plt.xlabel("Number of Bathrooms")
plt.ylabel("Count");  # trailing ';' hides the Text(...) repr in the cell output

: 

In [None]:
data4[data4.bath>data4.BHK+2]

: 

In [None]:
# Keep only rows with strictly fewer than BHK + 2 bathrooms.
# NOTE(review): this also removes rows where bath == BHK + 2, whereas the
# inspection cell just before this one lists only bath > BHK + 2 — confirm which
# boundary is intended.
data5=data4[data4.bath<data4.BHK+2]
data5.shape

: 

In [None]:
data6=data5.drop(['size','price_per_sqft'],axis='columns')
data6

: 

In [None]:
dummies=pd.get_dummies(data6.location)
dummies.head(10)

: 

In [None]:
data7=pd.concat([data6,dummies.drop('other',axis='columns')],axis='columns')
data7.head()

: 

In [None]:
data8=data7.drop('location',axis='columns')
data8.head()

: 

In [None]:
data8.shape

: 

In [None]:
X=data8.drop('price',axis='columns')
X.head()

: 

## Building Linear Regression Model

In [None]:
y=data8.price

: 

In [None]:
# Hold out 20% of the rows for evaluation. `train_test_split` is already imported
# in the first cell. A fixed `random_state` makes the split, and therefore the
# score computed from it, reproducible across Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

: 

In [None]:
# `LinearRegression` is already imported in the first cell.
# Fit on the training split, then report the score on the held-out split
# (for regressors, `score` is the coefficient of determination R^2).
model=LinearRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

: 

## Check the accuracy of the model using ShuffleSplit cross-validation

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

: 

## Testing the model on a few sample inputs

In [None]:
def price_predict(location,sqft,bath,BHK):
    """Predict a price with the fitted global ``model`` for one hypothetical property.

    A feature vector with one entry per column of the global ``X`` is built:
    positions 0, 1 and 2 receive ``sqft``, ``bath`` and ``BHK``, and the column
    whose name equals ``location`` (if any) is set to 1.

    Parameters
    ----------
    location : str
        Name of a location column in ``X``. If no column has this name, all
        location entries stay 0 instead of raising ``IndexError``.
    sqft, bath, BHK : numeric
        Values written to the first three feature positions.

    Returns
    -------
    The first element of ``model.predict`` for the constructed row.
    """
    x=np.zeros(len(X.columns))
    x[0]=sqft
    x[1]=bath
    x[2]=BHK
    # np.where returns an empty array when the location has no dummy column, so
    # check the size before indexing rather than comparing an index to 0.
    matches=np.where(X.columns==location)[0]
    if matches.size>0:
        x[matches[0]]=1
    return model.predict([x])[0]

: 

In [None]:
price_predict('1st Phase JP Nagar',1000,2,2)

: 

In [None]:
price_predict('1st Phase JP Nagar',1000,2,3)

: 

In [None]:
price_predict('5th Phase JP Nagar',1000,2,2)

: 

In [None]:
price_predict('Indira Nagar',1000,2,2)

: 