## Importing Libraries

In [276]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [183]:
df=pd.read_csv("/content/Housing (1).csv")

In [184]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## Checking the values of the Categories & Data Pre-processing (One Hot Encoding)

In [185]:
df.mainroad.value_counts()

yes    468
no      77
Name: mainroad, dtype: int64

In [186]:
df['mainroad']=df['mainroad'].apply({'yes':0,'no':1}.get)

In [187]:
df.guestroom.value_counts()

no     448
yes     97
Name: guestroom, dtype: int64

In [188]:
df['guestroom']=df['guestroom'].apply({'yes':0,'no':1}.get)

In [189]:
df.basement.value_counts()

no     354
yes    191
Name: basement, dtype: int64

In [190]:
df['basement']=df['basement'].apply({'yes':0,'no':1}.get)

In [191]:
df.hotwaterheating.value_counts()

no     520
yes     25
Name: hotwaterheating, dtype: int64

In [192]:
df['hotwaterheating']=df['hotwaterheating'].apply({'yes':0,'no':1}.get)

In [193]:
df.airconditioning.value_counts()

no     373
yes    172
Name: airconditioning, dtype: int64

In [194]:
df['airconditioning']=df['airconditioning'].apply({'yes':0,'no':1}.get)

In [195]:
df.prefarea.value_counts()

no     417
yes    128
Name: prefarea, dtype: int64

In [196]:
df['prefarea']=df['prefarea'].apply({'yes':0,'no':1}.get)

In [197]:
df.furnishingstatus.value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [198]:
df['furnishingstatus']=df['furnishingstatus'].apply({'semi-furnished':10,'unfurnished':20,'furnished':30}.get)

In [199]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,0,1,1,1,0,2,0,30
1,12250000,8960,4,4,4,0,1,1,1,0,3,1,30
2,12250000,9960,3,2,2,0,1,0,1,1,2,0,10
3,12215000,7500,4,2,2,0,1,0,1,0,3,0,30
4,11410000,7420,4,1,2,0,0,0,1,0,2,1,30


## Data Pre-processing - Splitting data in Dependent & Indepedent Variables

In [200]:
x = df[['area','bedrooms','bathrooms','stories','furnishingstatus','airconditioning','hotwaterheating','parking','basement']]

In [201]:
z = df[['area','bedrooms','bathrooms','stories','furnishingstatus','airconditioning','hotwaterheating','parking',
        'basement', 'mainroad','guestroom','prefarea']]
#This variable includes all the available fields in the data except Price to check how much impact would addition of following 
#three components will have on the accuracy - Mainroad, Guestroom and Prefarea



In [202]:
y = df['price']

In [203]:
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,furnishingstatus,airconditioning,hotwaterheating,parking,basement
0,7420,4,2,3,30,0,1,2,1
1,8960,4,4,4,30,0,1,3,1
2,9960,3,2,2,10,1,1,2,0
3,7500,4,2,2,30,0,1,3,0
4,7420,4,1,2,30,0,1,2,0


In [204]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [205]:
z.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,furnishingstatus,airconditioning,hotwaterheating,parking,basement,mainroad,guestroom,prefarea
0,7420,4,2,3,30,0,1,2,1,0,1,0
1,8960,4,4,4,30,0,1,3,1,0,1,1
2,9960,3,2,2,10,1,1,2,0,0,1,0
3,7500,4,2,2,30,0,1,3,0,0,1,0
4,7420,4,1,2,30,0,1,2,0,0,0,1


## Data Pre-processing - Splitting data into Training & Testing groups

In [266]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.10)

In [267]:
len(x_train)

490

In [268]:
len(y_train)

490

In [269]:
len(x_test)

55

In [270]:
len(y_test)

55

## Model - Linear Regression


In [271]:
model=LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [272]:
pred=model.predict(x_test)

In [273]:
pred[0:5]

array([2812850.36875398, 6793045.46068849, 2671773.78684138,
       2535754.23833441, 5505745.0678785 ])

In [274]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [275]:
model.score(x,y)

0.6366613509917765

## 0.6366613509917765, i.e. about 63.6% comes out to be the accuracy outcome at 10% data distribution with 90% for training. (Note: Tested that as high the training data was fed to the system to learn, the accuracy increased but marginally. At 99% training dataset, accuracy came out to be 63.9%) 

## Z_train dataset includes the data which was missing in x_train (Mainroad, Guestroom and Prefarea). However, the addition of this data also did not, significantly, improved the accuracy.

## The accuracy, post addition of these three components, came out to be 67.1% at 90% training data. 

In [262]:
z_train,z_test,y_train,y_test = train_test_split(z,y, test_size=0.10)

In [263]:
model=LinearRegression()
model.fit(z_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [264]:
pred=model.predict(z_test)

In [265]:
model.score(z,y)

0.671882539277509