In [48]:
# Mount Google Drive inside the Colab runtime; the CSV paths used by the
# later cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [50]:
# Load the training data (area, bedrooms, age, price) and display it in full;
# the frame is only a handful of rows.
HOMEPRICES_CSV = '/content/drive/MyDrive/DATASETS/homeprices2.csv'
df = pd.read_csv(HOMEPRICES_CSV)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


### ***Data Preprocessing: Fill NA values with the median value of the column***

In [51]:
# Impute missing bedroom counts with the column median so that every row can
# be used when fitting the model.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


### ***Linear Regression***

In [52]:
# Fit an ordinary least-squares model: every column except `price` is a
# feature, and `price` is the target.
X_train = df.drop(columns='price')
y_train = df['price']
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()

In [53]:
# Learned coefficient for each feature, in the column order passed to fit().
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [54]:
# Learned intercept: the model's prediction when every feature is 0.
reg.intercept_

221323.00186540396

### ***Find the price of a home with 2500 sq ft area, 4 bedrooms, 5 years old***

In [55]:
import warnings
warnings.filterwarnings('ignore')

In [56]:
# Predict the price of a 2500 sq ft, 4-bedroom, 5-year-old home.
# A one-row DataFrame carries the same column names the model was fitted
# with, so the inputs are checked by name instead of relying on list position
# and scikit-learn does not emit its feature-name warning.
query_home = pd.DataFrame({'area': [2500], 'bedrooms': [4], 'age': [5]})
reg.predict(query_home)

array([578876.03748933])

### ***Find the price of a home with 3000 sq ft area, 3 bedrooms, 40 years old***

In [57]:
# Predict the price of a 3000 sq ft, 3-bedroom, 40-year-old home.
# A one-row DataFrame carries the same column names the model was fitted
# with, so the inputs are checked by name instead of relying on list position
# and scikit-learn does not emit its feature-name warning.
query_home = pd.DataFrame({'area': [3000], 'bedrooms': [3], 'age': [40]})
reg.predict(query_home)

array([498408.25158031])

### ***Generate a CSV file with a list of home price predictions***

In [58]:
# Load the homes to be priced; this file provides only the `area` column.
AREAS_CSV = "/content/drive/MyDrive/DATASETS/areas.csv"
area_df = pd.read_csv(AREAS_CSV)
area_df.head()

Unnamed: 0,area
0,1000
1,1500
2,2300
3,3540
4,4120


In [59]:
# Add placeholder `bedrooms` and `age` columns before prediction: the areas
# file only provides `area`, but the model was fitted on three features.
#
# The values are synthetic. A seeded generator keeps them -- and therefore the
# predictions and the exported CSV -- identical on every Restart & Run All;
# the unseeded np.random.randint calls produced different data on each run.
rng = np.random.default_rng(42)
n_homes = len(area_df)

area_df['bedrooms'] = rng.integers(1, 5, size=n_homes)  # 1..4 (upper bound exclusive)
area_df['age'] = rng.integers(5, 25, size=n_homes)      # 5..24 (upper bound exclusive)
area_df

Unnamed: 0,area,bedrooms,age
0,1000,1,7
1,1500,1,21
2,2300,4,7
3,3540,2,24
4,4120,3,23
5,4560,2,11
6,5490,1,22
7,3460,1,22
8,4750,1,13
9,2300,1,10


In [60]:
# Predict a price for every row, passing only the model's feature columns.
# Handing the whole frame to predict() works on the first run only: once the
# `Prices` column has been added further down, re-running this cell would
# feed an extra column to a three-feature model and fail.
feature_columns = ['area', 'bedrooms', 'age']
p = reg.predict(area_df[feature_columns])
p

array([ 334152.29852505,  344939.46877525,  550000.11328364,
        587240.57668925,  678857.39100226,  743556.60028245,
        788836.89421574,  561350.1370732 ,  734996.14835625,
        470138.31932392, 1267734.4404921 , 1192290.29310254,
        966025.70783394])

In [61]:
# Attach the predictions as a new column; `p` was computed from area_df, so
# it has one value per row in the same order.
area_df['Prices'] = p
area_df

Unnamed: 0,area,bedrooms,age,Prices
0,1000,1,7,334152.3
1,1500,1,21,344939.5
2,2300,4,7,550000.1
3,3540,2,24,587240.6
4,4120,3,23,678857.4
5,4560,2,11,743556.6
6,5490,1,22,788836.9
7,3460,1,22,561350.1
8,4750,1,13,734996.1
9,2300,1,10,470138.3


In [62]:
# Drop the fractional part of each predicted price. Note this is a floor
# (always rounds down), not rounding to the nearest whole number.
area_df['Prices'] = np.floor(area_df['Prices'])
area_df

Unnamed: 0,area,bedrooms,age,Prices
0,1000,1,7,334152.0
1,1500,1,21,344939.0
2,2300,4,7,550000.0
3,3540,2,24,587240.0
4,4120,3,23,678857.0
5,4560,2,11,743556.0
6,5490,1,22,788836.0
7,3460,1,22,561350.0
8,4750,1,13,734996.0
9,2300,1,10,470138.0


In [64]:
# Write the predictions to Drive; index=False leaves the row index out of
# the file.
PREDICTIONS_CSV = "/content/drive/MyDrive/DATASETS/Homeprices_Canada_prediction.csv"
area_df.to_csv(PREDICTIONS_CSV, index=False)