In [4]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

We are going to build a model that takes multiple variables like area, bedrooms, and age to give us the price of a home in Monroe, NJ. Then, we are going to predict the price of these homes:

3000 sq ft area, 3 bedrooms, 40 years old

2500 sq ft area, 4 bedrooms, 5 years old

Formula:

price = m1(area) + m2(bedrooms) + m3(age) + b

where m1,m2,m3 are coefficients and b is the intercept



In [12]:
# Read the home-price data set into a DataFrame and display it.
homeprices_path = "/content/homeprices.csv"
df = pd.read_csv(filepath_or_buffer=homeprices_path)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


Let's handle the null value by replacing it with the median of the column, rounded down to a whole number since a bedroom count must be an integer.

In [13]:
# Take the median of the bedrooms column, then round it down to a whole number.
bedrooms_median = df['bedrooms'].median()
median_bedrooms = math.floor(bedrooms_median)
median_bedrooms

4

In [14]:
# Replace the missing bedroom count with the floored median, then show the result.
df['bedrooms'] = df['bedrooms'].fillna(value=median_bedrooms)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [15]:
# Fit a linear regression with area, bedrooms and age as predictors and price as target.
feature_cols = ['area', 'bedrooms', 'age']
reg = linear_model.LinearRegression()
reg.fit(df[feature_cols], df['price'])

In [16]:
# Fitted coefficients (m1, m2, m3), in the order the columns were passed to fit():
# area, bedrooms, age.
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [18]:
# Fitted intercept (the constant term b in the price formula).
reg.intercept_

np.float64(221323.00186540396)

So the equation is

price = 112.06244194(area) + 23388.88007794(bedrooms) - 3231.71790863(age) + 221323.00186540396

Notice the negative coefficient for age. This makes sense: as a house gets older, its price tends to go down.

Let's do our predictions now

In [20]:
# Predict the price of a 3000 sq ft, 3-bedroom, 40-year-old home.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with the
# same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['area', 'bedrooms', 'age']))



array([498408.25158031])

In [21]:
# Predict the price of a 2500 sq ft, 4-bedroom, 5-year-old home.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with the
# same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg.predict(pd.DataFrame([[2500, 4, 5]], columns=['area', 'bedrooms', 'age']))



array([578876.03748933])

Exercise: Given the dataset hiring.csv, build a model and predict the salaries for the following people:


2 yr experience, 9 test score, 6 interview score

12 yr experience, 10 test score, 10 interview score

In [22]:
pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... done
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=f6a25625e6f56e551c695204102a07579e64f1379c3071d0573d77a6ea23d37b
  Stored in directory: /root/.cache/pip/wheels/5b/79/fb/d25928e599c7e11fe4e00d32048cd74933f34a74c633d2aea6
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [34]:
# w2n converts spelled-out numbers into integers; it is imported here rather than in
# the top import cell because the package is only installed in the preceding cell.
from word2number import w2n

# Read the hiring data set into a DataFrame and display it.
hiring_path = "/content/hiring.csv"
df2 = pd.read_csv(filepath_or_buffer=hiring_path)
df2

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [35]:
# Fill missing experience entries with the word "zero" so every row holds a number word.
df2['experience'] = df2['experience'].fillna(value="zero")
df2

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [36]:
# Convert spelled-out experience values (e.g. "five") to integers.
# Only string entries are converted: w2n.word_to_num rejects non-string input, so
# re-running this cell after the column already holds integers would otherwise fail.
df2['experience'] = df2['experience'].apply(
    lambda value: w2n.word_to_num(value) if isinstance(value, str) else value
)
df2

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [37]:
# Impute the missing test score with the median of the test-score column.
score_col = 'test_score(out of 10)'
the_median = df2[score_col].median()
df2[score_col] = df2[score_col].fillna(the_median)
df2

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [39]:
# Fit a linear regression with experience, test score and interview score as
# predictors and salary as target.
salary_features = ['experience', 'test_score(out of 10)', 'interview_score(out of 10)']
reg_salary = linear_model.LinearRegression()
reg_salary.fit(df2[salary_features], df2['salary($)'])

In [40]:
# Fitted coefficients, in the order the columns were passed to fit():
# experience, test score, interview score.
reg_salary.coef_

array([2812.95487627, 1845.70596798, 2205.24017467])

In [41]:
# Fitted intercept (constant term) of the salary model.
reg_salary.intercept_

np.float64(17737.263464337688)

Now, time for our predictions

In [43]:
# Predict the salary for 2 years of experience, test score 9, interview score 6.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with the
# same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg_salary.predict(
    pd.DataFrame(
        [[2, 9, 6]],
        columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
    )
)



array([53205.96797671])

In [44]:
# Predict the salary for 12 years of experience, test score 10, interview score 10.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with the
# same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg_salary.predict(
    pd.DataFrame(
        [[12, 10, 10]],
        columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
    )
)



array([92002.18340611])