In [None]:
# y = mx + b
# for multivariate regression, the formula is used in this way below
# dep_var = m1.x1 + m2.x2 + m3.x3 + b
# here, our prediction is based on three factors
# independent variables are also called features

In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model

In [13]:
homeprices =  {
    'area' : [2600, 3000, 3200, 3600, 4000],
    'bedrooms' : [3, 4, np.nan, 3, 5],
    'age': [20, 15, 18, 30, 8],
    'price' : [550000, 565000, 610000, 595000, 760000]
}
homeprices

{'area': [2600, 3000, 3200, 3600, 4000],
 'bedrooms': [3, 4, nan, 3, 5],
 'age': [20, 15, 18, 30, 8],
 'price': [550000, 565000, 610000, 595000, 760000]}

In [14]:
homeprices = pd.DataFrame(homeprices)
homeprices

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [17]:
# fill the missing value. Using the median of the ccolumn will be a safe assumption
import math
median_bedrooms = math.floor(homeprices.bedrooms.median())
median_bedrooms

3

In [22]:
homeprices.bedrooms = homeprices.bedrooms.fillna(median_bedrooms)
homeprices

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,3.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [23]:
reg = linear_model.LinearRegression()
reg.fit(homeprices[['area', 'bedrooms', 'age']], homeprices.price)

In [24]:
reg.predict([[4500, 6.0, 2]])



array([831550.])

In [25]:
reg.coef_

array([   137.25, -26025.  ,  -6825.  ])

In [26]:
reg.intercept_

383725.00000000006

In [27]:
137.25 * 4500 + -26025. * 6.0 + -6825. * 2 + 383725.00000000006

831550.0

# EXERCISE

In [32]:
interview = {
    'experience' : [np.nan, np.nan, 'five', 'two', 'seven', 'three', 'ten', 'eleven'],
    'test_score(out of 10)' : [8, 8, 6, 10, 9, 7, np.nan, 7],
    'interview_score(out of 10)' : [9, 6, 7, 10, 6, 10, 7, 8],
    'salary($)' : [50000, 45000, 60000, 65000, 70000, 62000, 72000, 80000]
}
interview

{'experience': [nan, nan, 'five', 'two', 'seven', 'three', 'ten', 'eleven'],
 'test_score(out of 10)': [8, 8, 6, 10, 9, 7, nan, 7],
 'interview_score(out of 10)': [9, 6, 7, 10, 6, 10, 7, 8],
 'salary($)': [50000, 45000, 60000, 65000, 70000, 62000, 72000, 80000]}

In [33]:
interview = pd.DataFrame(interview)
interview

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [51]:
interview.experience.fillna("zero")

0       nan
1       nan
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [49]:
interview

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [41]:
from word2number import w2n
interview.experience = w2n.word_to_num(interview['experience'])
interview

ValueError: Type of input is not string! Please enter a valid number word (eg. 'two million twenty three thousand and forty nine')