In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from word2number import w2n

In [3]:
# --- Worked example: predicting home prices in Monroe, New Jersey (USA) ---

# Read the training data from the CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer="homeprices.csv")
df


Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [7]:
import math

# The bedrooms column has a missing (NaN) entry that must be handled before fitting.
# A safe stand-in is the median of the whole column, rounded down with floor so the
# bedroom count stays a whole number.
median_bedrooms = math.floor(df['bedrooms'].median())
median_bedrooms

4

In [8]:
# Preview the bedrooms column with every NaN replaced by the median.
# fillna returns a new Series; the result is only displayed here, not stored in df.
df['bedrooms'].fillna(value=median_bedrooms)

0    3.0
1    4.0
2    4.0
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [9]:
# Store the filled Series back into the bedrooms column of the original frame.
df['bedrooms'] = df['bedrooms'].fillna(value=median_bedrooms)
df

# Real-world data is usually messy, so the workflow is: clean (prepare) the data first,
# then train the machine learning model on the cleaned data.

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [10]:
# Build the linear regression estimator and train it on the cleaned frame.
#   independent variables (features): area, bedrooms, age
#   target variable: price
# df[[...]] with a list of column names selects those columns as a new DataFrame.
reg = linear_model.LinearRegression()
reg.fit(
    X=df[['area', 'bedrooms', 'age']],
    y=df['price'],
)

In [11]:
# Fitted slope for each feature, in the column order passed to fit(): area, bedrooms, age
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [12]:
# Fitted intercept: the constant term added to the weighted sum of the features
reg.intercept_

221323.0018654043

In [13]:
# Find price of home with 3000 sqr ft area, 3 bedrooms, 40 year old.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with the
# same column names. A bare nested list relies on positional order only and makes
# recent scikit-learn versions warn that "X does not have valid feature names".
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['area', 'bedrooms', 'age']))



array([498408.25158031])

In [14]:
# Check the prediction by hand:
#   price = m_area*area + m_bedrooms*bedrooms + m_age*age + intercept
# The slopes and intercept are read from the fitted model instead of being pasted in as
# rounded literals, so the check stays in sync with `reg` if the data or the fit changes.
m_area, m_bedrooms, m_age = reg.coef_
m_area*3000 + m_bedrooms*3 + m_age*40 + reg.intercept_

498408.2515740243

In [15]:
# Find price of home with 2500 sqr ft area, 4 bedrooms, 5 year old.
# The query is a DataFrame carrying the same column names the model was fitted with;
# a bare nested list relies on positional order only and makes recent scikit-learn
# versions warn that "X does not have valid feature names".
reg.predict(pd.DataFrame([[2500, 4, 5]], columns=['area', 'bedrooms', 'age']))



array([578876.03748933])

In [13]:

# ================ Exercise =====================
# In the exercise folder (same level as this notebook on GitHub) there is hiring.csv. This file contains hiring statistics for a firm, such as
# the candidate's experience, written test score and personal interview score. Based on these 3 factors, HR will decide the salary.
# Given this data, you need to build a machine learning model for the HR department that can help them decide salaries for future candidates.
# Using this model, predict salaries for the following candidates:

# 2 yr experience, 9 test score, 6 interview score

# 12 yr experience, 10 test score, 10 interview score

# Answer
# 53713.86 and 93747.79
# ================================================
import pandas as pd
import numpy as np
from sklearn import linear_model
# use to change word into number
from word2number import w2n

# Read the hiring data from the CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer="hiring.csv")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [14]:
# Missing experience entries are replaced by the word "zero", so the column
# consists of number words only.
df['experience'] = df['experience'].fillna(value="zero")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [15]:
# Convert each number word in the experience column (e.g. "five") to its numeric value
# by applying w2n.word_to_num to every entry.
df['experience'] = df['experience'].apply(w2n.word_to_num)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [16]:
import math
# NOTE: despite the variable name, this is the floored *mean* of the column, not its median.
# For the seven non-missing scores in hiring.csv (8, 8, 6, 10, 9, 7, 7) the mean is
# 55/7 (about 7.86), which floors to 7, whereas the median would be 8.
# The reference answers quoted in the exercise text match the predictions made with 7,
# so the computation is intentionally left unchanged.
median_test_score = math.floor(df['test_score(out of 10)'].mean())
median_test_score

7

In [17]:
# fill the NaN value in the test_score column with the floored mean computed above
# (the variable is called median_test_score, but it holds floor(mean), not the median)
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(median_test_score)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [18]:
# Train a fresh linear regression: experience, test score and interview score are the
# features, and salary is the target.
reg = linear_model.LinearRegression()
reg.fit(
    X=df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']],
    y=df['salary($)'],
)

In [19]:
# Candidate: 2 yr experience, 9 test score, 6 interview score.
# The query is a DataFrame carrying the same column names the model was fitted with;
# a bare nested list relies on positional order only and makes recent scikit-learn
# versions warn that "X does not have valid feature names".
reg.predict(pd.DataFrame(
    [[2, 9, 6]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))



array([53713.86677124])

In [20]:
# Candidate: 12 yr experience, 10 test score, 10 interview score.
# The query is a DataFrame carrying the same column names the model was fitted with;
# a bare nested list relies on positional order only and makes recent scikit-learn
# versions warn that "X does not have valid feature names".
reg.predict(pd.DataFrame(
    [[12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))



array([93747.79628651])