In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [2]:
# Load the home-price data; the preview below shows town, area and price columns.
df = pd.read_csv("files/homeprices_6.csv")
# Preview the last 10 rows.
df.tail(10)

Unnamed: 0,town,area,price
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000
10,robinsville,2900,600000
11,robinsville,3100,620000
12,robinsville,3600,695000


In [3]:
# Create dummy (one-hot) variables for the town column: one indicator column per town.
# dtype=int keeps the indicators as 1/0; pandas >= 2.0 would otherwise return
# boolean True/False columns.
dummies = pd.get_dummies(df["town"], dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [4]:
# How does one-hot encoding work?
# You create a new column for each category and assign
# a binary (0/1) value to each column.

In [5]:
# Append the dummy columns to the right of the original frame.
merged = pd.concat((df, dummies), axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [6]:
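# Drop one dummy column to avoid the dummy variable trap (see 6:50 in https://www.youtube.com/watch?v=9yl6-HEY7_s&list=PLeo1K3hjS3uvCeTYTeyfe0-rN5r8zn9rw&index=6).
# The dropped column can always be derived from the others: a row with 0 in
# every remaining dummy column must belong to the dropped category.
# For more background, search for "dummy variable trap".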

In [7]:
# Remove the raw text column and one redundant dummy column;
# "west windsor" becomes the baseline category.
final = merged.drop(columns=["town", "west windsor"])

In [8]:
# Display the model-ready frame: area, price and the two remaining dummy columns.
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [9]:
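# LinearRegression does not remove the redundant dummy column for you. It can
# still be fitted with every dummy column present, but the coefficients are then
# not uniquely determined, so it is good practice to drop one column yourself first.
# LinearRegression can also be imported directly like this: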
from sklearn.linear_model import LinearRegression

In [10]:
# Linear regression model for the get_dummies-based features.
model = LinearRegression()

In [11]:
# Features: every column except the target (price is y here).
x = final.drop(columns="price")

In [12]:
# Target: the price column.
y = final["price"]

In [13]:
# Fit the model on area plus the two remaining dummy columns.
model.fit(x , y)

LinearRegression()

In [14]:
# Predict the price of a robinsville home with area 2800.
# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# names: this makes the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
model.predict(
    pd.DataFrame(
        [[2800, 0, 1]],
        columns=["area", "monroe township", "robinsville"],
    )
)



array([590775.63964739])

In [15]:
# To predict the price of a west windsor home, set both dummy columns to 0:
# west windsor is the dropped (baseline) category, so it is represented by
# all-zero dummies.
# A DataFrame with the training column names makes the feature order explicit
# and avoids scikit-learn's "X does not have valid feature names" warning.
model.predict(
    pd.DataFrame(
        [[2800, 0, 0]],
        columns=["area", "monroe township", "robinsville"],
    )
)



array([605103.20361213])

In [16]:
# Goodness of fit of the model.
# score() returns the coefficient of determination (R^2), not a classification
# accuracy; here it comes out at about 0.957.
# How is it calculated? The model predicts values for x and those predictions
# are compared with the actual values in y.
# Note: this is measured on the same data the model was trained on.
model.score(x,y)

0.9573929037221872

In [17]:
# Label encoding for the town column: each distinct town name is mapped to an integer code.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [18]:
# Work on an independent copy: a plain `dfle = df` only creates a second name
# for the same DataFrame, so any later change to dfle would also change df.
dfle = df.copy()
dfle.head(5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [22]:
# Take an independent copy so that label-encoding the town column does not
# overwrite the town names in the original df (a plain `dfle = df` is only an alias).
dfle = df.copy()
# Label-encode the town column: each town name becomes an integer code.
dfle["town"] = le.fit_transform(dfle["town"])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [23]:
# Check the first rows: town now holds integer codes instead of names.
dfle.head(5)

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000


In [19]:
# Convert to a NumPy array because we want x to be an array, not a DataFrame.
x = dfle[["town", "area"]].to_numpy()
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [20]:
# Target values: the price column.
y = dfle["price"]
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [21]:
# Create the dummy-variable (one-hot) columns with scikit-learn.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode only column 0 (the label-encoded town); remainder='passthrough'
# keeps the other column (area) unchanged.
ct = ColumnTransformer([("town", OneHotEncoder(), [0])], remainder = 'passthrough')
# NOTE(review): x is overwritten with its encoded version, so re-running this
# cell without rebuilding x first would encode the already-encoded array.
x = ct.fit_transform(x) # only column 0 (town) is treated as categorical, as specified by [0] above

In [22]:
x
# The encoder created 3 dummy-variable columns (the first 3 columns);
# the 4th column is the area.

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [23]:
# Take all rows and every column except the first one, i.e. drop one town
# dummy column to avoid the dummy variable trap.
# NOTE(review): x is overwritten in place, so re-running this cell would drop
# another column.
x = x[:,1:]
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [24]:
# Fit a second model on the OneHotEncoder-based feature array.
new_model = LinearRegression()
new_model.fit(x,y)

LinearRegression()

In [25]:
# Feature order here is [robinsville dummy, west windsor dummy, area], so this
# asks for a robinsville home with area 2800; the result matches the first
# model's robinsville prediction.
new_model.predict([[1,0,2800]])

array([590775.63964739])

In [28]:
# R^2 score on the training data; it is essentially identical to the first model's score.
new_model.score(x,y)

0.9573929037221873