In [76]:
import pandas as pd

In [77]:
df = pd.read_csv('homePricesOfCities.csv')

In [78]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


## Method 1 Using dummies variables

In [79]:
dummies = pd.get_dummies(df.town)   # To create multiple columns based on values in each row

In [80]:
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [81]:
mergedtable = pd.concat([df, dummies], axis='columns')  # Joining two rows. Actual and dummies

In [7]:
mergedtable

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,True,False,False
1,monroe township,3000,565000,True,False,False
2,monroe township,3200,610000,True,False,False
3,monroe township,3600,680000,True,False,False
4,monroe township,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robinsville,2600,575000,False,True,False


In [28]:
finalTable = mergedtable.drop(['town', 'west windsor'], axis='columns') # Here we are dropping town as we have already created column for each value. Also we are dropping west windsor because with previous two column(with value false) it can be derived

In [29]:
finalTable

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [30]:
x = finalTable.drop(['price'], axis='columns')

In [31]:
x

Unnamed: 0,area,monroe township,robinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [32]:
y = finalTable.price

In [33]:
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:
model = LinearRegression()

In [36]:
model.fit(x,y)                #fit is for training the model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [37]:
model.score(x,y)   # This is to find the accurate score

0.9573929037221872

In [38]:
model.predict([[5000, True, False]])   # This is to find in monroe township



array([844263.59922594])

In [39]:
model.predict([[5000, False, True]])  # This is to find in robinsville



array([869950.01075034])

In [40]:
model.predict([[5000, False, False]]) # This is to find in west windsor



array([884277.57471508])

## Method 2 : Using One hot encoder

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
le = LabelEncoder()

In [8]:
df2 = df
df2

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [10]:
df2.town = le.fit_transform(df['town'])

In [11]:
df2

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [59]:
X = df2[['town', 'area']].values          #pass the values in array

In [60]:
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [61]:
y = df2.price.values

In [62]:
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000])

In [63]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder='passthrough')  
# IMPORTANT: We can see few difference in video and here. Because the video one was deprecated

In [64]:
X = ct.fit_transform(X)   # This will assign each town value to area and create 2D array

In [65]:
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [74]:
X = X[:,1:]              # Dropping first column (monroe township)
X

array([[0.0e+00, 2.6e+03],
       [0.0e+00, 3.0e+03],
       [0.0e+00, 3.2e+03],
       [0.0e+00, 3.6e+03],
       [0.0e+00, 4.0e+03],
       [1.0e+00, 2.6e+03],
       [1.0e+00, 2.8e+03],
       [1.0e+00, 3.3e+03],
       [1.0e+00, 3.6e+03],
       [0.0e+00, 2.6e+03],
       [0.0e+00, 2.9e+03],
       [0.0e+00, 3.1e+03],
       [0.0e+00, 3.6e+03]])

In [67]:
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()

In [68]:
model2.fit(X,y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [69]:
model2.predict([[0,0,5000]])  # This is to find in monroe township

array([844263.59922597])

In [70]:
model2.predict([[1,0,5000]])  # This is to find in robinsville

array([869950.01075037])

In [72]:
model2.predict([[0,1,5000]]) # This is to find in west windsor

array([884277.57471511])

In [73]:
model2.score(X,y)

0.9573929037221873

#### We can see above 2 methods are working and giving nearly to value