In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the home-price listings (town, area, price) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='homeprices.csv')
df.head(n=5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(13, 3)

In [4]:
# Number of listings per town
df['town'].value_counts()

monroe township    5
robinsville        4
west windsor       4
Name: town, dtype: int64

In [5]:
# One-hot encode the categorical `town` column: one indicator column per town.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 returns booleans
# by default, which would turn the mixed feature matrix later taken via
# `.values` into an object array instead of a numeric one.
dummies = pd.get_dummies(df.town, dtype=int)

In [6]:
# Preview the indicator columns produced by the one-hot encoding
dummies.head(n=5)

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [7]:
# Place the indicator columns side by side with the original columns.
merged = pd.concat(objs=[df, dummies], axis='columns')

In [10]:
# Preview the combined frame (original columns followed by the indicators)
merged.head(n=5)

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0


In [12]:
# Avoid the dummy variable trap: with every indicator column present, any one
# of them is fully determined by the others, so one must be dropped. Drop the
# last one ('west windsor') together with the raw 'town' text column.
final = merged.drop(columns=['town', 'west windsor'])

In [13]:
# Preview the first three rows of the model-ready frame
final.head(n=3)

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0


In [17]:
# Feature matrix: every column of `final` except the target, as a NumPy array.
X = final.drop(columns=['price']).values

In [21]:
# (number of samples, number of features)
X.shape

(13, 3)

In [19]:
# Target vector: the `price` column as a NumPy array.
y = final['price'].values

In [22]:
# Shape of the target vector
y.shape

(13,)

In [24]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of y on the feature matrix X.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator as the cell's output.
model = LinearRegression().fit(X, y)
model

LinearRegression()

In [25]:
# score() predicts on X (here the training set), compares the predictions with
# the original values in y, and returns the R^2 coefficient of determination.
model.score(X, y)

0.9573929037221873

In [33]:
from sklearn.preprocessing import LabelEncoder

# Alternative route to one-hot vectors using scikit-learn: first turn the town
# names into integer codes with LabelEncoder, then hand those codes to
# OneHotEncoder. `assign` returns a new frame, so `df` itself is left untouched.
le = LabelEncoder()
dfle = df.assign(town=le.fit_transform(df['town']))
dfle.head()

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000


In [36]:
# Feature matrix from the label-encoded frame: columns are [town code, area].
X = dfle.drop(columns=['price']).values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [37]:
# Target vector: the `price` column of the label-encoded frame.
y = dfle['price'].values

In [38]:
# Display the target values
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the town code). Without remainder='passthrough' the
# result would contain only the one-hot columns; with it, the remaining
# columns are kept alongside them.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [42]:
# Apply the column transformer: one-hot encode the town code (column 0) and
# pass the remaining column through.
# The input is rebuilt from `dfle` instead of being read from `X`, so re-running
# this cell does not feed an already-encoded matrix back into the encoder.
X = ct.fit_transform(dfle.drop('price', axis='columns').values)