In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the home-price data set from a CSV file resolved relative to the current
# working directory, then display the resulting frame.
df = pd.read_csv("homeprices.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [3]:
# Label-encode the town names into integer codes before one-hot encoding them.
# NOTE(review): current scikit-learn OneHotEncoder accepts string categories
# directly, so this intermediate step is probably unnecessary — confirm against
# the installed version.
le = LabelEncoder()

In [4]:
# Fit the label encoder on the town column and overwrite that column with the
# resulting integer codes (0, 1 or 2), then display the updated frame.
df['town'] = le.fit_transform(df['town'])
df

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [5]:
# Feature frame: the label-encoded town code together with the area.
x = df.loc[:, ['town', 'area']]
x

Unnamed: 0,town,area
0,0,2600
1,0,3000
2,0,3200
3,0,3600
4,0,4000
5,2,2600
6,2,2800
7,2,3300
8,2,3600
9,1,2600


In [6]:
# Target vector: the price column.
y = df['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [7]:
# One-hot encoder that will be applied to the town column.
# handle_unknown='ignore' is passed so that categories not seen during fit are
# not treated as an error at transform time (see the OneHotEncoder docs).
ohe = OneHotEncoder(handle_unknown='ignore')

In [8]:
# One-hot encode the town codes. The encoder result is densified with
# .toarray() before being wrapped in a DataFrame (one column per town code).
town_encoded = ohe.fit_transform(df[['town']])
# Reuse df's index so the later column-wise concat with the feature frame lines
# up row for row even if df does not carry the default 0..n-1 index.
x1 = pd.DataFrame(town_encoded.toarray(), index=df.index)

In [9]:
# Avoid the dummy-variable trap: drop the dummy column labelled 0 (town code 0),
# which makes that town the all-zeros baseline.
# Dropping by label with errors='ignore' (instead of slicing off "the first
# column") keeps the cell idempotent: re-running it no longer strips an
# additional dummy column each time.
x1 = x1.drop(columns=[0], errors='ignore')
x1

Unnamed: 0,1,2
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,1.0,0.0


In [10]:
# Append the dummy columns to the feature frame side by side; rows are matched
# on the index.
x = pd.concat(objs=[x, x1], axis=1)
x

Unnamed: 0,town,area,1,2
0,0,2600,0.0,0.0
1,0,3000,0.0,0.0
2,0,3200,0.0,0.0
3,0,3600,0.0,0.0
4,0,4000,0.0,0.0
5,2,2600,0.0,1.0
6,2,2800,0.0,1.0
7,2,3300,0.0,1.0
8,2,3600,0.0,1.0
9,1,2600,1.0,0.0


In [11]:
# The raw town code is now redundant with its dummy columns, so remove it.
# x is rebound to a new frame instead of being mutated with inplace=True, and a
# missing 'town' column is tolerated, so re-running this cell does not raise.
x = x.drop(columns='town', errors='ignore')
x

Unnamed: 0,area,1,2
0,2600,0.0,0.0
1,3000,0.0,0.0
2,3200,0.0,0.0
3,3600,0.0,0.0
4,4000,0.0,0.0
5,2600,0.0,1.0
6,2800,0.0,1.0
7,3300,0.0,1.0
8,3600,0.0,1.0
9,2600,1.0,0.0


In [12]:
# Make every column label of x a string.
# Fitting the linear regression raised a TypeError because the feature names
# were of mixed types (the string 'area' next to the integer dummy labels);
# scikit-learn wants all feature names to share one type, preferably str.
x.columns = [str(label) for label in x.columns]

In [13]:
# Build and train a linear regression model on the features x and target y.
# NOTE(review): consider moving this import up to the notebook's import cell.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so the fitted model is bound directly.
model = LinearRegression().fit(x, y)
model

In [14]:
# Predict the price of a 2800 sqft house located in robinsville.
# The model was fitted on a DataFrame with named columns, so the query is passed
# as a DataFrame with the same names rather than a bare nested list; this makes
# the meaning of each value explicit instead of relying on positional order.
# Dummy columns: '1' = robinsville, '2' = west windsor; both 0 = monroe township.
model.predict(pd.DataFrame({'area': [2800], '1': [1], '2': [0]}))

array([590775.63964739])

In [15]:
# Predict the price of a 3400 sqft house located in monroe township.
# The model was fitted on a DataFrame with named columns, so the query is passed
# as a DataFrame with the same names rather than a bare nested list; this makes
# the meaning of each value explicit instead of relying on positional order.
# Dummy columns: '1' = robinsville, '2' = west windsor; both 0 = monroe township.
model.predict(pd.DataFrame({'area': [3400], '1': [0], '2': [0]}))

array([641227.69296925])

In [16]:
# Score the fitted regression on x and y. For LinearRegression this is the
# coefficient of determination (R^2), not a classification accuracy, and it is
# computed here on the same data the model was trained on, so it is not a
# held-out estimate.
model.score(x,y)

0.9573929037221872