In [51]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [3]:
"""Reference https://youtu.be/9yl6-HEY7_s"""
# Load the home price data (columns: town, area, price) into a DataFrame.
df = pd.read_csv("data/homeprices.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [5]:
# One-hot encode the town names: one indicator column per distinct town.
dummies = pd.get_dummies(df["town"])
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [7]:
# Place the town indicator columns to the right of the original columns.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,True,False,False
1,monroe township,3000,565000,True,False,False
2,monroe township,3200,610000,True,False,False
3,monroe township,3600,680000,True,False,False
4,monroe township,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robinsville,2600,575000,False,True,False


In [9]:
# Remove the raw town text and one indicator column ("west windsor").
# The two remaining indicators still identify the town: west windsor is the
# row where both of them are False.
final = merged.drop(columns=["town", "west windsor"])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [12]:
# Create the (not yet fitted) linear regression estimator; it is trained in a later cell.
model = LinearRegression()

`x` (the input features) holds the area and the town indicator columns; `y` (the labels, i.e. the targets) is the price.

In [16]:
# The inputs are every column except the target, so remove "price".
x = final.drop(columns="price")
x

Unnamed: 0,area,monroe township,robinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [15]:
# The target is the price column.
y = final["price"]
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [20]:
# Train the regression: area and town indicator columns as inputs, price as target.
model.fit(X=x, y=y)

In [22]:
# Predict the price of a 2800 sq ft home in robinsville.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with
# the same column names in the same order; a bare list makes scikit-learn warn
# that X has no valid feature names and hides which number is which.
model.predict(
    pd.DataFrame({"area": [2800], "monroe township": [0], "robinsville": [1]})
)




array([590775.63964739])

In [23]:
# Predict the price of a 3400 sq ft home in west windsor.
# West windsor is the town whose indicator column was dropped, so it is encoded
# as both remaining indicators being 0. The query is a DataFrame with the same
# column names and order used for fitting, which avoids scikit-learn's
# missing-feature-names warning that a bare list would trigger.
model.predict(
    pd.DataFrame({"area": [3400], "monroe township": [0], "robinsville": [0]})
)




array([681241.66845839])

In [27]:
# score() reports R^2 (the coefficient of determination), not classification
# accuracy; here it is computed on the same rows the model was fitted on.
model.score(X=x, y=y)

0.9573929037221873

In [31]:
# <<<Label Encoder>>>
# Encode each town name as an integer label:
# 0 is monroe township, 1 is robinsville, 2 is west windsor.
le = LabelEncoder()
# Work on a copy: a plain `dfle = df` only creates a second name for the same
# frame, so assigning the encoded column would also overwrite the town names
# in `df` and make every earlier cell that reads `df.town` non-repeatable.
dfle = df.copy()
# Fit once and store the labels in the copied data frame.
dfle.town = le.fit_transform(dfle.town)
print(dfle.town.to_numpy())  # show the label assigned to each row
dfle

[0 0 0 0 0 2 2 2 2 1 1 1 1]


Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [35]:
# Build the training inputs.
# to_numpy() yields a plain 2D array of [town label, area] rather than a data frame.
x = dfle[["town", "area"]].to_numpy()
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [36]:
# Target values: the price column.
y = dfle["price"]
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [None]:
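# Create OneHotEncoder class object
# NOTE: the `categorical_features` argument is deprecated, so the call below is
# kept for reference only; a ColumnTransformer is used to pick the column instead.
# ohe = OneHotEncoder(categorical_features=[0]) # specify the index of the categorical column in x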

In [76]:
"""Solution"""
# Expects df to be the DataFrame with columns ['town', 'area', 'price']
le = LabelEncoder()
# assign() returns a new frame, so the original df is left unmodified;
# its 'town' column is replaced by the integer labels from LabelEncoder
dfle = df.assign(town=le.fit_transform(df['town']))

# Features (X) as a 2D array of [town label, area]; target (y) is the price
X = dfle[['town', 'area']].to_numpy()
y = dfle['price']
print(y)

# One-hot encode column 0 (town) and pass the remaining column (area) through.
# drop="first" leaves out the indicator column of the first town category.
ct = ColumnTransformer(
    transformers=[('one_hot_encoder', OneHotEncoder(drop="first"), [0])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X_train and X_test now carry the town column in one-hot encoded form

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64


In [77]:
# Display the encoded feature matrix: [robinsville, west windsor, area].
# `X` already holds the result of `ct.fit_transform` from the cell that built
# the transformer. Calling `ct.fit_transform(X)` again would refit the encoder
# on this already-encoded matrix (treating the robinsville indicator as the
# town column), leaving `ct` unusable for transforming raw [town, area] rows.
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [83]:
# Refit the same estimator on the training split of the one-hot encoded features.
model.fit(X=X_train, y=y_train)

In [87]:
# Columns are [robinsville, west windsor, area]: price of a 2800 sq ft home in robinsville
model.predict(np.array([[1, 0, 2800]]))

array([586192.30769231])

In [90]:
# Columns are [robinsville, west windsor, area]: price of a 3400 sq ft home in west windsor
model.predict(np.array([[0, 1, 3400]]))

array([684277.77777778])

In [None]:
"""One-Hot encoding
Referencce: https://www.scaler.com/topics/data-science/one-hot-encoding/"""

# #<<Using Pandas>>
# df = pd.DataFrame({
#     "name" : ["Swift", "Verna", "Polo", "Creta", "Innova"],
#     "brand": ["Suzuki", "Hyundai", "VW", "Hyundai", "Toyota"],
#     "color": ["Red", "White", 'Red', "Black", "White"]
# })
# print(df)
# print()

# df_one_hot_encoded = pd.get_dummies(data = df, columns = ["brand","color"], prefix = "is")
# print(df_one_hot_encoded)
# print()

# #<<Using Scikit-Learn>>
# cols_to_encode = ["brand", "color"]
# new_encoded_cols_names = []