In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the housing / area / price table; the relative path is resolved
# against the kernel's working directory.
housing_data = pd.read_csv(filepath_or_buffer="location_home.csv")
housing_data

Unnamed: 0,housing,area,price
0,Kolkata,6300,50000
1,Kolkata,5400,40000
2,Kolkata,5200,39500
3,Kolkata,4002,20000
4,Kolkata,5610,57000
5,Kolkata,7790,70000
6,North,4522,25000
7,North,5566,30000
8,North,6635,45000
9,North,7556,80000


### Create dummy variables to substitute for the categorical `housing` column

In [4]:
# One-hot encode the text "housing" column: one indicator column per distinct
# location. dtype=int pins the indicators to 0/1 integers; without it,
# pandas >= 2.0 returns True/False booleans instead.
dummy_var = pd.get_dummies(housing_data["housing"], dtype=int)
dummy_var

Unnamed: 0,Kolkata,North,South
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,0,1,0
7,0,1,0
8,0,1,0
9,0,1,0


In [5]:
# Place the indicator columns next to the original table (rows are matched on the index).

final_csv = pd.concat(objs=[housing_data, dummy_var], axis=1)
final_csv

Unnamed: 0,housing,area,price,Kolkata,North,South
0,Kolkata,6300,50000,1,0,0
1,Kolkata,5400,40000,1,0,0
2,Kolkata,5200,39500,1,0,0
3,Kolkata,4002,20000,1,0,0
4,Kolkata,5610,57000,1,0,0
5,Kolkata,7790,70000,1,0,0
6,North,4522,25000,0,1,0
7,North,5566,30000,0,1,0
8,North,6635,45000,0,1,0
9,North,7556,80000,0,1,0


In [6]:
# The regression needs numeric inputs, so remove the text "housing" column;
# the indicator columns already carry the same information.

housing_list = final_csv.drop(columns=["housing"])
housing_list

Unnamed: 0,area,price,Kolkata,North,South
0,6300,50000,1,0,0
1,5400,40000,1,0,0
2,5200,39500,1,0,0
3,4002,20000,1,0,0
4,5610,57000,1,0,0
5,7790,70000,1,0,0
6,4522,25000,0,1,0
7,5566,30000,0,1,0
8,6635,45000,0,1,0
9,7556,80000,0,1,0


### Dummy variable trap: drop one dummy column to prevent multicollinearity
### Any one dummy column can be predicted from the others (e.g. b = 1 when a = 0 and c = 0), which can cause problems for linear regression

In [8]:
# Keep two of the three location indicators. "South" becomes the baseline
# category, represented by Kolkata == 0 and North == 0.
housing_list_fix = housing_list.drop(columns=["South"])
housing_list_fix

Unnamed: 0,area,price,Kolkata,North
0,6300,50000,1,0
1,5400,40000,1,0
2,5200,39500,1,0
3,4002,20000,1,0
4,5610,57000,1,0
5,7790,70000,1,0
6,4522,25000,0,1
7,5566,30000,0,1
8,6635,45000,0,1
9,7556,80000,0,1


In [10]:
# Feature matrix: every remaining column except the target "price".
X_axis = housing_list_fix.drop(columns=["price"])
X_axis

Unnamed: 0,area,Kolkata,North
0,6300,1,0
1,5400,1,0
2,5200,1,0
3,4002,1,0
4,5610,1,0
5,7790,1,0
6,4522,0,1
7,5566,0,1
8,6635,0,1
9,7556,0,1


In [12]:
# Target vector: the "price" column, selected as a Series.
Y_axis = housing_list_fix.loc[:, "price"]
Y_axis

0      50000
1      40000
2      39500
3      20000
4      57000
5      70000
6      25000
7      30000
8      45000
9      80000
10     90000
11    100000
12     60000
13     75000
14     80000
15     86000
16    120000
Name: price, dtype: int64

### Linear Regression

In [15]:
# Instantiate scikit-learn's LinearRegression with its default settings.
# Nothing is trained here; that happens when .fit() is called.
reg = LinearRegression()   # estimator instance, not yet fitted
reg

LinearRegression()

In [16]:
# Train the model: features are area plus the two location indicators, target is price.
reg.fit(X=X_axis, y=Y_axis)

LinearRegression()

In [18]:
# Predict the price for area = 5200 in South (the baseline category:
# Kolkata = 0 and North = 0). The query is wrapped in a DataFrame that reuses
# the training column names: the model was fitted on a DataFrame, and
# scikit-learn warns when predict() is handed input without feature names.
reg.predict(pd.DataFrame([[5200, 0, 0]], columns=X_axis.columns))



array([58247.150706])

In [20]:
# Coefficient of determination (R^2) of the fitted model, returned as a
# decimal fraction and evaluated on the same data the model was trained on.
reg.score(X=X_axis, y=Y_axis)

0.8860432252475337

In [21]:
# Predict the price for area = 5200 in Kolkata (Kolkata = 1, North = 0).
# The query is wrapped in a DataFrame that reuses the training column names:
# the model was fitted on a DataFrame, and scikit-learn warns when predict()
# is handed input without feature names.
reg.predict(pd.DataFrame([[5200, 1, 0]], columns=X_axis.columns))



array([37138.25127667])

In [22]:
# Predict the price for area = 5200 in North (Kolkata = 0, North = 1).
# The query is wrapped in a DataFrame that reuses the training column names:
# the model was fitted on a DataFrame, and scikit-learn warns when predict()
# is handed input without feature names.
reg.predict(pd.DataFrame([[5200, 0, 1]], columns=X_axis.columns))



array([35209.17863639])

### Use One Hot Encoder + Label Encoder

In [24]:
# Encoder used below to turn the location names into integer codes.
label_enc = LabelEncoder()

In [25]:
# Load the CSV again for the label-encoding example.
housing_data = pd.read_csv(filepath_or_buffer="location_home.csv")
housing_data

Unnamed: 0,housing,area,price
0,Kolkata,6300,50000
1,Kolkata,5400,40000
2,Kolkata,5200,39500
3,Kolkata,4002,20000
4,Kolkata,5610,57000
5,Kolkata,7790,70000
6,North,4522,25000
7,North,5566,30000
8,North,6635,45000
9,North,7556,80000


In [26]:
# Work on an independent copy. A plain assignment would only alias
# housing_data, so encoding the "housing" column in the next cell would also
# overwrite the location names in the freshly loaded table.
housing_label_enc = housing_data.copy()

In [28]:
# Swap the location names for integer codes
# (Kolkata -> 0, North -> 1, South -> 2 in the output shown).
housing_label_enc["housing"] = label_enc.fit_transform(housing_label_enc["housing"])
housing_label_enc

Unnamed: 0,housing,area,price
0,0,6300,50000
1,0,5400,40000
2,0,5200,39500
3,0,4002,20000
4,0,5610,57000
5,0,7790,70000
6,1,4522,25000
7,1,5566,30000
8,1,6635,45000
9,1,7556,80000


In [30]:
# Feature matrix as a plain NumPy array: encoded location first, then area.
X_val = housing_label_enc.loc[:, ["housing", "area"]].values
X_val

array([[   0, 6300],
       [   0, 5400],
       [   0, 5200],
       [   0, 4002],
       [   0, 5610],
       [   0, 7790],
       [   1, 4522],
       [   1, 5566],
       [   1, 6635],
       [   1, 7556],
       [   1, 7996],
       [   1, 8100],
       [   2, 5000],
       [   2, 6000],
       [   2, 7000],
       [   2, 7500],
       [   2, 8000]], dtype=int64)