In [None]:
'''Most machine learning algorithms cannot work with categorical data directly, so it needs to be converted into numerical data. 
Datasets often contain columns with categorical features (string values); for example, a Gender parameter 
will have categorical labels like Male and Female. These labels have no specific order of preference, yet if they are 
encoded carelessly as numbers, machine learning models can misinterpret them as having some sort of hierarchy.

One approach to solve this problem is label encoding, where we assign a numerical value to each label, for example 
Male and Female mapped to 0 and 1. But this can add bias to our model, as it may start giving higher preference to the Female 
label because 1 > 0, whereas ideally both labels are equally important in the dataset. To deal with this issue we will use the One Hot 
Encoding technique.

One hot encoding is a technique used to represent categorical variables as numerical values in a machine learning model. 
The advantages of using one hot encoding include:'''
'''1. It allows the use of categorical variables in models that require numerical input.
2. It can improve model performance by providing more information to the model about the categorical variable.
3. It can help to avoid the problem of ordinality: integer codes imply an order among the categories (as in 
“small”, “medium”, “large”) that a model may treat as meaningful even when the categories have no natural ordering.

The disadvantages of using one hot encoding include:
1. It can lead to increased dimensionality, as a separate column is created for each category in the variable. 
This can make the model more complex and slow to train.
 
2. It can lead to sparse data, as most observations will have a value of 0 in most of the one-hot encoded columns.
 
3. It can lead to overfitting, especially if there are many categories in the variable and the sample size is relatively small.
 
In summary, one-hot encoding is a powerful technique to treat categorical data, but it can lead to increased dimensionality, sparsity and overfitting. 
It is important to use it cautiously, and consider other methods such as ordinal encoding or binary encoding.

One Hot Encoding:
In this technique, a separate column is created for each label of the categorical parameter, e.g. one for Male and one for Female. So, wherever there 
is Male, the value will be 1 in the Male column and 0 in the Female column, and vice-versa. Let’s understand with an example: consider 
the home price data below, where the town, area and price of each home are given.'''

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [60]:
# Load the home price dataset (columns: town, area, price) and display it.
csv_path = "homeprices2.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [61]:
# Distinct labels present in the categorical 'town' column
df.town.unique()

array(['monroe township', 'west windsor', 'robinsville'], dtype=object)

In [62]:
# Number of rows carrying each label of the categorical 'town' column
df['town'].value_counts()

monroe township    5
west windsor       4
robinsville        4
Name: town, dtype: int64

In [63]:
# get_dummies approach: let pandas create one dummy (indicator) column per town.
# dtype=int pins the dummy columns to 0/1 integers. The default dtype of
# get_dummies differs between pandas versions (bool in pandas >= 2.0), so
# without it the table may render as True/False instead of 1/0.
one_hot_encoded_data = pd.get_dummies(df.town, dtype=int)
one_hot_encoded_data

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [13]:
# Concatenate column-wise: attach the dummy columns to the right of the original frame
frames = [df, one_hot_encoded_data]
merged = pd.concat(frames, axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [20]:
# Drop the original text column and one dummy column ('west windsor'),
# leaving 'monroe township' and 'robinsville' as the town indicators.
dropped_columns = ['town', 'west windsor']
final = merged.drop(columns=dropped_columns)
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [None]:
#One Hot Encoding using Sci-kit learn Library: 

'''OneHotEncoder is an encoder provided by the scikit-learn library. 
It is used to convert categorical variables into binary (one-hot) vectors. 
In this notebook the categorical values are label encoded before the encoder is applied. 
NOTE(review): older scikit-learn releases accepted only numerical categories, while current releases also accept string categories directly. '''

In [21]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator; it is fitted on the dummy-encoded features in a later cell.
model = LinearRegression()

In [22]:
'''Dummy Variable Trap
When you can derive one variable from other variables, they are known to be multicollinear. 
Here if you know the values of monroe township and robinsville then you can easily infer the value of west windsor, i.e. 
monroe township=0 and robinsville=0 means west windsor=1. Therefore these town variables are said to be multicollinear. In this situation 
linear regression won't work as expected. Hence you need to drop one column.

NOTE: sklearn library takes care of dummy variable trap hence even if you don't drop one of the town columns it is going to 
work, however we should make a habit of taking care of dummy variable trap ourselves just in case the library that you are using 
is not handling this for you'''

# Feature matrix: every column of `final` except the target 'price'
X = final.drop(columns='price')
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [23]:
# Target vector: the home prices
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [24]:
# Fit on X (area plus the two remaining town dummy columns) against the price target y.
model.fit(X,y)

LinearRegression()

In [30]:
# Feature order is [area, monroe township, robinsville]:
# a 2800 sq ft home in robinsville.
query = [[2800, 0, 1]]
model.predict(query)



array([590775.63964739])

In [26]:
# Feature order is [area, monroe township, robinsville]:
# a 3400 sq ft home in monroe township.
query = [[3400, 1, 0]]
model.predict(query)



array([641227.69296925])

In [51]:
# Feature order is [area, monroe township, robinsville]; both dummies at 0
# select the dropped category, i.e. a 3400 sq ft home in west windsor.
# NOTE(review): the output recorded for this cell (~8.75e7, In [51]) does not
# look like a home price; the execution count suggests it was run after `model`
# was refit on a different column layout later in the notebook. Re-run the
# notebook in order to refresh it.
query = [[3400, 0, 0]]
model.predict(query)

array([87543575.57514073])

In [28]:
# R^2 of the fitted model, computed on the same data it was trained on (no hold-out set is used here).
model.score(X,y)

0.9573929037221873

In [31]:
# LabelEncoder section: show the original frame again before the town column is encoded.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [33]:
# Using sklearn OneHotEncoder
# First step: use a label encoder to convert the town names into integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [36]:
# Build the label-encoded frame as a new object. A plain `dfle = df` only
# aliases the same DataFrame, so assigning to dfle.town would also overwrite
# the town names in df and make this cell non-repeatable.
# assign() returns a new frame whose 'town' column holds the integer codes.
dfle = df.assign(town=le.fit_transform(df.town))
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [38]:
# Feature array with the encoded town in column 0 and the area in column 1
feature_columns = ['town', 'area']
X = dfle[feature_columns].to_numpy()
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [40]:
# Target vector: the home prices from the label-encoded frame
y = dfle['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [44]:

from sklearn.compose import ColumnTransformer
# OneHotEncoder is not imported in any cell visible in this notebook, so
# without this import the cell raises NameError on a fresh kernel.
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the label-encoded town) and pass every other
# column through unchanged.
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')

In [45]:
# Replace the town code column with its one-hot columns; the area column is passed through last.
# Note: X is overwritten, so re-running this cell alone would transform the already-encoded array.
X = ct.fit_transform(X)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [46]:
# Drop the first one-hot column so that only two town dummy columns remain next to area
# (avoids the dummy variable trap). X is overwritten, so run this cell only once per pass.
X = X[:,1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [47]:
# Refit the same `model` object on the array built with OneHotEncoder; this replaces the earlier fit.
model.fit(X,y)

LinearRegression()

In [48]:
# Feature order is [robinsville, west windsor, area]: 3400 sqr ft home in west windsor
west_windsor_home = [[0, 1, 3400]]
model.predict(west_windsor_home)

array([681241.6684584])

In [49]:
# Feature order is [robinsville, west windsor, area]: 2800 sqr ft home in robinsville
robinsville_home = [[1, 0, 2800]]
model.predict(robinsville_home)

array([590775.63964739])