In [7]:
import csv

# Column names and the five sample rows (town label, area, price).
header = ['town', 'area', 'price']
data = [
    ["Dbs", 2000, 2001],
    ["Dbs", 2700, 5381],
    ["Fbs", 3300, 8567],
    ["yo", 4289, 11568],
    ["yo", 3800, 14246],
]

# newline='' leaves line-ending handling to the csv module.
with open('countries.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # Header row first, then every data row, in a single call.
    writer.writerows([header, *data])


In [8]:
# Categorical Variables:
#         1. Nominal Variables: They don't have any numeric ordering. Eg: town
#         2. Ordinal Variables: They have a numeric ordering. Eg: ratings
# To deal with nominal variables we use a technique called one-hot encoding. The extra variables used in the technique
# are called dummy variables.


In [9]:
import pandas as pd
# Read the CSV written by the first cell back into a DataFrame and display it.
df = pd.read_csv("countries.csv")
df

Unnamed: 0,town,area,price
0,Dbs,2000,2001
1,Dbs,2700,5381
2,Fbs,3300,8567
3,yo,4289,11568
4,yo,3800,14246


In [10]:
# One indicator (dummy) column per distinct town. dtype=int keeps the columns as
# 0/1 integers on every pandas version; pandas 2.x otherwise returns True/False.
dummies = pd.get_dummies(df.town, dtype=int)

In [11]:
# Place the dummy columns next to the original frame. The raw town column and
# one redundant dummy column are dropped in the following cell.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,Dbs,Fbs,yo
0,Dbs,2000,2001,1,0,0
1,Dbs,2700,5381,1,0,0
2,Fbs,3300,8567,0,1,0
3,yo,4289,11568,0,0,1
4,yo,3800,14246,0,0,1


In [12]:
# Remove the raw town labels and one dummy column ('yo'): with k categories only
# k-1 indicators are needed, and the dropped category becomes the baseline.
final = merged.drop(columns=['town', 'yo'])
final

Unnamed: 0,area,price,Dbs,Fbs
0,2000,2001,1,0
1,2700,5381,1,0
2,3300,8567,0,1
3,4289,11568,0,0
4,3800,14246,0,0


In [13]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator. This one instance is fitted below on the
# get_dummies features and later fitted again on the OneHotEncoder features.
regr = LinearRegression()

In [14]:
# Feature matrix: every column of `final` except the target (price).
x = final.drop(columns=["price"])
x

Unnamed: 0,area,Dbs,Fbs
0,2000,1,0
1,2700,1,0
2,3300,0,1
3,4289,0,0
4,3800,0,0


In [15]:
# Target vector: the price column.
y = final["price"]
y

0     2001
1     5381
2     8567
3    11568
4    14246
Name: price, dtype: int64

In [16]:
# Fit the linear model on the dummy-encoded features (area, Dbs, Fbs) against price.
regr.fit(x,y)

LinearRegression()

In [17]:
# Predict the price for area=3000 in town "Fbs". The model was fitted on a
# DataFrame, so pass a DataFrame with the same column names and order
# (area, Dbs, Fbs). A bare list triggers scikit-learn's "X does not have valid
# feature names" warning and hides which number means what.
regr.predict(pd.DataFrame({"area": [3000], "Dbs": [0], "Fbs": [1]}))



array([8132.31577063])

In [18]:
# Predict the price for area=4000 in town "Dbs". The model was fitted on a
# DataFrame, so pass a DataFrame with the same column names and order
# (area, Dbs, Fbs). A bare list triggers scikit-learn's "X does not have valid
# feature names" warning and hides which number means what.
regr.predict(pd.DataFrame({"area": [4000], "Dbs": [1], "Fbs": [0]}))



array([6081.76326152])

In [19]:
# score() returns the coefficient of determination (R^2), evaluated here on the
# same rows the model was trained on, so it is not an out-of-sample accuracy estimate.
regr.score(x,y)

0.9095062920591562

In [20]:
# Second approach: scikit-learn encoders. LabelEncoder first turns the town
# labels into integer codes; the one-hot step itself is done later with OneHotEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [21]:
# Work on a copy: `dfl = df` would only alias the same object, so encoding the
# town column would also overwrite the string labels in `df` and break any
# re-run of the earlier cells that read df.town.
dfl = df.copy()
# LabelEncoder assigns integer codes to the sorted unique labels.
dfl["town"] = le.fit_transform(dfl["town"])
dfl

Unnamed: 0,town,area,price
0,0,2000,2001
1,0,2700,5381
2,1,3300,8567
3,2,4289,11568
4,2,3800,14246


In [22]:
# Feature array with the integer town code in column 0 and area in column 1.
x = dfl.loc[:, ['town', 'area']].values
x

array([[   0, 2000],
       [   0, 2700],
       [   1, 3300],
       [   2, 4289],
       [   2, 3800]])

In [23]:
# Target vector for the second model: the price column of the encoded frame.
y = dfl["price"]

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (the integer town code) and pass the remaining column
# (area) through unchanged; the passthrough column ends up after the encoded ones.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
)
# Encode from dfl rather than from x itself, so that re-running this cell does
# not try to one-hot encode an array that is already encoded.
x = ct.fit_transform(dfl[['town', 'area']].values)
x

array([[1.000e+00, 0.000e+00, 0.000e+00, 2.000e+03],
       [1.000e+00, 0.000e+00, 0.000e+00, 2.700e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, 3.300e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, 4.289e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, 3.800e+03]])

In [25]:
# Drop the first indicator column (town code 0) to avoid the dummy-variable trap,
# leaving [town==1, town==2, area]. The array is re-derived from the fitted
# transformer instead of slicing x in place, so re-running this cell cannot
# strip a second column.
x = ct.transform(dfl[['town', 'area']].values)[:, 1:]
x

array([[0.000e+00, 0.000e+00, 2.000e+03],
       [0.000e+00, 0.000e+00, 2.700e+03],
       [1.000e+00, 0.000e+00, 3.300e+03],
       [0.000e+00, 1.000e+00, 4.289e+03],
       [0.000e+00, 1.000e+00, 3.800e+03]])

In [26]:
# Refit the same estimator object on the one-hot encoded array; this replaces the
# earlier fit, and the feature order is now [town==1, town==2, area].
regr.fit(x,y)

LinearRegression()

In [27]:
# Predict for town Fbs with area 4500. Input order is [Fbs, yo, area]
# (the Dbs indicator column was dropped above).
regr.predict([[1,0,4500]])

array([10305.73691747])