In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [70]:
# Read the housing CSV; the relative path is resolved against the current
# working directory.
data = pd.read_csv(filepath_or_buffer='taiwan_housing_data.csv')
# Bare last expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,house_age,dist_MRT,num_stores,Xs,Ys,dist_fr_cen,price
0,32.0,84.87882,10,0.666075,1.559003,2.553169,37.9
1,19.5,306.59470,9,0.597812,1.265025,2.256929,42.2
2,13.3,561.98450,5,1.027494,2.062484,3.089078,47.3
3,13.3,561.98450,5,1.027494,2.062484,3.089078,54.8
4,5.0,390.56840,5,0.896828,1.163084,2.177555,43.1
...,...,...,...,...,...,...,...
409,13.7,4082.01500,0,-2.929249,-3.101137,4.094031,15.4
410,5.6,90.45606,9,0.972968,0.605721,1.642819,50.0
411,18.8,390.96960,7,0.635474,1.142664,2.134823,40.6
412,8.1,104.81010,5,0.743234,-0.240095,0.764526,52.5


#### Using SKLearn

In [71]:
from sklearn.linear_model import LinearRegression

In [72]:
# Predictor columns used for the regression; `price` is the response.
feature_cols = ['house_age', 'dist_MRT', 'num_stores', 'Xs', 'Ys', 'dist_fr_cen']
X = data[feature_cols]
y = data['price']

In [73]:
# Fit the linear model on (X, y); fit() returns the estimator itself, so the
# fitted object can be bound in a single statement.
model = LinearRegression().fit(X, y)
# One slope per column of X, in column order (the intercept is not part of
# coef_).
model.coef_

array([-0.25978681, -0.00928698,  1.04741983,  0.2052644 , -0.41837436,
        5.00374588])

#### Using the statsmodels API

In [74]:
# Rebuild the predictors/response from `data` so this cell does not depend on
# the scikit-learn section having been run.
feature_cols = ['house_age', 'dist_MRT', 'num_stores', 'Xs', 'Ys', 'dist_fr_cen']
y = data['price']
# sm.OLS does not add an intercept on its own, so prepend an explicit
# constant column to the design matrix.
X = sm.add_constant(data[feature_cols])

In [75]:
# Ordinary least squares: `y` is the response (endog), `X` the design matrix
# including the constant column (exog).
s_model = sm.OLS(endog=y, exog=X)
s_results = s_model.fit()
# Rendered as the cell output: coefficient table plus fit diagnostics.
s_results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.587
Model:,OLS,Adj. R-squared:,0.581
Method:,Least Squares,F-statistic:,96.37
Date:,"Mon, 14 Jun 2021",Prob (F-statistic):,5.400000000000001e-75
Time:,00:22:12,Log-Likelihood:,-1484.7
No. Observations:,414,AIC:,2983.0
Df Residuals:,407,BIC:,3012.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,37.8992,1.889,20.063,0.000,34.186,41.613
house_age,-0.2598,0.038,-6.767,0.000,-0.335,-0.184
dist_MRT,-0.0093,0.001,-6.353,0.000,-0.012,-0.006
num_stores,1.0474,0.189,5.535,0.000,0.675,1.419
Xs,0.2053,0.484,0.424,0.672,-0.746,1.157
Ys,-0.4184,0.764,-0.548,0.584,-1.920,1.083
dist_fr_cen,5.0037,1.271,3.936,0.000,2.505,7.503

0,1,2,3
Omnibus:,241.076,Durbin-Watson:,2.153
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3700.329
Skew:,2.148,Prob(JB):,0.0
Kurtosis:,17.002,Cond. No.,8320.0


#### Normalizing a Dataset

In [76]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [77]:
# Toy 5x2 matrix whose two columns live on very different scales.
# NOTE: this rebinds `data`, replacing the housing DataFrame loaded earlier in
# the notebook; re-run the read_csv cell before re-running the regression cells.
data = np.array([
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
])
# Wrap in a DataFrame only for a tabular display; `data` itself stays an array.
pd.DataFrame(data)

Unnamed: 0,0,1
0,100.0,0.001
1,8.0,0.05
2,50.0,0.005
3,88.0,0.07
4,4.0,0.1


In [78]:
# Min-max scaler with default settings; in the recorded output each column's
# smallest value maps to 0 and its largest to 1.
scaler = MinMaxScaler()
# Learn the per-column min/max from `data` and rescale it in the same call.
scaled = scaler.fit_transform(data)
print(scaled)

[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


In [79]:
# Show the min-max scaled array as a table (display only; nothing is assigned).
pd.DataFrame(scaled)

Unnamed: 0,0,1
0,1.0,0.0
1,0.041667,0.494949
2,0.479167,0.040404
3,0.875,0.69697
4,0.0,1.0


#### Standardizing a Dataset

In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
# NOTE: rebinds `scaler` and `scaled`, so the min-max scaler and its result
# from the previous section are no longer reachable after this cell.
scaler = StandardScaler()
# Fit the scaler on `data` and standardize it in the same call.
scaled = scaler.fit_transform(data)

In [82]:
# Show the standardized array as a table (display only; nothing is assigned).
pd.DataFrame(scaled)

Unnamed: 0,0,1
0,1.263981,-1.1639
1,-1.061744,0.126396
2,0.0,-1.058569
3,0.960626,0.653048
4,-1.162863,1.443025


#### Dummy Variables

In [83]:
# Minimal single-column frame with one categorical feature to encode.
df = pd.DataFrame({'City': ['Chicago', 'Boston', 'Seattle']})
df

Unnamed: 0,City
0,Chicago
1,Boston
2,Seattle


In [84]:
# Preview the one-hot encoding of `City`: one indicator column per distinct
# value. Nothing is assigned here.
pd.get_dummies(df['City'])

Unnamed: 0,Boston,Chicago,Seattle
0,0,1,0
1,1,0,0
2,0,0,1


In [85]:
# Attach one indicator column per city next to the original `City` column.
# Joining onto df[['City']] rather than onto df itself keeps this cell
# idempotent: on a second run df already carries Boston/Chicago/Seattle, and
# DataFrame.join raises ValueError when column names overlap.
df = df[['City']].join(pd.get_dummies(df['City']))

In [86]:
# Display the frame with the dummy columns joined alongside `City`.
df

Unnamed: 0,City,Boston,Chicago,Seattle
0,Chicago,0,1,0
1,Boston,1,0,0
2,Seattle,0,0,1
