In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib inline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor as gbr
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('flats_wroclaw_final.csv')

#### Dummy values to add some attributes to model

In [118]:
df = pd.get_dummies(df)

In [120]:
df.corr()[['price']].sort_values(by='price', ascending=False)

Unnamed: 0,price
price,1.0
flat_surface_m2,0.763638
room_number,0.5244
price_per_m2,0.497794
floor_number,0.325845
floor,0.293427
building_type_apartamentowiec,0.138394
district_Stare Miasto,0.111519
district_Śródmieście,0.058282
building_type_loft,0.050685


In [122]:
df.corr()[['prod_year']].sort_values(by='prod_year', ascending=False)
# high correlation with nothing

Unnamed: 0,prod_year
prod_year,1.0
room_number,0.013955
district_Fabryczna,0.006107
building_type_dom wolnostojący,0.005483
building_type_loft,0.002138
building_type_plomba,0.000604
building_type_kamienica,0.000561
district_Stare Miasto,0.000545
district_Psie Pole,0.000182
building_type_szeregowiec,-4.9e-05


In [123]:
df.corr()[['floor']].sort_values(by='floor', ascending=False)
# high correlation with nothing

Unnamed: 0,floor
floor,1.0
floor_number,0.475856
price_per_m2,0.365945
price,0.293427
flat_surface_m2,0.08976
building_type_loft,0.043093
district_Stare Miasto,0.031258
building_type_apartamentowiec,0.029249
building_type_szeregowiec,0.027141
room_number,0.023528


In [124]:
df.corr()[['floor_number']].sort_values(by='floor_number', ascending=False)
# acceptable correlation with price per m2

Unnamed: 0,floor_number
floor_number,1.0
price_per_m2,0.551983
floor,0.475856
price,0.325845
district_Stare Miasto,0.056694
building_type_apartamentowiec,0.039329
building_type_szeregowiec,0.028809
district_Śródmieście,0.025022
building_type_loft,0.016662
building_type_kamienica,0.0122


In [125]:
df.corr()[['room_number']].sort_values(by='room_number', ascending=False)
# high correlation with flat surface but it is obvious

Unnamed: 0,room_number
room_number,1.0
flat_surface_m2,0.792123
price,0.5244
building_type_apartamentowiec,0.100065
district_Śródmieście,0.064769
district_Stare Miasto,0.061793
building_type_kamienica,0.045587
building_type_loft,0.036666
building_type_szeregowiec,0.023588
floor,0.023528


In [126]:
df.corr()[['flat_surface_m2']].sort_values(by='flat_surface_m2', ascending=False)
# high correlation with room number but it is obvious

Unnamed: 0,flat_surface_m2
flat_surface_m2,1.0
room_number,0.792123
price,0.763638
building_type_apartamentowiec,0.129952
floor,0.08976
district_Stare Miasto,0.089579
district_Śródmieście,0.061707
building_type_loft,0.048456
building_type_kamienica,0.048316
building_type_szeregowiec,0.023647


In [127]:
df.corr()[['price_per_m2']].sort_values(by='price_per_m2', ascending=False)
# acceptable correlation with floor_number

Unnamed: 0,price_per_m2
price_per_m2,1.0
floor_number,0.551983
price,0.497794
floor,0.365945
building_type_apartamentowiec,0.067635
district_Stare Miasto,0.058581
district_Śródmieście,0.037884
building_type_loft,0.02785
building_type_szeregowiec,0.026858
building_type_kamienica,0.009595


##### X (predicator) and y (regressor) 

In [138]:
predicator = 'flat_surface_m2'
regressor = 'price'
X = df[predicator].values.reshape(-1,1)
y = df[regressor].values

In [129]:
X_sm = X = sm.add_constant(X)
model_sm = sm.OLS(y, X_sm)
model_sm.fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.583
Model:,OLS,Adj. R-squared:,0.583
Method:,Least Squares,F-statistic:,13090.0
Date:,"Wed, 27 Jan 2021",Prob (F-statistic):,0.0
Time:,15:58:35,Log-Likelihood:,-126250.0
No. Observations:,9358,AIC:,252500.0
Df Residuals:,9356,BIC:,252500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.782e+04,5183.046,-9.226,0.000,-5.8e+04,-3.77e+04
x1,9472.5789,82.800,114.404,0.000,9310.274,9634.884

0,1,2,3
Omnibus:,12289.077,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8967003.026
Skew:,6.859,Prob(JB):,0.0
Kurtosis:,154.027,Cond. No.,180.0


##### Train and Test dataset

In [140]:
# the test set will be 20% of the whole data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [141]:
slr_model = LinearRegression()
slr_model = slr_model.fit(X_train, y_train)

#### a1:
##### for an increase of 1 square meter in house size,
##### the house price will go up by ~9242.25 PLN , on average

In [142]:
slr_model.coef_

array([9242.24942581])

#### a0:
##### the value of y when x=0

In [143]:
slr_model.intercept_

-36133.38605649513

#### y^ = a1 * X + a0
##### y^ = 9242.25 * X - 4782.16

In [144]:
prediction = slr_model.coef_ * X_test + slr_model.intercept_

In [145]:
mean_absolute_error(y_test, prediction)

101789.11416907945

##### Predict price of 30m2 flat

In [146]:
price_30sqt_flat_manually = -36133.38605649513 + 9242.24942581*30
price_30sqt_flat_manually

241134.09671780484

In [147]:
price_30sqt_flat_model = slr_model.predict(X_test)
price_30sqt_flat_model

array([462948.08293727, 310450.9674114 , 666739.68277639, ...,
       416736.83580822, 679586.40947827, 428751.76006177])

In [148]:
slr_model.score(X_test, y_test)

0.606908750638334

In [149]:
slr_pred_error = mean_absolute_error(y_test, price_30sqt_flat_model)
slr_pred_error

101789.11416907945

### Root Mean Squared Error (RMSE)

In [150]:
mse = mean_squared_error(y_test, model.predict(X_test))
mse

34759012585.72996

In [151]:
np.sqrt(mse)

186437.69089357965

In [152]:
model.score(X_test,y_test)

0.606908750638334

### We use train data and test data , train data to train our machine and test data to see if it has learnt the data well or not.

### Gradient Boosting Regression

In [153]:
clf = gbr(n_estimators=650, max_depth=5, min_samples_split=2, learning_rate=0.1, loss='ls')

In [154]:
clf.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=5, n_estimators=650)

In [155]:
clf.score(X_test, y_test) # n_estimators=650

0.7678158518508678

In [156]:
gbr_pred = clf.predict(X_test)

In [157]:
gbr_pred_error = mean_absolute_error(y_test, gbr_pred)
gbr_pred_error

68813.31351688648

### MULTIPLE Gradient Boosting Regression

In [178]:
X_multi = df.drop(columns=['price']).values
y_multi = df.price.values

In [179]:
# Better here cut out random seed
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)

In [175]:
clf_multi = gbr(n_estimators=650, max_depth=5, min_samples_split=2, learning_rate=0.1, loss='ls')
clf.fit(X_train_multi, y_train_multi)
clf.score(X_test_multi, y_test_multi) # n_estimators=650

0.9887129370978401

In [177]:
gbr_multi_pred = clf.predict(X_test_multi)
gbr_multi_pred_error = mean_absolute_error(y_test_multi, gbr_multi_pred)
gbr_multi_pred_error

5250.472204731598

#### Multiple Linear Regression With scikit-learn

In [178]:
X_multi = df.drop(columns=['price']).values
y_multi = df.price.values

In [179]:
# Better here cut out random seed
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)

In [180]:
multi_model = LinearRegression().fit(X_train_multi, y_train_multi)

In [181]:
multi_model.score(X_test_multi, y_test_multi)

0.9115936269752613

In [182]:
mlr_pred = multi_model.predict(X_test_multi)

In [183]:
mlr_pred_error = mean_absolute_error(y_test_multi, mlr_pred)
mlr_pred_error

41404.69066278894

## Model selection

### Simple Linear Regression - We are on average wrong about 98761.93 PLN

In [184]:
np.mean(cross_val_score(slr_model, X_train, y_train, scoring='neg_mean_absolute_error'))

-98761.93234078004

### Gradient Boost Regression - We are on average wrong about 66942.85 PLN

In [185]:
np.mean(cross_val_score(clf, X_train, y_train, scoring='neg_mean_absolute_error'))

-66942.85408479504

In [209]:
np.mean(cross_val_score(clf_multi, X_train_multi, y_train_multi, scoring='neg_mean_absolute_error'))

-5840.001259406544

### Multiple Linear Regression - We are on average wrong about 40534.99 PLN

In [186]:
np.mean(cross_val_score(multi_model, X_train_multi, y_train_multi, scoring='neg_mean_absolute_error'))

-40534.98968979342

### LASSO - We are on average wrong about 98761.92 PLN

In [187]:
model_lasso = Lasso()
np.mean(cross_val_score(model_lasso, X_train, y_train, scoring='neg_mean_absolute_error'))

-98761.92349156333

In [188]:
model_lasso.fit(X_train, y_train)
model_lasso.score(X_test, y_test)

0.6069087241112526

In [189]:
la_pred = model_lasso.predict(X_test)

In [190]:
la_pred_error = mean_absolute_error(y_test, la_pred)
la_pred_error

101789.10790927647

### LASSO MULTIPLE REGRESSION

In [191]:
model_lasso_multi = Lasso()
model_lasso_multi.fit(X_train_multi, y_train_multi)
model_lasso_multi.score(X_test_multi,y_test_multi)

0.9115918015557223

In [192]:
lasso_multi_pred = model_lasso_multi.predict(X_test_multi)

In [193]:
lasso_multi_pred_error = mean_absolute_error(y_test_multi, lasso_multi_pred)
lasso_multi_pred_error

41405.13610895002

In [210]:
np.mean(cross_val_score(model_lasso_multi, X_train_multi, y_train_multi, scoring='neg_mean_absolute_error'))

-40532.407851512886

### Random Forest Regression - We are on average wrong about 63423.31 PLN

In [194]:
from sklearn.ensemble import RandomForestRegressor

In [195]:
rfr = RandomForestRegressor()
np.mean(cross_val_score(rfr, X_train, y_train, scoring='neg_mean_absolute_error'))

-63423.31236261614

In [196]:
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.7880573724043916

In [197]:
rfr_pred = rfr.predict(X_test)

In [198]:
rfr_pred_error = mean_absolute_error(y_test, rfr_pred)
rfr_pred_error

63695.80452467194

### RANDOM FOREST MULTIPLE REGRESSION

In [199]:
rfr_multi = RandomForestRegressor()
rfr_multi.fit(X_train_multi, y_train_multi)

RandomForestRegressor()

In [200]:
rfr_multi.score(X_test_multi, y_test_multi)

0.9821618422462398

In [201]:
rfr_multi_pred = rfr_multi.predict(X_test_multi)

In [202]:
rfr_multi_pred_error = mean_absolute_error(y_test_multi, rfr_multi_pred)
rfr_multi_pred_error

5795.196253955703

## BEST SCORE GIVES US MULTIPLE RANDOM FOREST REGRESSION - We are on average wrong about 5821.11 PLN

In [211]:
np.mean(cross_val_score(rfr_multi, X_train_multi, y_train_multi, scoring='neg_mean_absolute_error'))

-5821.112175216098

#### TUNE MISS FOREST REGRESSION

In [212]:
from sklearn.model_selection import GridSearchCV

In [221]:
parameters = {'n_estimators':range(10,100,10), 'criterion':['mse', 'mae'], 'max_features': ['auto', 'sqrt', 'log2']}

In [222]:
gs = GridSearchCV(rfr, parameters, scoring='neg_mean_absolute_error')

In [223]:
# find best parameters for model
gs.fit(X_train_multi, y_train_multi)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'criterion': ['mse', 'mae'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(10, 100, 10)},
             scoring='neg_mean_absolute_error')

In [224]:
gs.best_score_

-5890.30145239134

In [225]:
gs.best_estimator_

RandomForestRegressor(n_estimators=70)

In [230]:
rfr_gs_multi = RandomForestRegressor(criterion='mse', max_features='auto', n_estimators=70)
# np.mean(cross_val_score(rfr, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))
rfr_gs_multi.fit(X_train_multi, y_train_multi)

RandomForestRegressor(n_estimators=70)

In [231]:
rfr_gs_multi.score(X_test_multi, y_test_multi)

0.984595852905293

In [232]:
rfr_gs_multi_pred = rfr_gs_multi.predict(X_test_multi)

In [233]:
rfr_gs_multi_pred_error = mean_absolute_error(y_test_multi, rfr_gs_multi_pred)
rfr_gs_multi_pred_error

5492.5672786575105

In [229]:
gs.best_params_

{'criterion': 'mse', 'max_features': 'auto', 'n_estimators': 70}