In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('flats_wroclaw_final.csv')

In [3]:
df.corr()[['price']].sort_values(by='price', ascending=False)

Unnamed: 0,price
price,1.0
flat_surface_m2,0.763638
room_number,0.5244
price_per_m2,0.497794
floor_number,0.325845
floor,0.293427
prod_year,-0.095981


In [4]:
df.corr()[['prod_year']].sort_values(by='prod_year', ascending=False)
# high correlation with nothing

Unnamed: 0,prod_year
prod_year,1.0
room_number,0.013955
floor,-0.003564
floor_number,-0.012122
flat_surface_m2,-0.062396
price_per_m2,-0.07571
price,-0.095981


In [5]:
df.corr()[['floor']].sort_values(by='floor', ascending=False)
# high correlation with nothing

Unnamed: 0,floor
floor,1.0
floor_number,0.475856
price_per_m2,0.365945
price,0.293427
flat_surface_m2,0.08976
room_number,0.023528
prod_year,-0.003564


In [6]:
df.corr()[['floor_number']].sort_values(by='floor_number', ascending=False)
# acceptable correlation with price per m2

Unnamed: 0,floor_number
floor_number,1.0
price_per_m2,0.551983
floor,0.475856
price,0.325845
prod_year,-0.012122
flat_surface_m2,-0.019704
room_number,-0.104513


In [7]:
df.corr()[['room_number']].sort_values(by='room_number', ascending=False)
# high correlation with flat surface but it is obvious

Unnamed: 0,room_number
room_number,1.0
flat_surface_m2,0.792123
price,0.5244
floor,0.023528
prod_year,0.013955
floor_number,-0.104513
price_per_m2,-0.186876


In [8]:
df.corr()[['flat_surface_m2']].sort_values(by='flat_surface_m2', ascending=False)
# high correlation with room number but it is obvious

Unnamed: 0,flat_surface_m2
flat_surface_m2,1.0
room_number,0.792123
price,0.763638
floor,0.08976
floor_number,-0.019704
prod_year,-0.062396
price_per_m2,-0.075039


In [9]:
df.corr()[['price_per_m2']].sort_values(by='price_per_m2', ascending=False)
# acceptable correlation with floor_number

Unnamed: 0,price_per_m2
price_per_m2,1.0
floor_number,0.551983
price,0.497794
floor,0.365945
flat_surface_m2,-0.075039
prod_year,-0.07571
room_number,-0.186876


##### X (predicator) and y (regressor) 

In [10]:
predicator = 'flat_surface_m2'
regressor = 'price'
X = df[predicator].values.reshape(-1,1)
y = df[regressor]

##### Train and Test dataset

In [11]:
# the test set will be 20% of the whole data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = LinearRegression()
model = model.fit(X_train, y_train)

#### a1:
##### for an increase of 1 square meter in house size,
##### the house price will go up by ~9242.25 PLN , on average

In [13]:
model.coef_

array([9242.24942581])

#### a0:
##### the value of y when x=0

In [14]:
model.intercept_

-36133.38605649513

#### y^ = a1 * X + a0
##### y^ = 9242.25 * X - 4782.16

In [15]:
prediction = model.coef_ * X + model.intercept_
prediction

array([[ 406108.24896854],
       [1924147.71715791],
       [2061857.23360248],
       ...,
       [ 435498.60214261],
       [ 446312.03397081],
       [ 316920.54200946]])

##### Predict price of 30m2 flat

In [16]:
price_30sqt_flat_manually = -36133.38605649513 + 9242.24942581*30
price_30sqt_flat_manually

241134.09671780484

In [17]:
price_30sqt_flat_model = model.predict(X_train)
price_30sqt_flat_model

array([889015.78146713, 439195.50191294, 457310.31078753, ...,
       599086.41697946, 762212.11934501, 374592.17842652])

In [19]:
model.score(X_train, y_train)

0.5749472673707678

### Root Mean Squared Error (RMSE)

In [20]:
mse = mean_squared_error(y_test, model.predict(X_test))
mse

34759012585.72996

In [21]:
np.sqrt(mse)

186437.69089357965

In [22]:
model.score(X_test,y_test)

0.606908750638334

### We use train data and test data , train data to train our machine and test data to see if it has learnt the data well or not.

### Gradient Boosting Regression

In [25]:
from sklearn.ensemble import GradientBoostingRegressor as gbr

In [53]:
clf = gbr(n_estimators=650, max_depth=5, min_samples_split=2, learning_rate=0.1, loss='ls')

In [54]:
clf.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=5, n_estimators=650)

In [55]:
clf.score(X_test, y_test) # n_estimators=650

0.7678158518508678

#### Add some random seed

In [71]:
clf2 = gbr(n_estimators=650, max_depth=5, min_samples_split=2, learning_rate=0.1, loss='ls', random_state=10)

In [72]:
clf.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=5, n_estimators=650)

In [73]:
clf.score(X_test, y_test) # n_estimators=650

0.7678158518508678

#### Multiple Linear Regression With scikit-learn