In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import seaborn as sns

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("houses.csv")
df

Unnamed: 0,title,house_type,price,size,construction_costs
0,Dom w papawerach 2,"jednorodzinny z poddaszem, z garażem jednostan...",3930,128,331800
1,Domek letniskowy w krokusach 4 (A),Domek letniskowy parterowy,2300,25,74400
2,Domek letniskowy w krokusach 5,Domek letniskowy parterowy,2300,25,83500
3,Domek letniskowy w krokusach 4,Domek letniskowy parterowy,2300,36,95300
4,Domek letniskowy pod laskiem,Domek letniskowy parterowy,2300,38,96800
...,...,...,...,...,...
731,Dom w firletkach,"jednorodzinny z poddaszem, z garażem jednostan...",3930,137,278600
732,Dom w zdrojówkach 12,"jednorodzinny z poddaszem, z garażem jednostan...",3930,129,278700
733,Dom w renklodach 26 (G),"jednorodzinny parterowy, z garażem jednostanow...",3600,140,279000
734,Dom w balsamowcach (A),"jednorodzinny z poddaszem, z garażem jednostan...",3930,124,279200


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               736 non-null    object
 1   house_type          736 non-null    object
 2   price               736 non-null    int64 
 3   size                736 non-null    int64 
 4   construction_costs  736 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 28.9+ KB


In [5]:
df["minmax"] = (df.price-df.price.min()) / (df.price.max()-df.price.min())
df["z"] = (df.price - df.price.mean()) / df.price.std()

In [6]:
bin_names = ["cheap", "medium", "high", "luxury"]
df["category"] = pd.cut(df.minmax, 
                           [0, .25, .5, .75, 1],
                           labels=bin_names)

In [7]:
df

Unnamed: 0,title,house_type,price,size,construction_costs,minmax,z,category
0,Dom w papawerach 2,"jednorodzinny z poddaszem, z garażem jednostan...",3930,128,331800,0.905556,1.350995,luxury
1,Domek letniskowy w krokusach 4 (A),Domek letniskowy parterowy,2300,25,74400,0.000000,-3.282798,
2,Domek letniskowy w krokusach 5,Domek letniskowy parterowy,2300,25,83500,0.000000,-3.282798,
3,Domek letniskowy w krokusach 4,Domek letniskowy parterowy,2300,36,95300,0.000000,-3.282798,
4,Domek letniskowy pod laskiem,Domek letniskowy parterowy,2300,38,96800,0.000000,-3.282798,
...,...,...,...,...,...,...,...,...
731,Dom w firletkach,"jednorodzinny z poddaszem, z garażem jednostan...",3930,137,278600,0.905556,1.350995,luxury
732,Dom w zdrojówkach 12,"jednorodzinny z poddaszem, z garażem jednostan...",3930,129,278700,0.905556,1.350995,luxury
733,Dom w renklodach 26 (G),"jednorodzinny parterowy, z garażem jednostanow...",3600,140,279000,0.722222,0.412865,high
734,Dom w balsamowcach (A),"jednorodzinny z poddaszem, z garażem jednostan...",3930,124,279200,0.905556,1.350995,luxury


In [9]:
#Advanced Scaling

from sklearn import preprocessing

out = preprocessing.MinMaxScaler().fit_transform(df[["price"]])

In [13]:
np.squeeze(out) == df.minmax

0      False
1       True
2       True
3       True
4       True
       ...  
731    False
732    False
733    False
734    False
735    False
Name: minmax, Length: 736, dtype: bool

In [14]:
print(out[0], df.minmax[0])

[0.90555556] 0.9055555555555556


In [15]:
np.allclose(np.squeeze(out), df.minmax)

True

In [16]:
preprocessing.StandardScaler().fit_transform(df[["price"]])

array([[ 1.35191404],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-1.74886509],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-0.89543965],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-0.89543965],
       [-1.74886509],
       [-1.74886509],
       [-1.12301977],
       [-1.74886509],
       [-1.12301977],
       [-1.74886509],
       [-1.12301977],
       [-1.74886509],
       [-1.12301977],
       [ 1.35191404],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-1.74886509],
       [-3.2850309 ],
       [-3.2850309 ],
       [-3.2850309 ],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1.74886509],
       [-1

In [17]:
preprocessing.RobustScaler().fit_transform(df[["price"]])

array([[ 1.43478261],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-3.30434783],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-2.        ],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-2.        ],
       [-3.30434783],
       [-3.30434783],
       [-2.34782609],
       [-3.30434783],
       [-2.34782609],
       [-3.30434783],
       [-2.34782609],
       [-3.30434783],
       [-2.34782609],
       [ 1.43478261],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-3.30434783],
       [-5.65217391],
       [-5.65217391],
       [-5.65217391],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3.30434783],
       [-3

In [18]:
#For problems with pandas_profiling

#pip install markupsafe==2.0.1
#pip3 install --force-reinstall MarkupSafe==2.0.1

In [None]:
#EDA

from pandas_profiling import ProfileReport

profile = ProfileReport(df)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
#Descriptive statistics

df.describe()

Unnamed: 0,price,size,construction_costs
count,736.0,736.0,736.0
mean,3454.769022,100.980978,214433.695652
std,351.763605,26.656335,49710.104153
min,2300.0,25.0,74400.0
25%,3370.0,89.0,181700.0
50%,3600.0,104.0,227400.0
75%,3600.0,119.0,254800.0
max,4100.0,163.0,331800.0


In [18]:
#Correlation

df_cut.corr()

Unnamed: 0,price,size,construction_costs
price,1.0,0.795186,0.869839
size,0.795186,1.0,0.855587
construction_costs,0.869839,0.855587,1.0


In [19]:
df_cut

Unnamed: 0,price,size,construction_costs
0,3930,128,331800
1,2300,25,74400
2,2300,25,83500
3,2300,36,95300
4,2300,38,96800
...,...,...,...
731,3930,137,278600
732,3930,129,278700
733,3600,140,279000
734,3930,124,279200


In [32]:
#Preparing training data

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df[["price","size"]], df.construction_costs, test_size=.5)

In [33]:
x_train.shape

(368, 2)

In [34]:
x_test.shape

(368, 2)

In [35]:
#Building the model

model = LinearRegression()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.8336884715555155

In [25]:
#Improving the model

from sklearn import preprocessing

x_val, x_test, y_val, y_test = train_test_split(x_test, y_test)
x_test.shape

(92, 3)

In [26]:
scaler = preprocessing.StandardScaler()
model = LinearRegression()

scaler.fit(x_train)

StandardScaler()

In [27]:
x_scaled = scaler.transform(x_train)
x_scaled

array([[-0.28039062,  0.0733107 ,  0.29450485],
       [-0.93852323, -2.28075488, -1.91082166],
       [ 0.37774199,  0.82063311,  0.40213129],
       ...,
       [-1.79695707, -2.46758548, -2.28040676],
       [-1.79695707, -1.90709367, -1.58388099],
       [ 0.37774199,  0.93273147,  1.11287187]])

In [28]:
model.fit(x_scaled, y_train)

LinearRegression()

In [29]:
model.score(scaler.transform(x_val), y_val)

1.0

In [30]:
scaler = preprocessing.MinMaxScaler().fit(x_train)
model = LinearRegression().fit(scaler.transform(x_train), y_train)
model.score(scaler.transform(x_val), y_val)

1.0