In [1]:
import ast
import os
import warnings
warnings.filterwarnings('ignore')

import geopy
import geopy.distance
import numpy as np
import pandas as pd
from scipy.stats import trimboth
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm.pandas(desc="Example Desc")

In [2]:
import plotly.offline as pyo
import chart_studio.plotly as chspy
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.express as px

init_notebook_mode(connected=True)

In [3]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import csv

In [4]:
# Load the listings table from final.csv, using the first CSV column as the index.
flats = pd.read_csv('final.csv', index_col=0)

### Count the number of ads in the region

In [5]:
# Keep only sale listings whose city is St. Petersburg or the Leningrad region.
# .copy() detaches the result from `flats`: a `distance` column is assigned to
# `spb` later, and that should write to an independent frame, not to a slice.
spb = flats[
    (flats.trade_type == 'sale')
    & flats.city.isin(['Санкт-Петербург', 'Ленинградская область'])
].copy()

In [6]:
# Count non-null `_id` values per trade type, sorted ascending.
spb.groupby('trade_type')['_id'].count().sort_values()

trade_type
sale    11769
Name: _id, dtype: int64

Введем расстояние от центра города в spb

In [7]:
def get_dist(x, center=(59.939095, 30.315868)):
    """Return the geodesic distance in kilometres from a listing to ``center``.

    Parameters
    ----------
    x : object with ``latitude`` and ``longitude`` attributes
        Typically a DataFrame row supplied by ``apply(..., axis=1)``.
    center : tuple of float, optional
        ``(latitude, longitude)`` of the reference point. The default is the
        point this notebook uses as the city centre.

    Returns
    -------
    float
        Distance in kilometres, or ``np.nan`` when either coordinate is missing.
    """
    # pd.isna also treats None as missing; np.isnan(None) would raise TypeError.
    if pd.isna(x.latitude) or pd.isna(x.longitude):
        return np.nan
    # geopy.distance.vincenty was removed in geopy 2.0; geodesic replaces it.
    return geopy.distance.geodesic((x.latitude, x.longitude), center).km

In [8]:
# Apply get_dist to every row (axis=1) with a tqdm progress bar and store the result as `distance`.
spb['distance'] = spb.progress_apply(get_dist, axis=1)

Example Desc: 100%|██████████████████████████████████████████████████████████████████████████████| 11769/11769 [00:01<00:00, 5920.78it/s]


In [9]:
# Keep listings closer than 50 km to the reference point; rows with a missing
# distance fail the comparison and are dropped as well.
spb = spb.loc[spb['distance'].lt(50)]

In [10]:
# Count non-null `id` values per New_building flag, sorted ascending.
spb.groupby('New_building')['id'].count().sort_values()

New_building
1.0    1429
0.0    5240
Name: id, dtype: int64

In [11]:
# Coordinates and price per square metre only.
spb_cluster = spb.loc[:, ['longitude', 'latitude', 'price_per_sq_meter']]

In [12]:
# from IPython.display import Image
# from IPython.core.display import HTML , display
# x = flats[flats.cian_id==228193926].pic_urls.values[0]#[247768]
# for i in ast.literal_eval(x):
#     display(Image(url= i))

## Statistical analysis

In [13]:
# Input for clustering: sale listings with price per square metre, both
# coordinates and `_id` all present (rows with any missing value are dropped).
clustering_data = (
    spb.loc[spb['trade_type'] == 'sale',
            ['price_per_sq_meter', 'latitude', 'longitude', '_id']]
    .dropna()
)

In [14]:
# Preview the first rows of the clustering input.
clustering_data.head()

Unnamed: 0,price_per_sq_meter,latitude,longitude,_id
2971,106154.0,60.259951,30.235713,ObjectId(5e579c1b5aaf06a3bbfabfa0)
3386,115713.0,60.023989,30.235693,ObjectId(5e57a1a4089b1f8c93fac038)
3389,78661.0,59.966607,30.480724,ObjectId(5e57a1a9550a1f3209fac030)
3777,100000.0,59.950467,30.573692,ObjectId(5e57a82b495d4f5d65fc7d86)
5351,31475.0,59.987677,30.333652,ObjectId(5e57bf52eca493834fe53abc)


In [15]:
# Standardize each feature to zero mean and unit standard deviation.
# The price z-score is additionally divided by 3; the coordinates are not.
for column, divisor in (('price_per_sq_meter', 3), ('latitude', 1), ('longitude', 1)):
    values = clustering_data[column]
    clustering_data[column] = (values - values.mean()) / values.std() / divisor

In [16]:
# Run DBSCAN on the three scaled features and store the label of each row.
cluster_features = clustering_data[['price_per_sq_meter', 'latitude', 'longitude']]
dbscan = DBSCAN(eps=0.1, min_samples=5)
clustering_data['cluster_label'] = dbscan.fit_predict(cluster_features)

In [17]:
# Preview the scaled features together with the assigned cluster labels.
clustering_data.head()

Unnamed: 0,price_per_sq_meter,latitude,longitude,_id,cluster_label
2971,-0.126315,3.139996,-0.432799,ObjectId(5e579c1b5aaf06a3bbfabfa0),-1
3386,-0.085607,0.892597,-0.432908,ObjectId(5e57a1a4089b1f8c93fac038),0
3389,-0.243396,0.346071,0.913827,ObjectId(5e57a1a9550a1f3209fac030),1
3777,-0.152522,0.192345,1.424795,ObjectId(5e57a82b495d4f5d65fc7d86),2
5351,-0.44434,0.546746,0.105491,ObjectId(5e57bf52eca493834fe53abc),-1


In [18]:
# Reduce the clustering frame to the key and its label, then attach the label
# to the listings by matching on `_id`.
clustering_data = clustering_data.loc[:, ['_id', 'cluster_label']]
spb = spb.merge(clustering_data, on='_id')

In [19]:
# Cluster sizes (non-null `_id` per label), largest first.
clustering_data.groupby('cluster_label')['_id'].count().sort_values(ascending=False)

cluster_label
 4     1632
 0     1348
-1      636
 7      477
 3      366
       ... 
 48       5
 86       5
 74       5
 75       5
 79       5
Name: _id, Length: 94, dtype: int64

In [20]:
# Convert every cluster label to its string form (with a tqdm progress bar).
spb['cluster_label'] = spb['cluster_label'].progress_apply(str)

Example Desc: 100%|██████████████████████████████████████████████████████████████████████████████| 6669/6669 [00:00<00:00, 318400.63it/s]


In [21]:
# Distinct cluster labels in order of first appearance.
lst = spb['cluster_label'].unique()

In [22]:
# Show the array of distinct cluster labels.
lst

array(['-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '23', '14', '15', '16', '17', '18', '19', '20', '21',
       '22', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55',
       '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
       '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77',
       '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88',
       '89', '90', '91', '92'], dtype=object)

In [23]:
# Inspect the New_building column of the merged frame.
spb.New_building

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
6664    0.0
6665    0.0
6666    0.0
6667    1.0
6668    1.0
Name: New_building, Length: 6669, dtype: float64

Visualization

In [24]:
# Layout for the Mapbox figure: dark theme, no margins, centred on the same
# point that get_dist uses as the city centre.
layout = dict(
    height=800,
    # top, bottom, left and right margins
    margin=dict(t=0, b=0, l=0, r=0),
    font=dict(color='#FFFFFF', size=11),
    paper_bgcolor='#000000',
    mapbox=dict(
        # The Mapbox token is read from the MAPBOX_ACCESS_TOKEN environment
        # variable instead of being committed with the notebook; export it
        # before starting the kernel.
        accesstoken=os.environ.get('MAPBOX_ACCESS_TOKEN'),
        bearing=0,
        # where the map is centred
        center=dict(
            lat=59.939095,
            lon=30.315868,
        ),
        # keep the map "parallel" to the screen, with no tilt
        pitch=0,
        # default zoom level
        zoom=8,
        # default map style
        style='dark',
    ),
)

In [25]:
# Split the listings by the New_building flag: rows equal to 1.0 go to
# spb_new, rows equal to 0.0 go to spb_old.
spb_new, spb_old = (spb[spb['New_building'] == flag] for flag in (1.0, 0.0))

In [26]:
# Plotly's Dark24 qualitative colour sequence.
# NOTE(review): color_list is not referenced in the cells visible here — confirm it is still needed.
color_list = px.colors.qualitative.Dark24

In [27]:
# Start with an empty list of plot traces.
data = []

In [28]:
# Build one Scattermapbox trace per cluster label for spb_old (circle markers)
# and one for spb_new (square markers, Viridis colour scale with a colour bar).
# Marker colour encodes price per square metre; customdata carries total price.
data = []
data_old = []
data_new = []
for label in lst:
    # Filter each frame once per cluster instead of once per trace attribute.
    old_cluster = spb_old[spb_old["cluster_label"] == label]
    new_cluster = spb_new[spb_new["cluster_label"] == label]

    data_old.append(go.Scattermapbox(
        lat=old_cluster['latitude'],
        lon=old_cluster['longitude'],
        customdata=old_cluster['total_price'],
        mode='markers',
        marker=dict(
            size=4,
            color=old_cluster['price_per_sq_meter'].astype(float),
            opacity=.8,
            symbol='circle',
        ),
        text=list(old_cluster.Number_of_rooms),
    ))

    data_new.append(go.Scattermapbox(
        lat=new_cluster['latitude'],
        lon=new_cluster['longitude'],
        customdata=new_cluster['total_price'],
        mode='markers',
        marker=dict(
            size=4,
            opacity=.8,
            reversescale=True,
            autocolorscale=False,
            symbol='square',
            color=new_cluster['price_per_sq_meter'].astype(float),
            colorbar=dict(title="Colorbar"),
            colorscale="Viridis",
        ),
    ))

# All spb_old traces first, then all spb_new traces.
data.extend(data_old)
data.extend(data_new)

In [29]:
# Append one box trace per cluster with the price-per-square-metre values of
# the spb_new listings in that cluster.
for label in lst:
    prices = spb_new.loc[spb_new.cluster_label == label, 'price_per_sq_meter'].dropna()
    # Name the box after the cluster label itself. The positional index used
    # before did not match the labels: lst starts with '-1' and is not sorted.
    data.append(go.Box(y=prices, name=str(label)))

In [30]:
# Bundle the traces and the layout into a figure dict and render it inline.
fig = {'data': data, 'layout': layout}
iplot(fig)

In [31]:
# Map of all listings coloured by cluster label.
# The Mapbox token is read from the MAPBOX_ACCESS_TOKEN environment variable
# instead of being committed with the notebook.
px.set_mapbox_access_token(os.environ.get('MAPBOX_ACCESS_TOKEN'))
fig = px.scatter_mapbox(
    spb,
    lat="latitude",
    lon="longitude",
    color="cluster_label",
    size_max=10,
    zoom=9,
    mapbox_style='carto-darkmatter',
)
fig.show()

In [32]:
# One frame per Number_of_rooms value; the value 0.8 selects the rows that
# end up in studio_flat.
flat_list = [spb[spb['Number_of_rooms'] == rooms] for rooms in (0.8, 1, 2, 3, 4, 5)]
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = flat_list

In [35]:
# Clean every per-room-count frame: keep sale listings with all analysis
# columns present, then trim price and area outliers.
analysis_columns = ['total_price', 'total_area', 'price_per_sq_meter',
                    'New_building', 'trade_type', 'cluster_label', 'distance']
new_flt = []
for flt in flat_list:
    # Filter on trade_type before dropna so the mask and the frame share one
    # index; the previous form indexed the dropna() result with a mask built
    # on the longer, un-dropped frame.
    flt = flt.loc[flt['trade_type'] == 'sale', analysis_columns].dropna()
    # trimboth cuts the lowest and highest 10% of the values; keep the rows
    # whose price and area both occur among the values that remain.
    keep_price = flt['total_price'].isin(trimboth(flt['total_price'], 0.1))
    keep_area = flt['total_area'].isin(trimboth(flt['total_area'], 0.1))
    new_flt.append(flt.loc[keep_price & keep_area])
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = new_flt

In [36]:
# Column labels of the cleaned studio frame.
studio_flat.columns

Index(['total_price', 'total_area', 'price_per_sq_meter', 'New_building',
       'trade_type', 'cluster_label', 'distance'],
      dtype='object')

In [37]:
# One-room flats: price per square metre against distance, coloured by cluster label.
px.scatter(
    x=one_flat['distance'],
    y=one_flat['price_per_sq_meter'],
    color=one_flat['cluster_label'],
)

In [38]:
# Two-room flats: price per square metre against distance, coloured by cluster label.
px.scatter(
    x=two_flat['distance'],
    y=two_flat['price_per_sq_meter'],
    color=two_flat['cluster_label'],
)

# Price per square meter

## One room flats price per meter box plot

In [40]:
# Build one Scattermapbox trace per cluster label for spb_old (circle markers)
# and one for spb_new (square markers). Marker colour encodes price per square
# metre; customdata carries total price and text the number of rooms.
# NOTE(review): the one-room box traces are later appended to this same `data`
# list, so the saved box-plot figure also contains these map traces — confirm
# that is intended (the other room-count sections start from an empty list).
data = []
data_old = []
data_new = []
for label in lst:
    # Filter each frame once per cluster instead of once per trace attribute.
    old_cluster = spb_old[spb_old["cluster_label"] == label]
    new_cluster = spb_new[spb_new["cluster_label"] == label]

    data_old.append(go.Scattermapbox(
        lat=old_cluster['latitude'],
        lon=old_cluster['longitude'],
        customdata=old_cluster['total_price'],
        mode='markers',
        marker=dict(
            size=4,
            color=old_cluster['price_per_sq_meter'].astype(float),
            opacity=.8,
            symbol='circle',
        ),
        text=list(old_cluster.Number_of_rooms),
    ))

    data_new.append(go.Scattermapbox(
        lat=new_cluster['latitude'],
        lon=new_cluster['longitude'],
        customdata=new_cluster['total_price'],
        mode='markers',
        marker=dict(
            size=4,
            opacity=.8,
            reversescale=True,
            autocolorscale=False,
            symbol='square',
            color=new_cluster['price_per_sq_meter'].astype(float),
        ),
        text=list(new_cluster.Number_of_rooms),
    ))

# All spb_old traces first, then all spb_new traces.
data.extend(data_old)
data.extend(data_new)

In [41]:
# Append one box trace per cluster with the price-per-square-metre values of
# the one-room flats in that cluster.
for label in lst:
    prices = one_flat.loc[one_flat.cluster_label == label, 'price_per_sq_meter'].dropna()
    # Name the box after the cluster label itself. The positional index used
    # before did not match the labels: lst starts with '-1' and is not sorted.
    data.append(go.Box(y=prices, name=str(label)))

In [42]:
# Title-only layout for the one-room figure. This rebinds `layout`, which
# previously held the Mapbox layout dict.
layout = go.Layout(title="Box plot of one-room flat's price per meter in different clusters")

In [43]:
# Combine the traces collected in `data` with `layout` into a figure.
fig = go.Figure(data=data, layout=layout)

In [44]:
# Write the figure to an HTML file under visualization/ without opening a browser.
pyo.plot(fig, filename='visualization/one_room_price_per_meter_box_plot', auto_open=False)

'visualization/one_room_price_per_meter_box_plot.html'

## Two rooms flats price per meter box plot

In [None]:
# One box trace per cluster with the price-per-square-metre values of the
# two-room flats in that cluster.
data = []
for label in lst:
    # cluster_label holds strings, so compare with the label itself; comparing
    # with the integer loop index never matches and leaves every box empty.
    prices = two_flat.loc[two_flat.cluster_label == label, 'price_per_sq_meter'].dropna()
    data.append(go.Box(y=prices, name=str(label)))

In [None]:
# Title-only layout for the two-room figure (the title previously said "one-room").
layout = go.Layout(title="Box plot of two-room flat's price per meter in different clusters")

In [None]:
# Combine the traces collected in `data` with `layout` into a figure.
fig = go.Figure(data=data, layout=layout)

In [None]:
# Write the figure to a file under visualization/ without opening a browser.
pyo.plot(fig, filename='visualization/two_rooms_price_per_meter_box_plot', auto_open=False)

## Three rooms flats price per meter box plot

In [None]:
# One box trace per cluster with the price-per-square-metre values of the
# three-room flats in that cluster.
data = []
for label in lst:
    # cluster_label holds strings, so compare with the label itself; comparing
    # with the integer loop index never matches and leaves every box empty.
    prices = three_flat.loc[three_flat.cluster_label == label, 'price_per_sq_meter'].dropna()
    data.append(go.Box(y=prices, name=str(label)))

In [None]:
# Title-only layout for the three-room figure.
layout = go.Layout(title="Box plot of three-room flat's price per meter in different clusters")

In [None]:
# Combine the traces collected in `data` with `layout` into a figure.
fig = go.Figure(data=data, layout=layout)

In [None]:
# Write the figure to a file under visualization/ without opening a browser.
pyo.plot(fig, filename='visualization/three_rooms_price_per_meter_box_plot', auto_open=False)

## Four rooms flats price per meter box plot

In [None]:
# One box trace per cluster with the price-per-square-metre values of the
# four-room flats in that cluster.
data = []
for label in lst:
    # cluster_label holds strings, so compare with the label itself; comparing
    # with the integer loop index never matches and leaves every box empty.
    prices = four_flat.loc[four_flat.cluster_label == label, 'price_per_sq_meter'].dropna()
    data.append(go.Box(y=prices, name=str(label)))

In [None]:
# Title-only layout for the four-room figure.
layout = go.Layout(title="Box plot of four-room flat's price per meter in different clusters")

In [None]:
# Combine the traces collected in `data` with `layout` into a figure.
fig = go.Figure(data=data, layout=layout)

In [None]:
# Write the figure to a file under visualization/ without opening a browser.
pyo.plot(fig, filename='visualization/four_rooms_price_per_meter_box_plot', auto_open=False)

## Five rooms flats price per meter box plot

In [None]:
# One box trace per cluster with the price-per-square-metre values of the
# five-room flats in that cluster.
data = []
for label in lst:
    # cluster_label holds strings, so compare with the label itself; comparing
    # with the integer loop index never matches and leaves every box empty.
    prices = five_flat.loc[five_flat.cluster_label == label, 'price_per_sq_meter'].dropna()
    data.append(go.Box(y=prices, name=str(label)))

In [None]:
# Title-only layout for the five-room figure.
layout = go.Layout(title="Box plot of five-room flat's price per meter in different clusters")

In [None]:
# Combine the traces collected in `data` with `layout` into a figure.
fig = go.Figure(data=data, layout=layout)

In [None]:
# Write the figure to a file under visualization/ without opening a browser.
pyo.plot(fig, filename='visualization/five_rooms_price_per_meter_box_plot', auto_open=False)

In [None]:
# Total-price histograms of one-room flats for two districts, coloured by the
# New_building flag.
# The first cleaning cell builds one_flat without a `district` column, so on a
# top-to-bottom run `one_flat.district` does not exist. Take the column from
# `spb` by index when it is missing (one_flat rows keep their spb index labels).
if 'district' in one_flat.columns:
    one_flat_district = one_flat
else:
    one_flat_district = one_flat.join(spb['district'])

fig = px.histogram(
    one_flat_district[one_flat_district['district'] == "Приморский"],
    x="total_price", nbins=20, color="New_building", opacity=0.8,
)
fig.show()

fig = px.histogram(
    one_flat_district[one_flat_district['district'] == "Выборгский"],
    x="total_price", nbins=20, color="New_building",
)
fig.show()

In [None]:
# Split the listings by Number_of_rooms again; the value 0.8 selects the rows
# stored in studio_flat.
studio_flat = spb[spb.Number_of_rooms == 0.8]
# Recompute total_area for studios as total price / price per square metre.
# assign() returns a new frame, so nothing is written into a slice of `spb`
# through attribute assignment.
studio_flat = studio_flat.assign(
    total_area=studio_flat.total_price / studio_flat.price_per_sq_meter
)
one_flat = spb[spb.Number_of_rooms == 1]
two_flat = spb[spb.Number_of_rooms == 2]
three_flat = spb[spb.Number_of_rooms == 3]
four_flat = spb[spb.Number_of_rooms == 4]
five_flat = spb[spb.Number_of_rooms == 5]
flat_list = [studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat]

In [None]:
# Clean every per-room-count frame, this time keeping `district`: sale
# listings with all analysis columns present, then trim price and area outliers.
analysis_columns = ['total_price', 'total_area', 'district',
                    'price_per_sq_meter', 'New_building', 'trade_type']
new_flt = []
for flt in flat_list:
    # Filter on trade_type before dropna so the mask and the frame share one
    # index; the previous form indexed the dropna() result with a mask built
    # on the longer, un-dropped frame.
    flt = flt.loc[flt['trade_type'] == 'sale', analysis_columns].dropna()
    # trimboth cuts the lowest and highest 10% of the values; keep the rows
    # whose price and area both occur among the values that remain.
    keep_price = flt['total_price'].isin(trimboth(flt['total_price'], 0.1))
    keep_area = flt['total_area'].isin(trimboth(flt['total_area'], 0.1))
    new_flt.append(flt.loc[keep_price & keep_area])
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = new_flt