In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import trimboth
import geopy
import geopy.distance

In [23]:
import plotly.offline as pyo
import chart_studio.plotly as chspy
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.express as px

# Initialise plotly's offline notebook mode (imported above from plotly.offline)
# so that figures can be rendered inside the notebook.
init_notebook_mode(connected=True)

In [9]:
# Load the flats dataset; the first CSV column becomes the row index.
flats = pd.read_csv(
    'final.csv',
    index_col=0,
)

делаю числом все, что может быть числом

In [10]:
# Distinct values found in the total_price column.
flats['total_price'].unique()

array([      nan, 22900000.,  3920000., ...,  4458628.,  1376595.,
        1976600.])

In [11]:
# Column labels of the dataset (DataFrame.keys() is the column index).
flats.columns

Index(['Building_year', 'Number_of_rooms', '_id', 'active', 'address',
       'bathroom_number', 'bathroom_separated', 'ceiling_type',
       'central_heating', 'cian_id', 'date_of_adding_to_db', 'date_of_place',
       'emergency_condition', 'house_type', 'housing_complex', 'id',
       'kitchen_area', 'latitude', 'living_area', 'longitude',
       'passengers_elevator_number', 'pic_urls', 'porch_num',
       'price_per_house_in_dst', 'price_per_house_in_dst_dynamics',
       'price_per_meter_in_dst', 'price_per_meter_in_dst_dynamics',
       'price_per_sq_meter', 'price_range', 'purchase_dynamics',
       'purchase_price', 'rent_dynamics', 'rent_dynamics_in_dst', 'rent_price',
       'rent_price_in_dst', 'room1_square', 'room2_square', 'room3_square',
       'seen_as_old', 'service_elevator_number', 'storey_number', 'total_area',
       'total_number_views', 'total_price', 'type_of_flat', 'visitors',
       'whole_storey_number', 'windows_to_street', 'сeiling_height',
       'New_bui

### Count the number of ads per district

In [15]:
# Keep only the Saint Petersburg listings. An explicit copy is taken because
# later cells add columns to `spb`; writing into a filtered slice of `flats`
# triggers pandas' SettingWithCopyWarning (hidden here by the global
# warnings filter).
spb = flats.loc[flats['city'] == 'Санкт-Петербург'].copy()

Введем расстояние от центра города в spb

In [16]:
def get_dist(x, center=(59.939095, 30.315868)):
    """Return the geodesic distance in kilometres from a listing to ``center``.

    Parameters
    ----------
    x : object with ``latitude`` and ``longitude`` attributes
        In this notebook, a DataFrame row passed by ``DataFrame.apply``.
    center : tuple of float, optional
        ``(latitude, longitude)`` of the reference point. The default is the
        point this notebook uses as the Saint Petersburg city centre.

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when either coordinate is missing.
    """
    # pd.isna also accepts None, where np.isnan would raise TypeError.
    if pd.isna(x.latitude) or pd.isna(x.longitude):
        return np.nan
    # geopy deprecated `vincenty` in 1.13 and removed it in 2.0; `geodesic`
    # is the documented replacement.
    return geopy.distance.geodesic((x.latitude, x.longitude), center).km

In [17]:
# Distance to the city centre, computed row by row with get_dist.
spb['distance'] = spb.apply(get_dist, axis='columns')

In [19]:
# District names looked up inside listing addresses (order is the match order).
spb_districts = [
    'Приморский',
    'Красносельский',
    'Петроградский',
    'Адмиралтейский',
    'Московский',
    'Калининский',
    'Курортный',
    'Центральный',
    'Василеостровский',
    'Фрунзенский',
    'Выборгский',
    'Невский',
    'Петродворцовый',
    'Красногвардейский',
    'Колпинский',
    'Кировский',
    'Пушкинский',
    'Кронштадтский',
]

In [20]:
def find_district(address, districts=None):
    """Return the first known district name that occurs in ``address``.

    Parameters
    ----------
    address : str
        Free-text listing address.
    districts : iterable of str, optional
        Candidate district names, checked in order. Defaults to the
        module-level ``spb_districts`` list.

    Returns
    -------
    str or None
        The matching name with surrounding whitespace stripped, or ``None``
        when ``address`` is not a string (e.g. NaN for a missing address) or
        contains none of the candidates.
    """
    # A missing address arrives as float NaN; `name in nan` would raise
    # TypeError, so any non-string is treated as "no district".
    if not isinstance(address, str):
        return None
    if districts is None:
        districts = spb_districts
    for name in districts:
        if name in address:
            return name.strip()
    return None
# Only the address column is needed, so map over that Series instead of
# applying a lambda to every full row of the frame.
spb['district'] = spb['address'].map(find_district)

In [22]:
# Number of non-null `id` values per district, smallest first.
(
    spb
    .groupby('district')['id']
    .count()
    .sort_values()
)

district
Кронштадтский          36
Петродворцовый        130
Колпинский            160
Курортный             316
Кировский             486
Красносельский        509
Пушкинский            523
Петроградский         617
Адмиралтейский        696
Красногвардейский     733
Фрунзенский           824
Калининский          1176
Василеостровский     1207
Невский              1213
Московский           1856
Центральный          1905
Выборгский           2053
Приморский           3186
Name: id, dtype: int64

## Statistical analysis

In [24]:
# Split the Saint Petersburg listings by room count. This notebook treats
# Number_of_rooms == 0.8 as the studio category.
studio_flat = spb[spb.Number_of_rooms == 0.8]
# Recompute studio area as total price / price per square metre. `assign`
# returns a new frame, so nothing is written into a slice of `spb`
# (the original attribute assignment was a chained assignment).
studio_flat = studio_flat.assign(
    total_area=studio_flat.total_price / studio_flat.price_per_sq_meter
)
# One subset per whole room count, 1 through 5.
one_flat, two_flat, three_flat, four_flat, five_flat = (
    spb[spb.Number_of_rooms == rooms] for rooms in range(1, 6)
)
flat_list = [studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat]

In [32]:
# Clean every room-count subset: keep the plotting columns, drop incomplete
# rows and implausibly cheap listings, then trim the tails.
new_flt = []
for flt in flat_list:
    flt = flt[['total_price', 'total_area', 'district', 'price_per_sq_meter',
               'distance', 'New_building']].dropna()
    # Build the price mask from the already-cleaned frame. The previous code
    # indexed the dropna() result with a mask computed on the un-cleaned
    # frame, which only worked because pandas silently reindexed the mask.
    flt = flt[flt['total_price'] > 100000]
    if flt.empty:
        # trimboth raises ValueError on empty input; keep the empty subset.
        new_flt.append(flt)
        continue
    # Values that survive cutting 10% off each end of the distribution.
    kept_prices = trimboth(flt['total_price'], 0.1)
    kept_areas = trimboth(flt['total_area'], 0.1)
    # NOTE(review): `|` keeps a row when EITHER its price or its area is inside
    # the trimmed range, so a row extreme in only one measure survives.
    # Confirm whether `&` was intended.
    flt = flt.loc[flt['total_price'].isin(kept_prices)
                  | flt['total_area'].isin(kept_areas)]
    new_flt.append(flt)
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = new_flt

In [33]:
# One-room flats: total price against total area, coloured by district.
px.scatter(
    x=one_flat['total_area'],
    y=one_flat['total_price'],
    color=one_flat['district'],
)

In [27]:
# One-room flats: price per square metre against distance, coloured by district.
px.scatter(
    x=one_flat['distance'],
    y=one_flat['price_per_sq_meter'],
    color=one_flat['district'],
)

In [28]:
# How many listings exist for each Number_of_rooms value in the full dataset.
flats['Number_of_rooms'].value_counts()

2.0    65178
1.0    64057
3.0    25857
0.8     8695
4.0     4070
5.0      644
Name: Number_of_rooms, dtype: int64

In [29]:
# District names ordered by how many listings each has (most frequent first).
lst = spb['district'].value_counts().index.tolist()

# Prices

## One room flats price box plot

In [26]:
# One box trace per district (in `lst` order) of one-room total prices.
data = [
    go.Box(
        y=one_flat.loc[one_flat['district'] == district, 'total_price'].dropna(),
        name=district,
    )
    for district in lst
]

In [27]:
# Title for the one-room price box plot (spelling of "districts" corrected).
layout = go.Layout(title="Box plot of one-room flat's price in different districts")

In [28]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [29]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/one_room_price_box_plot',
    auto_open=False,
)

'visualization/one_room_price_box_plot.html'

## Two rooms flats price box plot

In [30]:
# One box trace per district (in `lst` order) of two-room total prices.
data = [
    go.Box(
        y=two_flat.loc[two_flat['district'] == district, 'total_price'].dropna(),
        name=district,
    )
    for district in lst
]

In [31]:
# Title for the two-room price box plot. It previously said "one-room"
# (copied from the section above) and misspelled "districts".
layout = go.Layout(title="Box plot of two-room flat's price in different districts")

In [32]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [33]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/two_rooms_price_box_plot',
    auto_open=False,
)

'visualization/two_rooms_price_box_plot.html'

## Three rooms flats price box plot

In [34]:
# One box trace per district (in `lst` order) of three-room total prices.
data = [
    go.Box(
        y=three_flat.loc[three_flat['district'] == district, 'total_price'].dropna(),
        name=district,
    )
    for district in lst
]

In [35]:
# Title for the three-room price box plot (spelling of "districts" corrected).
layout = go.Layout(title="Box plot of three-room flat's price in different districts")

In [36]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [37]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/three_rooms_price_box_plot',
    auto_open=False,
)

'visualization/three_rooms_price_box_plot.html'

## Four rooms flats price box plot

In [38]:
# One box trace per district (in `lst` order) of four-room total prices.
data = [
    go.Box(
        y=four_flat.loc[four_flat['district'] == district, 'total_price'].dropna(),
        name=district,
    )
    for district in lst
]

In [39]:
# Title for the four-room price box plot (spelling of "districts" corrected).
layout = go.Layout(title="Box plot of four-room flat's price in different districts")

In [40]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [41]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/four_rooms_price_box_plot',
    auto_open=False,
)

'visualization/four_rooms_price_box_plot.html'

## Five rooms flats price box plot

In [42]:
# One box trace per district (in `lst` order) of five-room total prices.
data = [
    go.Box(
        y=five_flat.loc[five_flat['district'] == district, 'total_price'].dropna(),
        name=district,
    )
    for district in lst
]

In [43]:
# Title for the five-room price box plot (spelling of "districts" corrected).
layout = go.Layout(title="Box plot of five-room flat's price in different districts")

In [44]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [45]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/five_rooms_price_box_plot',
    auto_open=False,
)

'visualization/five_rooms_price_box_plot.html'

# Prices per square meter

## One room flats price per meter box plot

In [46]:
# One box trace per district (in `lst` order) of one-room price per square metre.
data = [
    go.Box(
        y=one_flat.loc[one_flat['district'] == district, 'price_per_sq_meter'].dropna(),
        name=district,
    )
    for district in lst
]

In [47]:
# Title for the one-room price-per-meter box plot ("districts" spelling corrected).
layout = go.Layout(title="Box plot of one-room flat's price per meter in different districts")

In [48]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [49]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/one_room_price_per_meter_box_plot',
    auto_open=False,
)

'visualization/one_room_price_per_meter_box_plot.html'

## Two rooms flats price per meter box plot

In [50]:
# One box trace per district (in `lst` order) of two-room price per square metre.
data = [
    go.Box(
        y=two_flat.loc[two_flat['district'] == district, 'price_per_sq_meter'].dropna(),
        name=district,
    )
    for district in lst
]

In [51]:
# Title for the two-room price-per-meter box plot. It previously said
# "one-room" (copied from the section above) and misspelled "districts".
layout = go.Layout(title="Box plot of two-room flat's price per meter in different districts")

In [52]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [53]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/two_rooms_price_per_meter_box_plot',
    auto_open=False,
)

'visualization/two_rooms_price_per_meter_box_plot.html'

## Three rooms flats price per meter box plot

In [54]:
# One box trace per district (in `lst` order) of three-room price per square metre.
data = [
    go.Box(
        y=three_flat.loc[three_flat['district'] == district, 'price_per_sq_meter'].dropna(),
        name=district,
    )
    for district in lst
]

In [55]:
# Title for the three-room price-per-meter box plot ("districts" spelling corrected).
layout = go.Layout(title="Box plot of three-room flat's price per meter in different districts")

In [56]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [57]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/three_rooms_price_per_meter_box_plot',
    auto_open=False,
)

'visualization/three_rooms_price_per_meter_box_plot.html'

## Four rooms flats price per meter box plot

In [58]:
# One box trace per district (in `lst` order) of four-room price per square metre.
data = [
    go.Box(
        y=four_flat.loc[four_flat['district'] == district, 'price_per_sq_meter'].dropna(),
        name=district,
    )
    for district in lst
]

In [59]:
# Title for the four-room price-per-meter box plot ("districts" spelling corrected).
layout = go.Layout(title="Box plot of four-room flat's price per meter in different districts")

In [60]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [61]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/four_rooms_price_per_meter_box_plot',
    auto_open=False,
)

'visualization/four_rooms_price_per_meter_box_plot.html'

## Five rooms flats price per meter box plot

In [62]:
# One box trace per district (in `lst` order) of five-room price per square metre.
data = [
    go.Box(
        y=five_flat.loc[five_flat['district'] == district, 'price_per_sq_meter'].dropna(),
        name=district,
    )
    for district in lst
]

In [63]:
# Title for the five-room price-per-meter box plot ("districts" spelling corrected).
layout = go.Layout(title="Box plot of five-room flat's price per meter in different districts")

In [64]:
# Combine the box traces and the layout into one figure.
fig = go.Figure(
    data=data,
    layout=layout,
)

In [65]:
# Save the figure as an HTML file; the returned path shows the '.html' suffix.
pyo.plot(
    fig,
    filename='visualization/five_rooms_price_per_meter_box_plot',
    auto_open=False,
)

'visualization/five_rooms_price_per_meter_box_plot.html'

In [34]:
# One-room total prices in the Приморский district, coloured by New_building.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Приморский"],
    x="total_price",
    nbins=20,
    color="New_building",
    opacity=0.8,
)
fig.show()

In [35]:
# One-room total prices in the Выборгский district, coloured by New_building.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Выборгский"],
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()

In [36]:
# One-room total prices in the Центральный district, coloured by New_building.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Центральный"],
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()

In [37]:
# One-room total prices in the Московский district, coloured by New_building.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Московский"],
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()

In [38]:
# One-room total prices in the Московский district, coloured by New_building.
# NOTE(review): this repeats the Московский histogram from the preceding cell;
# confirm whether another district was intended here.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Московский"],
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()

In [41]:
# One-room total prices in the Московский district, coloured by New_building.
# NOTE(review): this repeats the Московский histogram from the preceding cells;
# confirm whether another district was intended here.
fig = px.histogram(
    one_flat.loc[one_flat['district'] == "Московский"],
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()