In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import trimboth
import geopy
import geopy.distance

In [2]:
import plotly.offline as pyo
import plotly.graph_objs as go
import chart_studio.plotly as chspy

In [3]:
flats = pd.read_csv('database.csv',sep=',',dtype='unicode')

In [4]:
flats.drop(['kitchen_area', 'living_area', '_id', 'Building_year', 'storey_number', 'whole_storey_number'], axis=1, inplace=True)

делаю числом все, что может быть числом

In [5]:
def make_float(x):
    """Convert ``x`` to ``float`` when possible, otherwise return it unchanged.

    Parameters
    ----------
    x : any
        A single cell value.

    Returns
    -------
    float or the original object
        ``float(x)`` if the conversion succeeds, else ``x`` itself.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled; KeyboardInterrupt/SystemExit
        # must still propagate so a long applymap can be interrupted.
        return x

In [6]:
def make_float_or_nan(x):
    """Convert ``x`` to ``float``; return ``np.nan`` when it cannot be converted.

    Parameters
    ----------
    x : any
        A single cell value.

    Returns
    -------
    float
        ``float(x)`` if the conversion succeeds, else ``np.nan``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures become NaN; KeyboardInterrupt/SystemExit
        # are deliberately not caught.
        return np.nan

In [7]:
flats = flats.applymap(make_float)

делаю числом все цены, иначе делаю их нанами

In [8]:
flats.total_price = flats.total_price.apply(make_float_or_nan)

In [9]:
flats.total_price.unique()

array([     nan,  434000.,  160000., ..., 2874960.,  442250., 4173120.])

### Count the number of ads in each region

In [10]:
# Remove the literal '[', ']' and '"' characters from `address`, then split the
# remaining text on commas into one column per component.
# regex=False forces literal replacement on every pandas version: "[" on its own
# is not a valid regular expression, and the default of `regex` changed in pandas 2.0.
addresses_one_flat = (
    flats['address']
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace('"', "", regex=False)
    .str.split(",", expand=True)
)
# The first comma-separated component is stored as the city/region of the ad.
flats['city'] = addresses_one_flat.iloc[:, 0]

In [11]:
flats.groupby('city')['visitors'].count().sort_values(ascending=False)

city
Москва                   1275
Московская область       1071
Санкт-Петербург           847
Краснодарский край        833
Ленинградская область     452
                         ... 
Еврейская АО                1
Магаданская область         1
Ингушетия респ.             1
Калмыкия респ.              1
Ненецкий АО                 0
Name: visitors, Length: 84, dtype: int64

In [12]:
# Take an independent copy of the St. Petersburg rows: new columns are assigned
# to `spb` further down, and assigning to a filtered slice of `flats` is ambiguous
# (SettingWithCopyWarning).
spb = flats[flats.city == 'Санкт-Петербург'].copy()
# Same address clean-up as for the full data set, with literal (non-regex) replacement.
address_spb = (
    spb['address']
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace('"', "", regex=False)
    .str.split(",", expand=True)
)

Введем расстояние от центра города в spb

In [13]:
def get_dist(x, center=(59.939095, 30.315868)):
    """Return the distance in kilometres between a flat and ``center``.

    Parameters
    ----------
    x : object with ``latitude`` and ``longitude`` attributes
        Typically one DataFrame row passed by ``DataFrame.apply(..., axis=1)``.
    center : tuple of (latitude, longitude), optional
        Reference point. Defaults to the coordinates this notebook uses as the
        city centre.

    Returns
    -------
    float
        Geodesic distance in km, or ``np.nan`` when either coordinate is missing.
    """
    # pd.isna also handles None, where np.isnan would raise TypeError.
    if pd.isna(x.latitude) or pd.isna(x.longitude):
        return np.nan
    # `vincenty` was removed in geopy 2.0; `geodesic` is its replacement.
    # The two algorithms may differ marginally in the returned value.
    return geopy.distance.geodesic((x.latitude, x.longitude), center).km

In [14]:
spb['distance'] = spb.apply(get_dist, axis=1)

In [15]:
spb.keys()

Index(['Number_of_rooms', 'active', 'address', 'bathroom_number',
       'bathroom_separated', 'ceiling_type', 'central_heating', 'cian_id',
       'date_of_adding_to_db', 'date_of_place', 'emergency_condition',
       'house_type', 'housing_complex', 'id', 'latitude', 'longitude',
       'passengers_elevator_number', 'pic_urls', 'porch_num',
       'price_per_house_in_dst', 'price_per_house_in_dst_dynamics',
       'price_per_meter_in_dst', 'price_per_meter_in_dst_dynamics',
       'price_per_sq_meter', 'price_range', 'purchase_dynamics',
       'purchase_price', 'rent_dynamics', 'rent_dynamics_in_dst', 'rent_price',
       'rent_price_in_dst', 'room1_square', 'room2_square', 'room3_square',
       'seen_as_old', 'service_elevator_number', 'total_area',
       'total_number_views', 'total_price', 'type_of_flat', 'visitors',
       'windows_to_street', 'сeiling_height', 'city', 'distance'],
      dtype='object')

In [16]:
# District names that are searched for inside the `address` text of St. Petersburg ads.
spb_districts = [
    'Приморский',
    'Красносельский',
    'Петроградский',
    'Адмиралтейский',
    'Московский',
    'Калининский',
    'Курортный',
    'Центральный',
    'Василеостровский',
    'Фрунзенский',
    'Выборгский',
    'Невский',
    'Петродворцовый',
    'Красногвардейский',
    'Колпинский',
    'Кировский',
    'Пушкинский',
    'Кронштадтский',
]

In [17]:
def find_district(address, districts=None):
    """Return the first known district name that occurs in ``address``.

    Parameters
    ----------
    address : str
        Address text of one ad. Non-string values (e.g. NaN for a missing
        address) are treated as "no district".
    districts : iterable of str, optional
        Candidate district names, checked in order. Defaults to the
        module-level ``spb_districts`` list.

    Returns
    -------
    str or None
        The matching district name (whitespace-stripped), or ``None`` when
        nothing matches.
    """
    if districts is None:
        districts = spb_districts
    if not isinstance(address, str):
        # A missing address arrives as a float NaN; `in` would raise TypeError on it.
        return None
    for district in districts:
        if district in address:
            return district.strip()
    return None
# Only the `address` column is needed, so apply element-wise on that Series
# instead of building a row object for every record with DataFrame.apply(axis=1).
spb['district'] = spb['address'].apply(find_district)

In [18]:
spb.groupby('district')['id'].count().sort_values()

district
Кронштадтский          30
Петродворцовый         95
Колпинский            119
Курортный             197
Кировский             403
Пушкинский            413
Красносельский        413
Петроградский         512
Адмиралтейский        563
Красногвардейский     586
Фрунзенский           714
Калининский           990
Невский              1011
Василеостровский     1120
Московский           1627
Центральный          1724
Выборгский           1826
Приморский           2766
Name: id, dtype: int64

## Statistical analysis

In [19]:
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.express as px

init_notebook_mode(connected=True)

In [20]:
# Rows with Number_of_rooms == 0.8 are handled as studios (going by the variable
# name — confirm the encoding against the data source).
# .copy() so that the column assignment below targets an independent frame
# rather than a filtered slice of `spb`.
studio_flat = spb[spb.Number_of_rooms == 0.8].copy()
# For studios the area is recomputed as total price / price per square metre
# (presumably because the stored total_area is unreliable for them — TODO confirm).
studio_flat['total_area'] = studio_flat.total_price / studio_flat.price_per_sq_meter
# One subset per whole room count, 1 through 5.
one_flat, two_flat, three_flat, four_flat, five_flat = (
    spb[spb.Number_of_rooms == rooms] for rooms in range(1, 6)
)
flat_list = [studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat]

In [21]:
MIN_TOTAL_PRICE = 100000  # rows with a total price at or below this are discarded
TRIM_FRACTION = 0.1       # share of observations trimboth cuts from each tail
ANALYSIS_COLUMNS = ['total_price', 'total_area', 'district', 'price_per_sq_meter', 'distance']

new_flt = list()
for flt in flat_list:
    # Keep only the analysis columns and drop rows missing any of them.
    flt = flt[ANALYSIS_COLUMNS].dropna()
    # Build the price mask from the already-filtered frame so its index matches;
    # a mask taken from the pre-dropna frame only works through implicit reindexing.
    flt = flt[flt.total_price > MIN_TOTAL_PRICE]
    price_kept = trimboth(flt.total_price, TRIM_FRACTION)
    area_kept = trimboth(flt.total_area, TRIM_FRACTION)
    # NOTE(review): `|` keeps a row when EITHER its price OR its area survives the
    # trimming, so a price outlier with an ordinary area stays in. Confirm that
    # `&` was not intended; behaviour is left unchanged here.
    flt = flt.loc[flt['total_price'].isin(price_kept) | flt['total_area'].isin(area_kept)]
    new_flt.append(flt)
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = new_flt

In [22]:
px.scatter(x = one_flat.total_area,
    y = one_flat.total_price, color = one_flat.district)

In [23]:
px.scatter(x = one_flat.distance,
    y = one_flat.price_per_sq_meter, color = one_flat.district)

In [24]:
flats.Number_of_rooms.value_counts()

2.0    53844
1.0    49989
3.0    21158
0.8     6612
4.0     3350
5.0      518
Name: Number_of_rooms, dtype: int64

In [25]:
lst = list(spb.district.value_counts().keys())

# Prices

## One room flats price box plot

In [26]:
# One Box trace per district (in `lst` order) from the one-room flats' total prices.
data = [
    go.Box(
        y=one_flat.total_price[one_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [27]:
# Figure title for the one-room total-price box plot.
layout = go.Layout(title="Box plot of one-room flat's price in different districts")

In [28]:
fig = go.Figure(data=data, layout=layout)

In [29]:
pyo.plot(fig, filename='visualization/one_room_price_box_plot', auto_open=False)

'visualization/one_room_price_box_plot.html'

## Two rooms flats price box plot

In [30]:
# One Box trace per district (in `lst` order) from the two-room flats' total prices.
data = [
    go.Box(
        y=two_flat.total_price[two_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [31]:
# Figure title for the two-room total-price box plot (the data comes from `two_flat`).
layout = go.Layout(title="Box plot of two-room flat's price in different districts")

In [32]:
fig = go.Figure(data=data, layout=layout)

In [33]:
pyo.plot(fig, filename='visualization/two_rooms_price_box_plot', auto_open=False)

'visualization/two_rooms_price_box_plot.html'

## Three rooms flats price box plot

In [34]:
# One Box trace per district (in `lst` order) from the three-room flats' total prices.
data = [
    go.Box(
        y=three_flat.total_price[three_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [35]:
# Figure title for the three-room total-price box plot.
layout = go.Layout(title="Box plot of three-room flat's price in different districts")

In [36]:
fig = go.Figure(data=data, layout=layout)

In [37]:
pyo.plot(fig, filename='visualization/three_rooms_price_box_plot', auto_open=False)

'visualization/three_rooms_price_box_plot.html'

## Four rooms flats price box plot

In [38]:
# One Box trace per district (in `lst` order) from the four-room flats' total prices.
data = [
    go.Box(
        y=four_flat.total_price[four_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [39]:
# Figure title for the four-room total-price box plot.
layout = go.Layout(title="Box plot of four-room flat's price in different districts")

In [40]:
fig = go.Figure(data=data, layout=layout)

In [41]:
pyo.plot(fig, filename='visualization/four_rooms_price_box_plot', auto_open=False)

'visualization/four_rooms_price_box_plot.html'

## Five rooms flats price box plot

In [42]:
# One Box trace per district (in `lst` order) from the five-room flats' total prices.
data = [
    go.Box(
        y=five_flat.total_price[five_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [43]:
# Figure title for the five-room total-price box plot.
layout = go.Layout(title="Box plot of five-room flat's price in different districts")

In [44]:
fig = go.Figure(data=data, layout=layout)

In [45]:
pyo.plot(fig, filename='visualization/five_rooms_price_box_plot', auto_open=False)

'visualization/five_rooms_price_box_plot.html'

# Prices per square meter

## One room flats price per meter box plot

In [46]:
# One Box trace per district (in `lst` order) from the one-room flats' price per square metre.
data = [
    go.Box(
        y=one_flat.price_per_sq_meter[one_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [47]:
# Figure title for the one-room price-per-metre box plot.
layout = go.Layout(title="Box plot of one-room flat's price per meter in different districts")

In [48]:
fig = go.Figure(data=data, layout=layout)

In [49]:
pyo.plot(fig, filename='visualization/one_room_price_per_meter_box_plot', auto_open=False)

'visualization/one_room_price_per_meter_box_plot.html'

## Two rooms flats price per meter box plot

In [50]:
# One Box trace per district (in `lst` order) from the two-room flats' price per square metre.
data = [
    go.Box(
        y=two_flat.price_per_sq_meter[two_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [51]:
# Figure title for the two-room price-per-metre box plot (the data comes from `two_flat`).
layout = go.Layout(title="Box plot of two-room flat's price per meter in different districts")

In [52]:
fig = go.Figure(data=data, layout=layout)

In [53]:
pyo.plot(fig, filename='visualization/two_rooms_price_per_meter_box_plot', auto_open=False)

'visualization/two_rooms_price_per_meter_box_plot.html'

## Three rooms flats price per meter box plot

In [54]:
# One Box trace per district (in `lst` order) from the three-room flats' price per square metre.
data = [
    go.Box(
        y=three_flat.price_per_sq_meter[three_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [55]:
# Figure title for the three-room price-per-metre box plot.
layout = go.Layout(title="Box plot of three-room flat's price per meter in different districts")

In [56]:
fig = go.Figure(data=data, layout=layout)

In [57]:
pyo.plot(fig, filename='visualization/three_rooms_price_per_meter_box_plot', auto_open=False)

'visualization/three_rooms_price_per_meter_box_plot.html'

## Four rooms flats price per meter box plot

In [58]:
# One Box trace per district (in `lst` order) from the four-room flats' price per square metre.
data = [
    go.Box(
        y=four_flat.price_per_sq_meter[four_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [59]:
# Figure title for the four-room price-per-metre box plot.
layout = go.Layout(title="Box plot of four-room flat's price per meter in different districts")

In [60]:
fig = go.Figure(data=data, layout=layout)

In [61]:
pyo.plot(fig, filename='visualization/four_rooms_price_per_meter_box_plot', auto_open=False)

'visualization/four_rooms_price_per_meter_box_plot.html'

## Five rooms flats price per meter box plot

In [62]:
# One Box trace per district (in `lst` order) from the five-room flats' price per square metre.
data = [
    go.Box(
        y=five_flat.price_per_sq_meter[five_flat.district == district].dropna(),
        name=district,
    )
    for district in lst
]

In [63]:
# Figure title for the five-room price-per-metre box plot.
layout = go.Layout(title="Box plot of five-room flat's price per meter in different districts")

In [64]:
fig = go.Figure(data=data, layout=layout)

In [65]:
pyo.plot(fig, filename='visualization/five_rooms_price_per_meter_box_plot', auto_open=False)

'visualization/five_rooms_price_per_meter_box_plot.html'

In [66]:
fig = px.histogram(one_flat[one_flat.district=="Приморский"], x="total_price", nbins=20)
fig.show()

In [67]:
fig = px.histogram(one_flat[one_flat.district=="Выборгский"], x="total_price", nbins=20)
fig.show()

In [68]:
fig = px.histogram(one_flat[one_flat.district=="Центральный"], x="total_price", nbins=20)
fig.show()

In [69]:
fig = px.histogram(one_flat[one_flat.district=="Московский"], x="total_price", nbins=20)
fig.show()