In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import trimboth
import ast
import geopy
from tqdm import tqdm
import geopy.distance
from tqdm._tqdm_notebook import tqdm_notebook
tqdm.pandas(desc="Example Desc")

In [None]:
import plotly.offline as pyo
import chart_studio.plotly as chspy
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.express as px

init_notebook_mode(connected=True)

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import csv

In [None]:
# Load the flats table; the first CSV column becomes the row index.
DATA_PATH = 'final.csv'
flats = pd.read_csv(DATA_PATH, index_col=0)

### Count the number of ads in the region

In [None]:
# Keep only sale listings whose city is St. Petersburg or the Leningrad region.
sale_mask = flats.trade_type == 'sale'
region_mask = flats.city.isin(['Санкт-Петербург', 'Ленинградская область'])
# .copy() makes `spb` an independent frame: later cells add columns to it
# ('distance', 'district'), which must not be chained assignments on a slice
# of `flats`.
spb = flats[sale_mask & region_mask].copy()

In [None]:
# Number of non-null `_id` values per trade type, smallest group first.
listings_per_trade_type = spb.groupby('trade_type')['_id'].count()
listings_per_trade_type.sort_values()

Введем расстояние от центра города в spb

In [None]:
def get_dist(x, center=(59.939095, 30.315868)):
    """Return the distance in kilometres from a listing to the city centre.

    Parameters
    ----------
    x : row-like object exposing ``latitude`` and ``longitude`` attributes
        (for example a DataFrame row passed by ``apply(..., axis=1)``).
    center : tuple of (latitude, longitude), optional
        Reference point; defaults to the coordinates used for the centre of
        St. Petersburg in this notebook.

    Returns
    -------
    float
        Geodesic distance in km, or ``np.nan`` when either coordinate is
        missing (NaN or None).
    """
    # pd.isna also accepts None, which np.isnan rejects with a TypeError.
    if pd.isna(x.latitude) or pd.isna(x.longitude):
        return np.nan
    # `vincenty` was removed from geopy in 2.0; `geodesic` is its replacement.
    return geopy.distance.geodesic((x.latitude, x.longitude), center).km

In [None]:
# Row-wise distance to the city centre (with a progress bar), stored as a new column.
distance_km = spb.progress_apply(get_dist, axis=1)
spb['distance'] = distance_km

In [None]:
# District names searched for inside listing addresses.
spb_districts = [
    'Приморский', 'Красносельский', 'Петроградский',
    'Адмиралтейский', 'Московский', 'Калининский',
    'Курортный', 'Центральный', 'Василеостровский',
    'Фрунзенский', 'Выборгский', 'Невский',
    'Петродворцовый', 'Красногвардейский', 'Колпинский',
    'Кировский', 'Пушкинский', 'Кронштадтский',
]

In [None]:
def find_district(address, districts=None):
    """Return the first known district name that occurs in ``address``.

    Parameters
    ----------
    address : str
        Free-text address. Any non-string value (for example NaN for a
        missing address) yields ``None`` instead of raising ``TypeError``.
    districts : iterable of str, optional
        Candidate names, checked in order. Defaults to the notebook-level
        ``spb_districts`` list.

    Returns
    -------
    str or None
        The matched district name with surrounding whitespace stripped, or
        ``None`` when nothing matches.
    """
    if not isinstance(address, str):
        return None
    if districts is None:
        districts = spb_districts
    for name in districts:
        if name in address:
            return name.strip()
    return None
# Only the address column is needed, so map over that Series instead of
# building a full row object for every listing with DataFrame.apply(axis=1).
spb['district'] = spb['address'].map(find_district)

In [None]:
# Listings per New_building value, smallest group first.
# Counts the '_id' column, the identifier every other cell of this notebook
# uses; the previous 'id' key was inconsistent with them.
spb.groupby('New_building')['_id'].count().sort_values()

In [None]:
# Coordinates and price-per-square-metre columns only.
cluster_columns = ['longitude', 'latitude', 'price_per_sq_meter']
spb_cluster = spb.loc[:, cluster_columns]

In [None]:
# `display` and `HTML` are imported from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import HTML, Image, display

# Show every picture attached to one listing, selected by its cian_id.
LISTING_CIAN_ID = 228193926
listing_pics = flats.loc[flats.cian_id == LISTING_CIAN_ID, 'pic_urls']
if listing_pics.empty:
    # Previously `.values[0]` raised IndexError when the id was not in the data.
    print(f'No listing with cian_id={LISTING_CIAN_ID}')
else:
    # pic_urls is parsed with literal_eval, i.e. it is read as the text of a
    # Python literal (presumably a list of URL strings -- verify against the CSV).
    for pic_url in ast.literal_eval(listing_pics.iloc[0]):
        display(Image(url=pic_url))

## Statistical analysis

In [None]:
# Clustering input: price and coordinates of sale listings plus their `_id`,
# keeping only rows where all four values are present.
clustering_columns = ['price_per_sq_meter', 'latitude', 'longitude', '_id']
sale_rows = spb.loc[spb.trade_type == 'sale', clustering_columns]
clustering_data = sale_rows.dropna()

In [None]:
# Preview the first five rows of the clustering input.
clustering_data.head(n=5)

In [None]:
# Standardize each clustering feature to zero mean and unit standard deviation.
for feature in ('price_per_sq_meter', 'latitude', 'longitude'):
    raw_values = clustering_data[feature]
    clustering_data[feature] = (raw_values - raw_values.mean()) / raw_values.std()

# The standardized price is additionally divided by 1.2, which shrinks its
# scale relative to the two coordinates.
clustering_data['price_per_sq_meter'] = clustering_data['price_per_sq_meter'] / 1.2

In [None]:
# Cluster listings on standardized price and coordinates into 20 groups.
CLUSTER_FEATURES = ['price_per_sq_meter', 'latitude', 'longitude']

# random_state makes the labels reproducible across kernel restarts; n_init is
# set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=20, init='k-means++', n_init=10, random_state=42)

# Fit exactly once. The previous fit + predict + fit_predict sequence trained
# the model twice without a seed, so `labels` (first fit) could disagree with
# `cluster_label` and `centers` (second fit).
labels = kmeans.fit_predict(clustering_data[CLUSTER_FEATURES])  # label of each point
clustering_data['cluster_label'] = labels
centers = kmeans.cluster_centers_  # coordinates of the cluster centers

In [None]:
# Attach the cluster label to every listing through its `_id`.
clustering_data = clustering_data[['_id', 'cluster_label']]
# Drop a cluster_label column left over from an earlier run of this cell;
# otherwise the merge would produce cluster_label_x / cluster_label_y and the
# following cells would fail. The default inner join keeps only listings that
# received a label (rows with missing price or coordinates are dropped).
spb = (
    spb
    .drop(columns='cluster_label', errors='ignore')
    .merge(clustering_data, on='_id')
)

In [None]:
# Listings per cluster, largest cluster first.
cluster_sizes = clustering_data.groupby('cluster_label')['_id'].count()
cluster_sizes.sort_values(ascending=False)

In [None]:
# Store labels as strings; the later cells pass this column as `color`.
# A vectorized cast replaces the element-wise `lambda x: str(x)` apply.
spb['cluster_label'] = spb['cluster_label'].astype(str)

In [None]:
# Inspect the New_building column.
spb['New_building']

In [None]:
# Recode New_building to 10 where it equals 1 and to 1 otherwise; the map cell
# below passes this column as the marker `size`.
# The original flag is saved once in `is_new_building`, so re-running this cell
# gives the same result. Recoding the already recoded column would swap 10 and 1.
if 'is_new_building' not in spb.columns:
    spb['is_new_building'] = spb['New_building'] == 1
spb['New_building'] = np.where(spb['is_new_building'], 10, 1)

In [None]:
import os

# Never commit access tokens to a notebook: read the Mapbox token from the
# environment instead. The 'carto-darkmatter' style used below is one of the
# tile styles that plotly documents as not requiring a token, so the map is
# still drawn when MAPBOX_ACCESS_TOKEN is unset.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
if mapbox_token:
    px.set_mapbox_access_token(mapbox_token)

# One marker per listing, coloured by cluster label and sized by New_building.
fig = px.scatter_mapbox(
    spb,
    lat="latitude",
    lon="longitude",
    color="cluster_label",
    size='New_building',
    size_max=10,
    zoom=10,
    mapbox_style='carto-darkmatter',
)
fig.show()

In [None]:
# Split listings by Number_of_rooms.
# NOTE(review): 0.8 looks like the dataset's code for studio flats -- confirm.
STUDIO_ROOMS_CODE = 0.8

# .copy() + item assignment: the area column is written on an independent
# frame rather than through attribute assignment on a filtered slice of `spb`.
studio_flat = spb[spb.Number_of_rooms == STUDIO_ROOMS_CODE].copy()
# For studios the area is recomputed as total price / price per square metre.
studio_flat['total_area'] = studio_flat.total_price / studio_flat.price_per_sq_meter

one_flat, two_flat, three_flat, four_flat, five_flat = (
    spb[spb.Number_of_rooms == rooms] for rooms in range(1, 6)
)
flat_list = [studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat]

In [None]:
# For every room-count subset: keep the analysis columns, drop incomplete rows,
# keep sale listings, and trim the extremes of total price and total area.
ANALYSIS_COLUMNS = ['total_price', 'total_area', 'district',
                    'price_per_sq_meter', 'New_building', 'trade_type',
                    'cluster_label', 'distance']
TRIM_PROPORTION = 0.1  # fraction cut from each end by scipy.stats.trimboth

new_flt = []
for flt in flat_list:
    flt = flt[ANALYSIS_COLUMNS].dropna()
    # Build the mask from the frame it is applied to. The previous version
    # indexed the post-dropna frame with a mask computed before dropna, so
    # pandas had to reindex a misaligned boolean key.
    flt = flt[flt.trade_type == 'sale']
    # A row is kept when both its price and its area are among the values that
    # survive trimming.
    price_kept = flt['total_price'].isin(trimboth(flt.total_price, TRIM_PROPORTION))
    area_kept = flt['total_area'].isin(trimboth(flt.total_area, TRIM_PROPORTION))
    new_flt.append(flt.loc[price_kept & area_kept])
studio_flat, one_flat, two_flat, three_flat, four_flat, five_flat = new_flt

In [None]:
# One-room flats: total area against total price, coloured by cluster label.
area_vs_price = px.scatter(
    x=one_flat['total_area'],
    y=one_flat['total_price'],
    color=one_flat['cluster_label'],
)
area_vs_price

In [None]:
# One-room flats: distance from the centre against price per square metre,
# coloured by cluster label.
distance_vs_unit_price = px.scatter(
    x=one_flat['distance'],
    y=one_flat['price_per_sq_meter'],
    color=one_flat['cluster_label'],
)
distance_vs_unit_price

# Prices

In [None]:
# Cluster labels in the order returned by value_counts (most frequent first).
lst = spb['cluster_label'].value_counts().index.tolist()

## One room flats price box plot

In [None]:
# One box per cluster label: total price of one-room flats.
data = [
    go.Box(
        y=one_flat.total_price[one_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of one-room flat's price in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/one_room_price_box_plot', auto_open=False)

## Two rooms flats price box plot

In [None]:
# One box per cluster label: total price of two-room flats.
data = [
    go.Box(
        y=two_flat.total_price[two_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
# The title previously said "one-room" (copy-paste slip) although the data is two_flat.
layout = go.Layout(title="Box plot of two-room flat's price in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/two_rooms_price_box_plot', auto_open=False)

## Three rooms flats price box plot

In [None]:
# One box per cluster label: total price of three-room flats.
data = [
    go.Box(
        y=three_flat.total_price[three_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of three-room flat's price in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/three_rooms_price_box_plot', auto_open=False)

## Four rooms flats price box plot

In [None]:
# One box per cluster label: total price of four-room flats.
data = [
    go.Box(
        y=four_flat.total_price[four_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of four-room flat's price in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/four_rooms_price_box_plot', auto_open=False)

## Five rooms flats price box plot

In [None]:
# One box per cluster label: total price of five-room flats.
data = [
    go.Box(
        y=five_flat.total_price[five_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of five-room flat's price in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/five_rooms_price_box_plot', auto_open=False)

# Prices per square meter

## One room flats price per meter box plot

In [None]:
# One box per cluster label: price per square metre of one-room flats.
data = [
    go.Box(
        y=one_flat.price_per_sq_meter[one_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of one-room flat's price per meter in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/one_room_price_per_meter_box_plot', auto_open=False)

## Two rooms flats price per meter box plot

In [None]:
# One box per cluster label: price per square metre of two-room flats.
data = [
    go.Box(
        y=two_flat.price_per_sq_meter[two_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
# The title previously said "one-room" (copy-paste slip) although the data is two_flat.
layout = go.Layout(title="Box plot of two-room flat's price per meter in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/two_rooms_price_per_meter_box_plot', auto_open=False)

## Three rooms flats price per meter box plot

In [None]:
# One box per cluster label: price per square metre of three-room flats.
data = [
    go.Box(
        y=three_flat.price_per_sq_meter[three_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of three-room flat's price per meter in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/three_rooms_price_per_meter_box_plot', auto_open=False)

## Four rooms flats price per meter box plot

In [None]:
# One box per cluster label: price per square metre of four-room flats.
data = [
    go.Box(
        y=four_flat.price_per_sq_meter[four_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of four-room flat's price per meter in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/four_rooms_price_per_meter_box_plot', auto_open=False)

## Five rooms flats price per meter box plot

In [None]:
# One box per cluster label: price per square metre of five-room flats.
data = [
    go.Box(
        y=five_flat.price_per_sq_meter[five_flat.cluster_label == label].dropna(),
        name=label,
    )
    for label in lst
]
layout = go.Layout(title="Box plot of five-room flat's price per meter in different clusters")
fig = go.Figure(data=data, layout=layout)
# Write the figure to a file without opening a browser tab.
pyo.plot(fig, filename='visualization/five_rooms_price_per_meter_box_plot', auto_open=False)

In [None]:
# Total-price histogram of one-room flats in the Приморский district,
# coloured by the New_building column.
primorsky_one_room = one_flat[one_flat.district == "Приморский"]
fig = px.histogram(
    primorsky_one_room,
    x="total_price",
    nbins=20,
    color="New_building",
    opacity=0.8,
)
fig.show()

In [None]:
# Total-price histogram of one-room flats in the Выборгский district,
# coloured by the New_building column.
vyborgsky_one_room = one_flat[one_flat.district == "Выборгский"]
fig = px.histogram(
    vyborgsky_one_room,
    x="total_price",
    nbins=20,
    color="New_building",
)
fig.show()