In [47]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio

In [48]:
pio.templates.default = 'plotly_white'
px.set_mapbox_access_token(open(".mapbox_token").read())
PALETTE = px.colors.qualitative.Prism
MAIN_COLOR = 'rgb(231,41,138)'
BASE_DIR = Path.cwd().parent

In [49]:
df_listing = pd.read_csv(BASE_DIR / 'staging_data' / 'listing_cleaned.csv')
df_hosts_reviews = pd.read_parquet(BASE_DIR / 'staging_data' / 'hosts_reviews_all_cleaned.parquet')

In [50]:
df_hosts_reviews.columns

Index(['date', 'listing_id_encod', 'listing_id', 'reviewer_id',
       'reviewer_name', 'comments', 'host_id', 'host_name', 'host_description',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value'],
      dtype='object')

In [51]:
num_reviews = (df_hosts_reviews
            .groupby(['listing_id', 'neighbourhood'])['number_of_reviews']
            .size().sort_values(ascending=False)
            .reset_index())
num_reviews

Unnamed: 0,listing_id,neighbourhood,number_of_reviews
0,3.080758e+07,Södermalms,975
1,5.812430e+05,Östermalms,670
2,3.678366e+07,Norrmalms,653
3,2.991540e+05,Södermalms,578
4,1.500515e+07,Södermalms,515
...,...,...,...
2775,3.563447e+07,Hässelby-Vällingby,1
2776,6.550611e+17,Bromma,1
2777,4.616620e+07,Kungsholmens,1
2778,6.541030e+17,Kungsholmens,1


In [52]:
fig = px.bar(num_reviews.nlargest(50, 'number_of_reviews'), 
       y='number_of_reviews',
       hover_data = ['listing_id'],
       color='neighbourhood', color_discrete_sequence=PALETTE)
fig.update_layout(xaxis_title = None)

In [53]:
fig = px.bar(num_reviews.groupby('neighbourhood')['number_of_reviews'].sum().sort_values(ascending=True), orientation='h')
fig.update_traces(marker_color=MAIN_COLOR)

In [54]:
fig = ff.create_hexbin_mapbox(
    data_frame=df_listing, lat="latitude", lon="longitude",
    nx_hexagon=10, opacity=0.9, labels={"color": "Point Count"},
    color_continuous_scale=PALETTE,
    #show_original_data=True
)
fig.update_layout(margin=dict(b=0, t=0, l=0, r=0))
fig.show()

In [55]:
fig = px.scatter_mapbox(df_listing, 
                            lat="latitude", lon="longitude", 
                            color="room_type", size="number_of_reviews",
                            hover_data = {'latitude': False, 'longitude': False, 'price': True},
                            color_discrete_sequence=PALETTE,
                            size_max=15, zoom=10)
fig.show()

In [56]:
most_reviewed_neighb = num_reviews.nlargest(50, 'number_of_reviews')['neighbourhood'].unique().tolist()
city_parts = df_hosts_reviews[df_hosts_reviews['neighbourhood'].isin(most_reviewed_neighb) == True]

In [57]:
fig = px.scatter(city_parts, y='price', x='review_scores_rating',
           facet_col='neighbourhood', facet_col_wrap=4,
           color='neighbourhood', color_discrete_sequence=PALETTE)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_traces(marker=dict(size=7,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_yaxes(tick0=10000, dtick=5000)
fig

In [58]:
df_hosts_reviews['review_scores_rating'].value_counts()

5.00    6023
4.92    4386
4.83    3103
4.84    3035
4.86    2869
        ... 
3.40       5
2.50       4
3.75       4
2.00       3
1.50       2
Name: review_scores_rating, Length: 114, dtype: int64

In [59]:
px.histogram(df_hosts_reviews, x='review_scores_rating')

In [60]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_labeled.parquet')

In [61]:
df_hosts_reviews_en_labeled['sentiment_label_class'].value_counts().reset_index().sort_values('index', ascending=True)

Unnamed: 0,index,sentiment_label_class
1,0,1235
0,1,68396


In [62]:
neg_reviews = (df_hosts_reviews_en_labeled.query('sentiment_label_class == 0')
                                .groupby('neighbourhood')['sentiment_label_class']
                                .count().reset_index()
                                .rename(columns = {'sentiment_label_class': 'sentiment_label_nagative'}))
total_reviews = (df_hosts_reviews_en_labeled
                                .groupby('neighbourhood')
                                .size().reset_index()
                                .rename(columns = {0: 'total_reviews'}))
proportion_neg_reviews = neg_reviews.merge(total_reviews, on='neighbourhood')
proportion_neg_reviews['percent_of_neg_reviews'] = proportion_neg_reviews['sentiment_label_nagative'] / proportion_neg_reviews['total_reviews'] * 100
proportion_neg_reviews

Unnamed: 0,neighbourhood,sentiment_label_nagative,total_reviews,percent_of_neg_reviews
0,Bromma,22,2030,1.083744
1,Enskede-Årsta-Vantörs,69,4131,1.670298
2,Farsta,22,1221,1.801802
3,Hägersten-Liljeholmens,43,3745,1.148198
4,Hässelby-Vällingby,16,836,1.913876
5,Kungsholmens,168,7943,2.11507
6,Norrmalms,149,7456,1.998391
7,Rinkeby-Tensta,8,374,2.139037
8,Skarpnäcks,37,2242,1.650312
9,Skärholmens,24,938,2.558635


In [63]:
fig = px.bar(proportion_neg_reviews.sort_values('percent_of_neg_reviews'), x='percent_of_neg_reviews', y='neighbourhood')
fig.update_traces(marker_color=MAIN_COLOR)

In [64]:
df_hosts_reviews_en_labeled.corr()





Unnamed: 0,index,listing_id_encod,listing_id,reviewer_id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,sentiment_label,sentiment_label_class
index,1.0,0.981383,0.438429,0.310614,0.499106,0.054481,-0.000488,-0.008039,-0.232958,0.251568,0.186242,-0.165591,-0.17092,-0.083141,-0.19052,-0.236189,0.003311,-0.157063,-0.120364,-0.031463
listing_id_encod,0.981383,1.0,0.567044,0.305164,0.487419,0.052799,-0.008056,-0.01176,-0.267367,0.22259,0.137121,-0.14815,-0.163143,-0.078,-0.18284,-0.208825,0.005154,-0.138083,-0.105775,-0.026905
listing_id,0.438429,0.567044,1.0,0.134705,0.235967,0.001683,-0.047708,-0.019347,-0.212662,0.033089,-0.06702,-0.000425,-0.045764,-0.008649,-0.062405,-0.016811,-0.019963,0.012405,-0.002544,-0.001071
reviewer_id,0.310614,0.305164,0.134705,1.0,0.213574,0.0028,-0.062927,-0.090828,-0.004481,0.160112,0.146006,-0.123928,-0.112148,-0.095737,-0.096115,-0.134078,-0.115552,-0.079836,-0.130865,-0.034454
host_id,0.499106,0.487419,0.235967,0.213574,1.0,0.009757,0.000487,-0.067227,0.034102,0.306754,0.310937,-0.318945,-0.285536,-0.273683,-0.25082,-0.40296,-0.150361,-0.231414,-0.139697,-0.049915
latitude,0.054481,0.052799,0.001683,0.0028,0.009757,1.0,-0.241885,0.076721,0.037555,0.047157,-0.013505,-0.047085,-0.04792,-0.014064,-0.047583,-0.048904,0.174207,-0.08158,-0.025826,0.000293
longitude,-0.000488,-0.008056,-0.047708,-0.062927,0.000487,-0.241885,1.0,0.063581,0.196481,0.19318,0.171706,-0.032751,-0.030847,0.006061,-0.052409,-0.090535,0.337906,-0.10015,-0.001383,-0.000903
price,-0.008039,-0.01176,-0.019347,-0.090828,-0.067227,0.076721,0.063581,1.0,-0.108675,-0.100314,-0.065708,0.106268,0.060711,0.090227,0.06544,0.058219,0.217195,-0.026953,0.044302,0.013168
number_of_reviews,-0.232958,-0.267367,-0.212662,-0.004481,0.034102,0.037555,0.196481,-0.108675,1.0,0.796116,0.719318,-0.173568,-0.066856,-0.054228,-0.058694,-0.202922,-0.027681,-0.13776,-0.089809,-0.030454
reviews_per_month,0.251568,0.22259,0.033089,0.160112,0.306754,0.047157,0.19318,-0.100314,0.796116,1.0,0.879223,-0.295212,-0.181965,-0.124653,-0.201274,-0.368076,-0.067395,-0.246603,-0.15654,-0.05131


In [65]:
df_hosts_reviews_en_topics = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [66]:
avg_topic_by_neighbourhood = (df_hosts_reviews_en_topics
                                                .groupby(['neighbourhood'])
                                                .agg(avg_topic=('dominant_topic', 'mean'))
                                                .sort_values('avg_topic', ascending=False)
                                                .reset_index())
avg_topic_by_neighbourhood

Unnamed: 0,neighbourhood,avg_topic
0,Farsta,1.526618
1,Hägersten-Liljeholmens,1.518558
2,Östermalms,1.517599
3,Hässelby-Vällingby,1.510766
4,Rinkeby-Tensta,1.508021
5,Enskede-Årsta-Vantörs,1.500121
6,Skarpnäcks,1.477698
7,Norrmalms,1.460435
8,Kungsholmens,1.446305
9,Södermalms,1.437284


In [67]:
px.density_heatmap(avg_topic_by_neighbourhood,
                   x='neighbourhood', y='avg_topic')

In [68]:
numnber_topic_by_neighbourhood = (df_hosts_reviews_en_topics.groupby(['neighbourhood', 'dominant_topic'])
        .agg(number_of_reviews=('number_of_reviews', 'count'))
        .sort_values('number_of_reviews', ascending=False)
        .reset_index())
numnber_topic_by_neighbourhood 

Unnamed: 0,neighbourhood,dominant_topic,number_of_reviews
0,Södermalms,2,19734
1,Södermalms,1,7031
2,Södermalms,0,5587
3,Kungsholmens,2,4976
4,Norrmalms,2,4730
5,Östermalms,2,3021
6,Enskede-Årsta-Vantörs,2,2573
7,Hägersten-Liljeholmens,2,2350
8,Kungsholmens,1,1536
9,Kungsholmens,0,1431


In [69]:
# Define a custom color mapping function
def get_tile_color(tile_path):
    # Get the name of the neighbourhood and the dominant topic from the tile path
    neighbourhood, dominant_topic = tile_path[-2:]

    # Assign a specific color to each combination of neighbourhood and dominant topic
    if dominant_topic == 'topic 0' and neighbourhood == 'Neighbourhood 1':
        return 'red'
    elif dominant_topic == 'topic 1' and neighbourhood == 'Neighbourhood 2':
        return 'green'
    else:
        # Use the default color scale for all other tiles
        return None

# Create the treemap with the custom color mapping function
fig = px.treemap(
    numnber_topic_by_neighbourhood,
    path=['neighbourhood', 'dominant_topic'],
    values='number_of_reviews',
    color_discrete_sequence=PALETTE,
    color_continuous_midpoint=np.average(numnber_topic_by_neighbourhood['number_of_reviews']),
)

# Apply the custom color mapping function to each tile
fig.data[0].marker.colors = [get_tile_color(tile_path) for tile_path in fig.data[0].ids]

# Show the figure
fig.show()