In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from nltk.probability import FreqDist
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio



In [3]:
# Notebook-wide plotting configuration.
pio.templates.default = 'plotly_white'
# Read the Mapbox token through pathlib so the file handle is closed right
# away (a bare open(...).read() leaves that to the garbage collector), and
# strip surrounding whitespace such as the trailing newline of the token file.
px.set_mapbox_access_token(Path(".mapbox_token").read_text().strip())
PALETTE = px.colors.qualitative.Prism  # colour sequence shared by the charts below
MAIN_COLOR = 'rgb(231,41,138)'  # accent colour applied to single-series bar charts
BASE_DIR = Path.cwd().parent  # parent of the working directory; data folders are resolved under it

In [3]:
# Load the cleaned listing table (CSV) and the cleaned host-review table
# (Parquet) from the staging directory under BASE_DIR.
staging_dir = BASE_DIR / 'staging_data'
df_listing = pd.read_csv(staging_dir / 'listing_cleaned.csv')
df_hosts_reviews = pd.read_parquet(staging_dir / 'hosts_reviews_all_cleaned.parquet')

In [4]:
# Word count per review: split each comment on whitespace and count the tokens.
# `.str.len()` is the vectorised counterpart of `.apply(len)` and yields NaN
# for a missing comment instead of raising TypeError; the Series reductions
# below skip such NaN values.
count_words = df_hosts_reviews['comments'].str.split().str.len()
print(f'minimum number of words: {count_words.min()}')
print(f'median number of words: {count_words.median()}')
print(f'average number of words: {count_words.mean()}')
print(f'maximum number of words: {count_words.max()}')

minimum number of words: 0
median number of words: 30.0
average number of words: 40.31585625497323
maximum number of words: 1000


In [5]:
# Histogram of the per-review word counts computed above.
px.histogram(data_frame=count_words)

In [6]:
# Show the text of the longest review(s). The word-count threshold is derived
# from the data instead of the hard-coded literal 1000, so the cell keeps
# working when the dataset changes; `count_words` (computed in an earlier cell)
# is reused rather than recomputed, and a single `.loc` lookup replaces the
# chained `df[mask]['comments']` indexing.
df_hosts_reviews.loc[count_words == count_words.max(), 'comments'].tolist()

['i stayed in this apartment for a month as i was in stockholm for work locationwise kungsholmen is fantastic very safe and beautiful good neighbors in the building too  about the apartment it is designed very simple as you can see in the pictures it is correct that simplicity came with major missings i would say in different parts especially in the kitchen the kitchen is nicely renovated but there are so many musthave kitchen utensils missing not even a salad bowl with a depth to prepare a salad no vegetable peeler no bowl for the oven the oven plate is there but if you want to use the oven with more liquid food for stew for example you just cant just one small pot and not a second one which then he provided afterwards the inside of the kitchen shelves was not so clean there was dust in fact i didnt feel comfortable so i put everything into the dishwasher to make a clean start not so late after the heavy cover material in front of the dishwasher fell onto my anklefoot cause apparently

In [7]:
# Count review rows per (listing, neighbourhood) pair, then order the pairs
# from the most-reviewed to the least-reviewed.
reviews_per_listing = (
    df_hosts_reviews
    .groupby(['listing_id_encod', 'neighbourhood'])['number_of_reviews']
    .size()
)
num_reviews = reviews_per_listing.sort_values(ascending=False).reset_index()
num_reviews

Unnamed: 0,listing_id_encod,neighbourhood,number_of_reviews
0,1073.0,Södermalm,975
1,27.0,Östermalm,670
2,1329.0,Norrmalm,653
3,7.0,Södermalm,578
4,545.0,Södermalm,515
...,...,...,...
2775,1269.0,Hässelby-Vällingby,1
2776,2360.0,Bromma,1
2777,1652.0,Kungsholmen,1
2778,2352.0,Kungsholmen,1


In [8]:
# Bar chart of the 50 listings with the most reviews, coloured by neighbourhood;
# the encoded listing id is exposed through the hover tooltip.
top_listings = num_reviews.nlargest(50, 'number_of_reviews')
fig = px.bar(
    top_listings,
    y='number_of_reviews',
    color='neighbourhood',
    hover_data=['listing_id_encod'],
    color_discrete_sequence=PALETTE,
)
# Remove the x-axis title.
fig.update_layout(xaxis_title=None)

In [9]:
# Total review count per neighbourhood, sorted ascending and drawn as
# horizontal bars in the accent colour.
reviews_by_neighbourhood = (
    num_reviews
    .groupby('neighbourhood')['number_of_reviews']
    .sum()
    .sort_values(ascending=True)
)
fig = px.bar(reviews_by_neighbourhood, orientation='h')
fig.update_traces(marker_color=MAIN_COLOR)

In [10]:
# Hexagonal-bin map of listing coordinates; the colour bar is labelled
# "Point Count" and the figure margins are removed.
fig = ff.create_hexbin_mapbox(
    data_frame=df_listing,
    lat="latitude",
    lon="longitude",
    nx_hexagon=10,
    opacity=0.9,
    labels={"color": "Point Count"},
    color_continuous_scale=PALETTE,
)
fig.update_layout(margin={"b": 0, "t": 0, "l": 0, "r": 0})
fig.show()

In [11]:
# Map of individual listings: marker colour encodes the room type and marker
# size the number of reviews. The tooltip hides the raw coordinates and adds
# the price.
hover_columns = {'latitude': False, 'longitude': False, 'price': True}
fig = px.scatter_mapbox(
    data_frame=df_listing,
    lat="latitude",
    lon="longitude",
    color="room_type",
    size="number_of_reviews",
    hover_data=hover_columns,
    color_discrete_sequence=PALETTE,
    size_max=15,
    zoom=10,
)
fig.show()

In [12]:
# Neighbourhoods that contain at least one of the 50 most-reviewed listings.
top50_listings = num_reviews.nlargest(50, 'number_of_reviews')
most_reviewed_neighb = top50_listings['neighbourhood'].unique().tolist()
# Keep only the reviews that belong to those neighbourhoods.
in_top_neighbourhood = df_hosts_reviews['neighbourhood'].isin(most_reviewed_neighb)
city_parts = df_hosts_reviews[in_top_neighbourhood]

In [13]:
# Price against rating, one facet per neighbourhood (four facets per row).
fig = px.scatter(
    city_parts,
    x='review_scores_rating',
    y='price',
    color='neighbourhood',
    facet_col='neighbourhood',
    facet_col_wrap=4,
    color_discrete_sequence=PALETTE,
)
# Keep only the part of each facet annotation that follows the last "=".
fig.for_each_annotation(lambda note: note.update(text=note.text.rsplit("=", 1)[-1]))
# Restyle the marker traces: fixed size with a thin dark outline.
marker_style = {'size': 7, 'line': {'width': 0.5, 'color': 'DarkSlateGrey'}}
fig.update_traces(marker=marker_style, selector={'mode': 'markers'})
# Y-axis ticks anchored at 10000 and spaced 5000 apart.
fig.update_yaxes(tick0=10000, dtick=5000)
fig.show()

In [None]:
# How often each distinct rating value occurs across the review rows.
rating_values = df_hosts_reviews['review_scores_rating']
rating_values.value_counts()

5.00    6023
4.92    4386
4.83    3103
4.84    3035
4.86    2869
        ... 
3.40       5
2.50       4
3.75       4
2.00       3
1.50       2
Name: review_scores_rating, Length: 114, dtype: int64

In [None]:
# Histogram of the rating column over all review rows.
px.histogram(data_frame=df_hosts_reviews, x='review_scores_rating')

In [None]:
# Load the sentiment-labelled review table from the processed-data directory.
labeled_reviews_path = BASE_DIR / 'processed_data' / 'hosts_reviews_en_labeled.parquet'
df_hosts_reviews_en_labeled = pd.read_parquet(labeled_reviews_path)

In [None]:
# Number of reviews per sentiment class, ordered by class label.
# The column names are set explicitly: the earlier
# `.value_counts().reset_index().sort_values('index')` relied on reset_index()
# producing a column called 'index', which pandas >= 2.0 no longer does
# (the sort then raises KeyError). The resulting columns are still
# 'index' (class label) and 'sentiment_label_class' (row count); only the
# row labels differ, now running 0..n-1 in class order.
(df_hosts_reviews_en_labeled['sentiment_label_class']
    .value_counts()
    .sort_index()
    .rename_axis('index')
    .reset_index(name='sentiment_label_class'))

Unnamed: 0,index,sentiment_label_class
1,0,1225
0,1,68395


In [None]:
# Share of negative reviews (sentiment_label_class == 0) per neighbourhood.

# Negative-review count per neighbourhood. Neighbourhoods without a single
# negative review do not appear in this frame. The column spelling
# 'sentiment_label_nagative' is kept unchanged so existing references to it
# keep working.
neg_reviews = (df_hosts_reviews_en_labeled.query('sentiment_label_class == 0')
                                .groupby('neighbourhood')['sentiment_label_class']
                                .count()
                                .reset_index(name='sentiment_label_nagative'))
# Total number of labelled reviews per neighbourhood.
total_reviews = (df_hosts_reviews_en_labeled
                                .groupby('neighbourhood')
                                .size()
                                .reset_index(name='total_reviews'))
# Join on the totals (how='right') so that a neighbourhood with no negative
# reviews is reported with a count of 0 and a share of 0 %; an inner join
# silently dropped such neighbourhoods from the table and the chart.
proportion_neg_reviews = (neg_reviews
                                .merge(total_reviews, on='neighbourhood', how='right')
                                .fillna({'sentiment_label_nagative': 0})
                                .astype({'sentiment_label_nagative': 'int64'}))
proportion_neg_reviews['percent_of_neg_reviews'] = (
    proportion_neg_reviews['sentiment_label_nagative']
    / proportion_neg_reviews['total_reviews'] * 100
)
proportion_neg_reviews

Unnamed: 0,neighbourhood,sentiment_label_nagative,total_reviews,percent_of_neg_reviews
0,Bromma,22,2035,1.081081
1,Enskede-Årsta-Vantör,68,4133,1.645294
2,Farsta,22,1220,1.803279
3,Hägersten-Liljeholmen,42,3745,1.121495
4,Hässelby-Vällingby,16,836,1.913876
5,Kungsholmen,166,7937,2.09147
6,Norrmalm,148,7455,1.985245
7,Rinkeby-Tensta,8,374,2.139037
8,Skarpnäck,37,2244,1.648841
9,Skärholmen,24,939,2.555911


In [None]:
# Negative-review share per neighbourhood, sorted by share and drawn in the
# accent colour.
neg_share_sorted = proportion_neg_reviews.sort_values('percent_of_neg_reviews')
fig = px.bar(
    neg_share_sorted,
    x='percent_of_neg_reviews',
    y='neighbourhood',
)
fig.update_traces(marker_color=MAIN_COLOR)

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# The columns are selected explicitly because the frame also holds text
# columns such as 'comments' and 'neighbourhood': since pandas 2.0 a bare
# `.corr()` no longer drops non-numeric columns and raises on them, while
# `select_dtypes` behaves the same on older and newer pandas versions.
df_hosts_reviews_en_labeled.select_dtypes(include=['number', 'bool']).corr()





Unnamed: 0,index,listing_id_encod,listing_id,reviewer_id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,sentiment_label,sentiment_label_class
index,1.0,0.981383,0.438535,0.31039,0.499155,0.054424,-0.000434,-0.00787,-0.233338,0.251269,0.18609,-0.165262,-0.170741,-0.083005,-0.190221,-0.235817,0.003267,-0.156845,-0.120509,-0.03152
listing_id_encod,0.981383,1.0,0.56714,0.304877,0.48742,0.052759,-0.008014,-0.011601,-0.267713,0.222304,0.136978,-0.14779,-0.162915,-0.077767,-0.182565,-0.208394,0.005193,-0.137871,-0.105854,-0.027033
listing_id,0.438535,0.56714,1.0,0.134463,0.235956,0.001748,-0.047745,-0.019233,-0.212738,0.033005,-0.067056,-0.000297,-0.045665,-0.008313,-0.062566,-0.016669,-0.019758,0.012368,-0.002524,-0.001358
reviewer_id,0.31039,0.304877,0.134463,1.0,0.213574,0.002863,-0.06287,-0.090512,-0.004554,0.159924,0.145922,-0.123759,-0.111942,-0.095752,-0.095932,-0.134034,-0.115772,-0.079599,-0.130954,-0.034143
host_id,0.499155,0.48742,0.235956,0.213574,1.0,0.009781,0.000754,-0.067128,0.033624,0.306189,0.310619,-0.318466,-0.285054,-0.273539,-0.25046,-0.40265,-0.150389,-0.231043,-0.1397,-0.050343
latitude,0.054424,0.052759,0.001748,0.002863,0.009781,1.0,-0.241825,0.076604,0.037677,0.047297,-0.013315,-0.046947,-0.047809,-0.013721,-0.047349,-0.048834,0.174335,-0.0816,-0.025663,0.000712
longitude,-0.000434,-0.008014,-0.047745,-0.06287,0.000754,-0.241825,1.0,0.063577,0.196178,0.192954,0.1716,-0.032612,-0.030755,0.006002,-0.052353,-0.090581,0.337752,-0.100164,-0.001537,-0.001205
price,-0.00787,-0.011601,-0.019233,-0.090512,-0.067128,0.076604,0.063577,1.0,-0.108467,-0.099999,-0.06546,0.106121,0.060594,0.090124,0.065267,0.058037,0.217081,-0.02707,0.044146,0.013024
number_of_reviews,-0.233338,-0.267713,-0.212738,-0.004554,0.033624,0.037677,0.196178,-0.108467,1.0,0.796082,0.719178,-0.173547,-0.066957,-0.054207,-0.058576,-0.202842,-0.02747,-0.137787,-0.089736,-0.030514
reviews_per_month,0.251269,0.222304,0.033005,0.159924,0.306189,0.047297,0.192954,-0.099999,0.796082,1.0,0.879252,-0.294874,-0.181881,-0.124386,-0.200844,-0.367716,-0.067152,-0.246432,-0.156309,-0.051341


In [12]:
# Load the review table that carries topic assignments from the processed-data directory.
topics_path = BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet'
df_hosts_reviews_en_topics = pd.read_parquet(topics_path)

In [13]:
# Mean of the 'dominant_topic' column per neighbourhood, highest mean first.
topic_mean_per_area = (
    df_hosts_reviews_en_topics
    .groupby(['neighbourhood'])
    .agg(avg_topic=('dominant_topic', 'mean'))
)
avg_topic_by_neighbourhood = (
    topic_mean_per_area
    .sort_values('avg_topic', ascending=False)
    .reset_index()
)
avg_topic_by_neighbourhood

Unnamed: 0,neighbourhood,avg_topic
0,Bromma,0.584934
1,Södermalm,0.571199
2,Spånga-Tensta,0.555249
3,Kungsholmen,0.546932
4,Norrmalm,0.546856
5,Älvsjö,0.543924
6,Skärholmen,0.535676
7,Skarpnäck,0.513839
8,Enskede-Årsta-Vantör,0.485707
9,Rinkeby-Tensta,0.48


In [14]:
# Density heatmap of the per-neighbourhood mean topic value.
fig = px.density_heatmap(
    data_frame=avg_topic_by_neighbourhood,
    x='neighbourhood',
    y='avg_topic',
)
# Axis titles: none on x, a readable label on y.
fig.update_xaxes(title=None)
fig.update_yaxes(title='Average Topic')
fig.update_layout(title='Average Dominant Topic by Neighbourhood')

In [15]:
# For every (neighbourhood, dominant topic) pair, count the rows with a
# non-null 'number_of_reviews' value, then list the pairs largest first.
topic_counts = (
    df_hosts_reviews_en_topics
    .groupby(['neighbourhood', 'dominant_topic'])
    .agg(number_of_reviews=('number_of_reviews', 'count'))
)
numnber_topic_by_neighbourhood = (
    topic_counts
    .sort_values('number_of_reviews', ascending=False)
    .reset_index()
)
numnber_topic_by_neighbourhood

Unnamed: 0,neighbourhood,dominant_topic,number_of_reviews
0,Södermalm,0,19098
1,Södermalm,1,8040
2,Södermalm,2,5222
3,Kungsholmen,0,4945
4,Norrmalm,0,4643
5,Östermalm,0,2994
6,Enskede-Årsta-Vantör,0,2594
7,Hägersten-Liljeholmen,0,2351
8,Kungsholmen,1,1643
9,Norrmalm,1,1553


In [16]:
# Define a custom color mapping function
def get_tile_color(tile_path):
    """Return an override colour for one treemap tile, or None for no override.

    Parameters
    ----------
    tile_path : str or sequence of str
        Either a tile id whose path components are joined with "/"
        (for example "Neighbourhood 1/topic 0"), or an already split
        sequence of components. The last two components are read as
        (neighbourhood, dominant topic).

    Returns
    -------
    str or None
        'red' for ("Neighbourhood 1", "topic 0"), 'green' for
        ("Neighbourhood 2", "topic 1"), otherwise None so the default
        colouring applies. Ids with fewer than two components (such as a
        top-level neighbourhood tile) also yield None.
    """
    # A string id has to be split into its components: slicing it with [-2:]
    # would return its last two *characters*, which never match the labels
    # below and fails to unpack for ids shorter than two characters.
    if isinstance(tile_path, str):
        components = tile_path.split('/')
    else:
        components = list(tile_path)

    # Tiles without both a neighbourhood and a topic component get no override.
    if len(components) < 2:
        return None
    neighbourhood, dominant_topic = components[-2:]

    # NOTE(review): the labels below look like placeholders; adjust them to the
    # real neighbourhood / topic values for the overrides to take effect.
    if dominant_topic == 'topic 0' and neighbourhood == 'Neighbourhood 1':
        return 'red'
    if dominant_topic == 'topic 1' and neighbourhood == 'Neighbourhood 2':
        return 'green'
    # Use the default color scale for all other tiles
    return None

# Build a treemap with neighbourhoods as the outer level and dominant topics
# nested inside them; tile area is the number of reviews.
mean_review_count = np.average(numnber_topic_by_neighbourhood['number_of_reviews'])
fig = px.treemap(
    numnber_topic_by_neighbourhood,
    path=['neighbourhood', 'dominant_topic'],
    values='number_of_reviews',
    color_discrete_sequence=PALETTE,
    color_continuous_midpoint=mean_review_count,
)

# Ask get_tile_color for a colour for every tile id of the treemap trace and
# store the resulting list as the trace's marker colours.
treemap_trace = fig.data[0]
treemap_trace.marker.colors = [get_tile_color(tile_id) for tile_id in treemap_trace.ids]
fig.update_layout(title='Treemap of Number of Reviews by Dominant Topic and Neighbourhood')
# Render the figure.
fig.show()

In [None]:
# Word frequencies over the Södermalm reviews.
# Presumably each 'text' entry is a sequence of tokens — confirm against the
# preprocessing step that produced the parquet file.
sodermalm_reviews = df_hosts_reviews_en_topics.query('neighbourhood == "Södermalm"')
words = []
for tokens in sodermalm_reviews['text']:
    words.extend(tokens)
fdist = FreqDist(words)
# One row per distinct word, with its count in the 'Frequency' column.
freq_df = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])

In [None]:
# Bar chart of the 50 most frequent words held in freq_df.
top_words = freq_df.nlargest(50, 'Frequency')
fig = px.bar(
    top_words,
    x=top_words.index,
    y='Frequency',
    labels={'x':'Word', 'y':'Frequency'},
    title='Most frequent words',
)
fig.update_traces(marker_color=MAIN_COLOR)
# Tilt the x tick labels.
fig.update_layout(xaxis_tickangle=-45)

In [None]:
# Word frequencies over every review in the topics table.
# Presumably each 'text' entry is a sequence of tokens — confirm against the
# preprocessing step that produced the parquet file.
words = []
for review_tokens in df_hosts_reviews_en_topics['text']:
    words.extend(review_tokens)
fdist = FreqDist(words)
# One row per distinct word, with its count in the 'Frequency' column.
freq_df = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])


In [None]:
# Bar chart of the 50 highest counts in freq_df, in the accent colour.
most_common = freq_df.nlargest(50, 'Frequency')
bar_options = dict(
    x=most_common.index,
    y='Frequency',
    labels={'x':'Word', 'y':'Frequency'},
    title='Most frequent words',
)
fig = px.bar(most_common, **bar_options)
fig.update_traces(marker_color=MAIN_COLOR)
# Tilt the x tick labels.
fig.update_layout(xaxis_tickangle=-45)

In [None]:
# Word frequencies of the single longest review.
# This cell needs df_hosts_reviews_en_labeled from the earlier load cell; the
# NameError recorded with this cell comes from running it without that frame.
# `.str.len()` replaces `.apply(len)`: it returns NaN for a missing comment
# instead of raising TypeError, and idxmax() skips NaN values.
review_lengths = df_hosts_reviews_en_labeled['comments'].str.split().str.len()
max_comment_idx = review_lengths.idxmax()
# NOTE(review): the label found in the labelled frame is looked up in the
# topics frame, which assumes both frames share the same index — confirm.
fdist = FreqDist(df_hosts_reviews_en_topics.loc[max_comment_idx, 'text'])
# One row per distinct word, with its count in the 'Frequency' column.
freq_df_max_review = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])

NameError: name 'df_hosts_reviews_en_labeled' is not defined

In [None]:
# Bar chart of the 50 most frequent words in the longest review.
# The top-50 frame is computed once and used for both the bars and the x
# labels. Previously the bars came from `sort_values(...).nlargest(...)` and
# the labels from a separate `nlargest(...)`, so words with tied counts could
# be ordered differently in the two and end up under a different bar; the
# extra sort was redundant because nlargest already returns rows in
# descending order.
top_words_max_review = freq_df_max_review.nlargest(50, 'Frequency')
fig = px.bar(top_words_max_review,
             x=top_words_max_review.index, y='Frequency',
             labels={'x':'Word', 'y':'Frequency'},
             title='Most frequent words')
fig.update_traces(marker_color=MAIN_COLOR)
# Tilt the x tick labels.
fig.update_layout(xaxis_tickangle=-45)