In [136]:
from pathlib import Path
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import umap
import gensim
import warnings
warnings.filterwarnings("ignore")

In [167]:
# Project root, taken as the parent of the kernel's current working directory;
# the value therefore depends on where the kernel was started.
BASE_DIR = Path.cwd().parent
# Plotly Express's qualitative "Prism" colour list, passed as the colour scale
# of the density heatmap further down.
PALETTE = px.colors.qualitative.Prism

In [138]:
# Read the host/review table from the processed-data folder under BASE_DIR.
df_hosts_reviews_en = pd.read_parquet(
    BASE_DIR.joinpath('processed_data', 'hosts_reviews_en_topics.parquet')
)

In [139]:
# Keep only rows where both the review text and the host description are
# flagged as English.
# .copy() makes df_texts an independent frame: later cells add columns to it,
# and without the copy those writes target a slice of the loaded frame (the
# resulting SettingWithCopyWarning is hidden by the global warnings filter).
df_texts = df_hosts_reviews_en.query('(comments_language == "en") & (description_language == "en")').copy()

In [140]:
# define a function to calculate Jaccard similarity
def jaccard_similarity(col_1, col_2, tokenizer=None):
    """Return the Jaccard similarity between the token sets of two texts.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare.
    tokenizer : callable, optional
        Function mapping a string to an iterable of tokens. When omitted,
        NLTK's ``word_tokenize`` is used.

    Returns
    -------
    float
        Size of the intersection of the two token sets divided by the size
        of their union; 0.0 when neither text produces any token.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    tokens_1 = set(tokenizer(col_1))
    tokens_2 = set(tokenizer(col_2))
    union = tokens_1 | tokens_2
    if not union:
        # Both texts are empty after tokenisation: avoid dividing by zero.
        return 0.0
    return len(tokens_1 & tokens_2) / len(union)


# apply the Jaccard similarity function to each (review, host description) pair
# The two columns are iterated directly instead of using
# DataFrame.apply(axis=1), which builds a Series for every row; the values
# are written back in row order.
df_texts['jaccard_similarity'] = [
    jaccard_similarity(comment, description)
    for comment, description in zip(df_texts['comments'], df_texts['host_description'])
]


In [141]:
# define a function to calculate cosine similarity
def calculate_cosine_similarity(col_1, col_2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two texts passed in, so
    the vocabulary and IDF weights come from this pair alone.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare.

    Returns
    -------
    float
        Cosine similarity of the two TF-IDF rows; 0.0 when the vectorizer
        cannot build a vocabulary from the pair.
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([col_1, col_2])
    except ValueError:
        # TfidfVectorizer raises ValueError ("empty vocabulary") when neither
        # text yields a token; treat such a pair as having no similarity.
        return 0.0
    # cosine_similarity of the 2-row matrix is 2x2; [0][1] is text 1 vs text 2.
    return cosine_similarity(tfidf_matrix)[0][1]

# apply the cosine similarity function to each (review, host description) pair
# The two columns are iterated directly instead of using
# DataFrame.apply(axis=1), which builds a Series for every row; the values
# are written back in row order.
df_texts['cosine_similarity'] = [
    calculate_cosine_similarity(comment, description)
    for comment, description in zip(df_texts['comments'], df_texts['host_description'])
]

In [142]:
# Average of each similarity measure over all rows of df_texts.
df_texts.loc[:, ['jaccard_similarity', 'cosine_similarity']].mean()

jaccard_similarity    0.033023
cosine_similarity     0.050479
dtype: float64

In [143]:
# Largest value of each similarity measure over all rows of df_texts.
df_texts.loc[:, ['jaccard_similarity', 'cosine_similarity']].max()

jaccard_similarity    0.300000
cosine_similarity     0.488891
dtype: float64

In [144]:
# One row per (listing, neighbourhood, value score, price, dominant topic):
# the number of non-null `number_of_reviews` entries in the group and the mean
# cosine similarity, ordered from most to least similar.
cosine_sim_by_neighbourhood = (
    df_texts
    .groupby(by=['listing_id_encod', 'neighbourhood', 'review_scores_value',
                 'price', 'dominant_topic'])
    .agg(
        number_of_reviews=pd.NamedAgg(column='number_of_reviews', aggfunc='count'),
        cosine_similarity=pd.NamedAgg(column='cosine_similarity', aggfunc='mean'),
    )
    .sort_values(by='cosine_similarity', ascending=False)
    .reset_index()
)
cosine_sim_by_neighbourhood

Unnamed: 0,listing_id_encod,neighbourhood,review_scores_value,price,dominant_topic,number_of_reviews,cosine_similarity
0,2534.0,Hässelby-Vällingby,3.00,550.0,2,1,0.336677
1,436.0,Skarpnäcks,5.00,2500.0,1,2,0.250108
2,436.0,Skarpnäcks,5.00,2500.0,2,2,0.230351
3,777.0,Norrmalms,5.00,7400.0,2,1,0.216061
4,2519.0,Kungsholmens,4.81,2870.0,0,1,0.210848
...,...,...,...,...,...,...,...
4750,1897.0,Södermalms,4.75,1100.0,0,1,0.000000
4751,2432.0,Enskede-Årsta-Vantörs,5.00,1350.0,1,1,0.000000
4752,2281.0,Norrmalms,4.25,1200.0,2,5,0.000000
4753,1623.0,Rinkeby-Tensta,4.50,500.0,1,2,0.000000


In [168]:
# 2-D density of the grouped cosine similarities per neighbourhood.
# NOTE(review): PALETTE is a qualitative colour list used here as a continuous
# scale, so neighbouring density levels may get unrelated hues - confirm this
# is intended.
px.density_heatmap(
    data_frame=cosine_sim_by_neighbourhood,
    x='neighbourhood',
    y='cosine_similarity',
    color_continuous_scale=PALETTE,
)

In [146]:
# Show the column labels of df_texts at this point, including the two
# similarity columns added above.
df_texts.columns

Index(['date', 'listing_id_encod', 'listing_id', 'reviewer_id',
       'reviewer_name', 'comments', 'host_id', 'host_name', 'host_description',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'comments_language',
       'description_language', 'dominant_topic', 'percent_contribution',
       'topic_keywords', 'text', 'jaccard_similarity', 'cosine_similarity'],
      dtype='object')

In [147]:
# Heatmap of the pairwise correlations between price, review counts, the
# overall rating and the two similarity measures.
px.imshow(
    img=df_texts.loc[:, [
        'price',
        'number_of_reviews',
        'reviews_per_month',
        'number_of_reviews_ltm',
        'review_scores_rating',
        'jaccard_similarity',
        'cosine_similarity',
    ]].corr()
)

In [148]:
# One-hot encode the room type and neighbourhood columns; get_dummies replaces
# the two source columns with one indicator column per category.
# df_texts is overwritten, so the guard keeps this cell re-runnable: on a
# second execution the source columns are already gone and the call would
# otherwise raise a KeyError.
if {'room_type', 'neighbourhood'}.issubset(df_texts.columns):
    df_texts = pd.get_dummies(df_texts, columns=['room_type', 'neighbourhood'])

In [149]:
# Show the column labels again, now with the one-hot indicator columns in
# place of `room_type` and `neighbourhood`.
df_texts.columns

Index(['date', 'listing_id_encod', 'listing_id', 'reviewer_id',
       'reviewer_name', 'comments', 'host_id', 'host_name', 'host_description',
       'latitude', 'longitude', 'price', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'comments_language',
       'description_language', 'dominant_topic', 'percent_contribution',
       'topic_keywords', 'text', 'jaccard_similarity', 'cosine_similarity',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room',
       'neighbourhood_Bromma', 'neighbourhood_Enskede-Årsta-Vantörs',
       'neighbourhood_Farsta', 'neighbourhood_Hägersten-Liljeholmens',
       'neighbourhood_Hässelby-Vällingby', 'neighbourhood_Kungsholmens',
       'neighbourho

In [150]:
# Wide feature table: topic label, listing metrics, review scores, cosine
# similarity and the one-hot room-type / neighbourhood indicators.
# NOTE(review): both text_umap and X are reassigned by the following cell
# before UMAP is fitted, so this selection does not feed the embedding.
text_umap = df_texts.loc[:, [
    'dominant_topic',
    'price',
    'number_of_reviews',
    'reviews_per_month',
    'number_of_reviews_ltm',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'cosine_similarity',
    'room_type_Entire home/apt',
    'room_type_Hotel room',
    'room_type_Private room',
    'neighbourhood_Bromma',
    'neighbourhood_Enskede-Årsta-Vantörs',
    'neighbourhood_Farsta',
    'neighbourhood_Hägersten-Liljeholmens',
    'neighbourhood_Hässelby-Vällingby',
    'neighbourhood_Kungsholmens',
    'neighbourhood_Norrmalms',
    'neighbourhood_Rinkeby-Tensta',
    'neighbourhood_Skarpnäcks',
    'neighbourhood_Skärholmens',
    'neighbourhood_Spånga-Tensta',
    'neighbourhood_Södermalms',
    'neighbourhood_Älvsjö',
    'neighbourhood_Östermalms',
]]
# Feature matrix: everything except the topic label.
X = text_umap.drop(columns='dominant_topic')

In [151]:
# Reduced feature table for the embedding: topic label plus five numeric
# features, down-sampled to at most 2000 rows.
# random_state makes the sample (and therefore the embedding and plot)
# reproducible across runs; min(...) keeps the cell working when fewer than
# 2000 rows are available, where a fixed n would raise.
text_umap = df_texts[['dominant_topic', 'price', 'number_of_reviews',
                      'reviews_per_month',
                      'review_scores_rating',
                      'cosine_similarity'
                      ]].sample(n=min(2000, len(df_texts)), random_state=42)
# Feature matrix: everything except the topic label, which is only used for colouring.
X = text_umap.drop('dominant_topic', axis=1)

In [152]:
# UMAP reducer configured with a very small neighbourhood, a large minimum
# distance, cosine metric and a fixed seed.
umap_ = umap.UMAP(
    n_neighbors=3,
    min_dist=0.9,
    metric='cosine',
    random_state=42,
)

In [153]:
# Fit the reducer on the feature matrix and keep the resulting embedding;
# the first two columns are used as plot coordinates in the next cell.
text_umap_trans = umap_.fit_transform(X)

In [154]:
# Store the two embedding coordinates next to the sampled rows.
text_umap['x'] = text_umap_trans[:, 0]
text_umap['y'] = text_umap_trans[:, 1]

In [155]:
# Scatter of the embedding, coloured by dominant topic.
# dominant_topic is an integer label; Plotly Express maps numeric columns to a
# continuous colour bar, so it is cast to string (on a temporary copy made by
# assign, leaving text_umap unchanged) to get one discrete colour per topic.
px.scatter(text_umap.assign(dominant_topic=text_umap['dominant_topic'].astype(str)),
           x='x', y='y', color='dominant_topic')