In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import umap
import gensim
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project root: the parent of the directory the notebook is executed from.
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent
# Plotly's qualitative "Prism" colour list, used by plots in this notebook.
PALETTE = px.colors.qualitative.Prism

In [3]:
# Load the pre-processed reviews table (parquet) from the processed_data folder.
hosts_reviews_path = BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet'
df_hosts_reviews_en = pd.read_parquet(hosts_reviews_path)

In [4]:
# Keep only rows where both the review text and the host description are English.
# .copy() makes df_texts an independent frame: later cells add columns to it,
# which on a filtered slice triggers SettingWithCopyWarning (currently hidden
# by the global warnings filter).
df_texts = df_hosts_reviews_en.query('(comments_language == "en") & (description_language == "en")').copy()

In [5]:
# define a function to calculate Jaccard similarity
def jaccard_similarity(col_1, col_2):
    """Return the Jaccard similarity of the token sets of two texts.

    The score is ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of tokens
    produced by ``word_tokenize`` for each text.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare. Missing values (anything that is not a
        string, e.g. None or NaN) and blank strings are treated as having
        no tokens.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 when neither text has any token,
        instead of dividing by zero.
    """
    def _token_set(text):
        # Non-strings and blank strings carry no tokens; skip the tokenizer.
        if not isinstance(text, str) or not text.strip():
            return set()
        return set(word_tokenize(text))

    tokens_1 = _token_set(col_1)
    tokens_2 = _token_set(col_2)
    union = tokens_1 | tokens_2
    if not union:
        # Both texts are empty: similarity is defined as 0 here.
        return 0.0
    return len(tokens_1 & tokens_2) / len(union)


# compute the Jaccard similarity between each review and its host's description
text_pairs = df_texts[['comments', 'host_description']]
df_texts['jaccard_similarity'] = text_pairs.apply(
    lambda pair: jaccard_similarity(pair['comments'], pair['host_description']),
    axis=1,
)


In [6]:
# define a function to calculate cosine similarity
def calculate_cosine_similarity(col_1, col_2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two texts, so the
    vocabulary and IDF weights are specific to this pair.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare. Missing values (anything that is not a
        string, e.g. None or NaN) and blank strings are treated as empty.

    Returns
    -------
    float
        The cosine similarity of the two TF-IDF vectors. Returns 0.0 when
        either text is empty or when the vectorizer finds no usable token
        in the pair.
    """
    # An empty text has an all-zero vector, so its similarity to anything is 0.
    for text in (col_1, col_2):
        if not isinstance(text, str) or not text.strip():
            return 0.0

    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([col_1, col_2])
    except ValueError:
        # fit_transform raises ValueError when the vocabulary is empty,
        # i.e. no token in either text survives the vectorizer's tokenization.
        return 0.0
    # Entry [0][1] of the pairwise matrix is the similarity between the two texts.
    return float(cosine_similarity(tfidf_matrix)[0][1])

# compute the TF-IDF cosine similarity between each review and its host's description
text_pairs = df_texts[['comments', 'host_description']]
df_texts['cosine_similarity'] = text_pairs.apply(
    lambda pair: calculate_cosine_similarity(pair['comments'], pair['host_description']),
    axis=1,
)

In [7]:
# average of each similarity measure over all rows of df_texts
similarity_cols = ['jaccard_similarity', 'cosine_similarity']
df_texts[similarity_cols].mean()

jaccard_similarity    0.033060
cosine_similarity     0.050534
dtype: float64

In [8]:
# largest value of each similarity measure over all rows of df_texts
similarity_cols = ['jaccard_similarity', 'cosine_similarity']
df_texts[similarity_cols].max()

jaccard_similarity    0.300000
cosine_similarity     0.488891
dtype: float64

In [9]:
# Aggregate per (listing, neighbourhood, value score, price, topic) group:
#   number_of_reviews  -> count of non-null number_of_reviews entries in the group
#   cosine_similarity  -> mean cosine similarity of the group's rows
# The result is ordered from the most to the least similar group.
listing_keys = ['listing_id_encod', 'neighbourhood',
                'review_scores_value', 'price',
                'dominant_topic']
per_listing = df_texts.groupby(listing_keys).agg(
    number_of_reviews=('number_of_reviews', 'count'),
    cosine_similarity=('cosine_similarity', 'mean'),
)
cosine_sim_by_neighbourhood = (
    per_listing
    .sort_values('cosine_similarity', ascending=False)
    .reset_index()
)
cosine_sim_by_neighbourhood

Unnamed: 0,listing_id_encod,neighbourhood,review_scores_value,price,dominant_topic,number_of_reviews,cosine_similarity
0,2534.0,Hässelby-Vällingby,3.00,550.0,0,1,0.336677
1,436.0,Skarpnäcks,5.00,2500.0,1,3,0.268278
2,575.0,Södermalms,4.75,318.0,1,1,0.264211
3,759.0,Södermalms,4.25,850.0,0,2,0.236402
4,2102.0,Bromma,5.00,1195.0,0,1,0.224910
...,...,...,...,...,...,...,...
4844,1459.0,Södermalms,4.86,499.0,0,1,0.000000
4845,2649.0,Östermalms,4.67,290.0,0,1,0.000000
4846,1461.0,Södermalms,5.00,1550.0,1,1,0.000000
4847,225.0,Rinkeby-Tensta,5.00,1000.0,2,1,0.000000


In [10]:
# 2-D histogram of the aggregated rows by neighbourhood and mean cosine similarity.
# Bin counts are ordered values, so a sequential colour scale is used; the
# qualitative PALETTE has no inherent ordering and makes counts hard to compare.
px.density_heatmap(cosine_sim_by_neighbourhood,
           x='neighbourhood', y='cosine_similarity',
           color_continuous_scale=px.colors.sequential.Viridis,
           title='Review / host-description cosine similarity by neighbourhood',
           labels={'neighbourhood': 'Neighbourhood',
                   'cosine_similarity': 'Mean cosine similarity'})

In [11]:
# list the columns currently available in df_texts
df_texts.columns

Index(['date', 'listing_id_encod', 'listing_id', 'reviewer_id',
       'reviewer_name', 'comments', 'host_id', 'host_name', 'host_description',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'comments_language',
       'description_language', 'dominant_topic', 'percent_contribution',
       'topic_keywords', 'text', 'jaccard_similarity', 'cosine_similarity'],
      dtype='object')

In [12]:
# pairwise correlations between listing metrics and the two similarity scores
corr_columns = ['price', 'number_of_reviews',
                'reviews_per_month', 'number_of_reviews_ltm',
                'review_scores_rating', 'jaccard_similarity', 'cosine_similarity']
corr_matrix = df_texts[corr_columns].corr()
px.imshow(corr_matrix)

In [13]:
# One-hot encode the categorical columns. df_texts is overwritten, so when the
# cell is run a second time the source columns are already gone; encode only
# the ones still present so a re-run is a no-op instead of a KeyError.
categorical_cols = [col for col in ['room_type', 'neighbourhood'] if col in df_texts.columns]
if categorical_cols:
    df_texts = pd.get_dummies(df_texts, columns=categorical_cols)

In [14]:
# inspect the columns again to see the room_type_* / neighbourhood_* dummy columns
df_texts.columns

Index(['date', 'listing_id_encod', 'listing_id', 'reviewer_id',
       'reviewer_name', 'comments', 'host_id', 'host_name', 'host_description',
       'latitude', 'longitude', 'price', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'comments_language',
       'description_language', 'dominant_topic', 'percent_contribution',
       'topic_keywords', 'text', 'jaccard_similarity', 'cosine_similarity',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room',
       'neighbourhood_Bromma', 'neighbourhood_Enskede-Årsta-Vantörs',
       'neighbourhood_Farsta', 'neighbourhood_Hägersten-Liljeholmens',
       'neighbourhood_Hässelby-Vällingby', 'neighbourhood_Kungsholmens',
       'neighbourho

In [15]:
# Wide feature set for UMAP: numeric listing metrics plus one-hot columns.
# NOTE: text_umap and X are both reassigned by the next cell before being used.
# NOTE: 'room_type_Shared room' is not part of the selection.
numeric_features = ['price', 'number_of_reviews',
                    'reviews_per_month', 'number_of_reviews_ltm',
                    'review_scores_rating', 'review_scores_accuracy',
                    'review_scores_cleanliness', 'review_scores_checkin',
                    'review_scores_communication', 'review_scores_location', 'review_scores_value',
                    'cosine_similarity']
room_type_features = ['room_type_Entire home/apt',
                      'room_type_Hotel room', 'room_type_Private room']
neighbourhood_features = ['neighbourhood_Bromma', 'neighbourhood_Enskede-Årsta-Vantörs',
                          'neighbourhood_Farsta', 'neighbourhood_Hägersten-Liljeholmens',
                          'neighbourhood_Hässelby-Vällingby', 'neighbourhood_Kungsholmens',
                          'neighbourhood_Norrmalms', 'neighbourhood_Rinkeby-Tensta',
                          'neighbourhood_Skarpnäcks', 'neighbourhood_Skärholmens',
                          'neighbourhood_Spånga-Tensta', 'neighbourhood_Södermalms',
                          'neighbourhood_Älvsjö', 'neighbourhood_Östermalms']
# dominant_topic comes first so it can be dropped to obtain the feature matrix
text_umap = df_texts[['dominant_topic'] + numeric_features + room_type_features + neighbourhood_features]
X = text_umap.drop(columns='dominant_topic')

In [16]:
# Reduced feature set on a random subset of rows (presumably to keep UMAP tractable).
# The sample is seeded so the embedding is reproducible across runs, and its
# size is capped at the number of available rows because sample() raises when
# asked for more rows than exist.
umap_columns = ['dominant_topic', 'price', 'number_of_reviews',
                'reviews_per_month',
                'review_scores_rating',
                'cosine_similarity']
text_umap = df_texts[umap_columns].sample(n=min(2000, len(df_texts)), random_state=42)
# feature matrix: everything except the topic label used for colouring
X = text_umap.drop('dominant_topic', axis=1)

In [17]:
# UMAP configuration: cosine metric, 3 neighbours, seeded for repeatability
umap_params = dict(n_neighbors=3, min_dist=0.9, metric='cosine', random_state=42)
umap_ = umap.UMAP(**umap_params)

In [18]:
# fit UMAP on the feature matrix X and keep the resulting embedding coordinates
text_umap_trans = umap_.fit_transform(X)

In [19]:
# store the first two embedding dimensions as plotting coordinates
text_umap['x'] = text_umap_trans[:, 0]
text_umap['y'] = text_umap_trans[:, 1]

In [20]:
# dominant_topic holds topic ids; cast it to str so Plotly treats the topics as
# categories (discrete colours and a legend) instead of drawing a continuous
# colour bar over numeric ids. The qualitative PALETTE supplies the colours.
px.scatter(text_umap.assign(dominant_topic=text_umap['dominant_topic'].astype(str)),
                 x='x', y='y', color='dominant_topic',
                 color_discrete_sequence=PALETTE,
                 title='UMAP projection of listing features, coloured by dominant topic')