# Cosine and Jaccard similarities

In [2]:
# import libraries
from pathlib import Path
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [3]:
# notebook-wide defaults: plot colour palette and project root
# (the project root is taken to be the parent of the current working directory)
PALETTE = px.colors.qualitative.Prism
BASE_DIR = Path.cwd().parent

In [4]:
# read the hosts/reviews topics table from the processed_data folder
topics_parquet = BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet'
df_hosts_reviews_en = pd.read_parquet(topics_parquet)

In [5]:
# keep only the rows where both the review and the description are flagged as English.
# An explicit copy is taken because new columns are assigned to df_texts in later cells:
# without it those assignments target a slice of df_hosts_reviews_en and trigger pandas'
# SettingWithCopyWarning (which the blanket warnings filter above would silently hide).
df_texts = (
    df_hosts_reviews_en
    .query('(comments_language == "en") & (description_language == "en")')
    .copy()
)

In [6]:
# UDF to calculate Jaccard similarity
def jaccard_similarity(col_1, col_2):
    """Return the Jaccard similarity of two text strings.

    Each string is split with ``word_tokenize`` and reduced to its set of
    distinct tokens; the score is the number of shared tokens divided by the
    number of tokens in the union of both sets.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare.

    Returns
    -------
    float
        ``len(intersection) / len(union)``, or ``0.0`` when neither text
        yields any token.
    """
    tokens_1 = set(word_tokenize(col_1))
    tokens_2 = set(word_tokenize(col_2))
    union = tokens_1 | tokens_2
    if not union:
        # both texts are empty of tokens: dividing by len(union) would raise
        # ZeroDivisionError, so report "nothing in common" instead
        return 0.0
    return len(tokens_1 & tokens_2) / len(union)


# score every row: Jaccard similarity between the review text and the host description
# NOTE(review): rows were filtered on description_language, while the text compared here
# is host_description — confirm this is the intended column.
df_texts['jaccard_similarity'] = df_texts.apply(
    lambda pair: jaccard_similarity(pair['comments'], pair['host_description']),
    axis=1,
)


In [7]:
# UDF to calculate cosine similarity
def calculate_cosine_similarity(col_1, col_2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and the IDF weights come from this pair alone.

    Parameters
    ----------
    col_1, col_2 : str
        The two texts to compare.

    Returns
    -------
    float
        Cosine similarity of the two TF-IDF vectors, or ``0.0`` when the
        vectorizer finds no token in either text.

    Raises
    ------
    ValueError
        Any ``ValueError`` from the vectorizer other than the
        "empty vocabulary" case is re-raised unchanged.
    """
    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([col_1, col_2])
    except ValueError as err:
        # the vectorizer raises "empty vocabulary" when neither text contains a
        # usable token; such a pair has nothing in common
        if "empty vocabulary" not in str(err):
            raise
        return 0.0
    # the pairwise matrix is 2x2; entry [0][1] compares the first text with the second
    return cosine_similarity(tfidf_matrix)[0][1]

# score every row: TF-IDF cosine similarity between the review text and the host description
df_texts['cosine_similarity'] = df_texts.apply(
    lambda pair: calculate_cosine_similarity(pair['comments'], pair['host_description']),
    axis=1,
)

In [8]:
# mean of each similarity score over all rows of df_texts
df_texts.loc[:, ['jaccard_similarity', 'cosine_similarity']].mean()

jaccard_similarity    0.033042
cosine_similarity     0.050484
dtype: float64

In [9]:
# largest value of each similarity score over all rows of df_texts
df_texts.loc[:, ['jaccard_similarity', 'cosine_similarity']].max()

jaccard_similarity    0.300000
cosine_similarity     0.488891
dtype: float64

In [11]:
# average cosine similarity per listing, highest first.
# Despite the variable name, the grouping is per listing (listing id plus its
# neighbourhood, review score, price and dominant topic), not per neighbourhood.
# number_of_reviews here is the count of non-null number_of_reviews values in each group.
# NOTE(review): with pandas' default groupby settings, rows with a missing value in any
# grouping key are presumably left out — confirm that is intended.
listing_keys = ['listing_id_encod', 'neighbourhood',
                'review_scores_value', 'price',
                'dominant_topic']
per_listing = df_texts.groupby(listing_keys).agg(
    number_of_reviews=('number_of_reviews', 'count'),
    cosine_similarity=('cosine_similarity', 'mean'),
)
cosine_sim_by_neighbourhood = (
    per_listing
    .sort_values('cosine_similarity', ascending=False)
    .reset_index()
)
cosine_sim_by_neighbourhood

Unnamed: 0,listing_id_encod,neighbourhood,review_scores_value,price,dominant_topic,number_of_reviews,cosine_similarity
0,2534.0,Hässelby-Vällingby,3.00,550.0,1,1,0.336677
1,436.0,Skarpnäck,5.00,2500.0,0,3,0.268278
2,2102.0,Bromma,5.00,1195.0,1,1,0.224910
3,777.0,Norrmalm,5.00,7400.0,1,1,0.216061
4,194.0,Norrmalm,4.88,5000.0,2,1,0.215819
...,...,...,...,...,...,...,...
4749,539.0,Norrmalm,4.89,1600.0,1,1,0.000000
4750,2436.0,Spånga-Tensta,4.00,200.0,0,1,0.000000
4751,2041.0,Södermalm,4.88,2000.0,1,3,0.000000
4752,759.0,Södermalm,4.25,850.0,2,1,0.000000


In [12]:
# plot how the per-listing average cosine similarity is distributed within each neighbourhood;
# a title and readable axis labels are set so the figure can be understood on its own
px.density_heatmap(
    cosine_sim_by_neighbourhood,
    x='neighbourhood',
    y='cosine_similarity',
    color_continuous_scale=PALETTE,
    title='Review vs. host description cosine similarity by neighbourhood',
    labels={
        'neighbourhood': 'Neighbourhood',
        'cosine_similarity': 'Average cosine similarity per listing',
    },
)