In [1]:
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
import umap

In [2]:
# Base directory for all data paths: the parent of the kernel's current
# working directory.
# NOTE(review): assumes the notebook is run from a direct sub-folder of the
# project root — TODO confirm.
_working_dir = Path.cwd()
BASE_DIR = _working_dir.parent

In [3]:
# Reviews frame with topic assignments (judging by the file name); later
# cells read its 'neighbourhood', 'comments' and 'dominant_topic' columns.
topics_parquet_path = BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet'
df_hosts_reviews_en_labeled = pd.read_parquet(topics_parquet_path)

In [4]:
# Keep only the reviews from the two neighbourhoods being compared.
selected_neighbourhoods = ['Östermalms', 'Enskede-Årsta-Vantörs']
is_selected = df_hosts_reviews_en_labeled['neighbourhood'].isin(selected_neighbourhoods)
df_hosts_reviews_en_labeled = df_hosts_reviews_en_labeled[is_selected]

In [5]:
# X holds the review text ('comments'); y holds the 'dominant_topic' column
# of the same rows.
X = df_hosts_reviews_en_labeled.loc[:, 'comments']
y = df_hosts_reviews_en_labeled.loc[:, 'dominant_topic']

In [6]:
# Display the reviews whose text mentions one of the neighbourhood names
# (presumably to see how often the area names leak into the review text).
# `na=False` makes missing comments count as non-matches, so the result is a
# clean boolean mask and no comparison against `True` is needed.
X[X.str.contains('enskede|årsta|vantörs|östermalm', na=False)]

1720     staying hosts östermalm comfortable flat well ...
1838     hosts apartment beautiful cozy rooftop views a...
1863     studio quiet cozy comfortable little place clo...
1876     amazing studio center östermalm place perfectl...
1883     small bur lovely little studio middle östermal...
                               ...                        
66815    great location peaceful neighbourhood close ka...
69187    matthias really nice responsive apartment desc...
69304    spotless cozy home center stockholm 5minute wa...
69307    loved place clean cosy top apartment super cen...
69402    great apartment östermalm short walk saluhall ...
Name: comments, Length: 109, dtype: object

In [7]:
# Remove the neighbourhood names from the review text. Each match is replaced
# by an empty string, so the surrounding whitespace is left as it was.
neighbourhood_pattern = 'enskede|årsta|vantörs|östermalm'
X = X.str.replace(neighbourhood_pattern, repl='', regex=True)

In [8]:
# Second reviews frame; later cells read its 'neighbourhood' and
# 'sentiment_label_class' columns.
labeled_parquet_path = BASE_DIR / 'processed_data' / 'hosts_reviews_en_labeled.parquet'
df = pd.read_parquet(labeled_parquet_path)

In [9]:
# Restrict `df` to the same two neighbourhoods used for the topics frame.
neighbourhoods_to_keep = ['Östermalms', 'Enskede-Årsta-Vantörs']
df = df[df['neighbourhood'].isin(neighbourhoods_to_keep)]

In [10]:
# TF-IDF features over unigrams and bigrams with English stop words removed.
# min_df=5 drops terms seen in fewer than 5 documents; max_df=0.5 drops terms
# seen in more than half of the documents.
tfidf_params = dict(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.5,
)
vectorizer = TfidfVectorizer(**tfidf_params)
X_vect = vectorizer.fit_transform(X)

In [11]:
# Project the TF-IDF matrix to 3 UMAP components using cosine distance.
# random_state is fixed so repeated runs give the same embedding.
reducer = umap.UMAP(
    n_components=3,
    n_neighbors=300,
    min_dist=0.9,
    metric='cosine',
    random_state=42,
)
umap_emb = reducer.fit_transform(X_vect)

In [12]:
# Marker colour used for each neighbourhood in the scatter plot.
omrade_colors = {
    'Östermalms': 'tomato',
    'Enskede-Årsta-Vantörs': 'steelblue',
}

In [13]:
# 2-D scatter of the first two UMAP components, one marker per review,
# coloured by neighbourhood. Hovering a marker shows the review text.
neighbourhood_marker_colors = [
    omrade_colors[name] for name in df_hosts_reviews_en_labeled['neighbourhood']
]
trace = go.Scatter(
    x=umap_emb[:, 0],
    y=umap_emb[:, 1],
    mode='markers',
    text=df_hosts_reviews_en_labeled['comments'],
    marker=dict(
        size=5,
        color=neighbourhood_marker_colors,
    ),
)

data = [trace]
# The title names the data actually plotted (host reviews) and the colouring;
# axis titles make the figure readable on its own.
layout = go.Layout(
    title='UMAP Visualization of Host Reviews by Neighbourhood',
    xaxis=dict(title='UMAP 1'),
    yaxis=dict(title='UMAP 2'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()


In [15]:
# 3-D scatter: x and y are the first two UMAP components, z is the
# 'dominant_topic' value of each review (the third UMAP component is not
# plotted here).
# NOTE(review): marker colours are read from `df`, which is loaded from a
# different parquet file than the frame that produced `umap_emb`. This assumes
# both frames contain the same rows in the same order — TODO confirm.
trace = go.Scatter3d(
    x=umap_emb[:, 0],
    y=umap_emb[:, 1],
    z=df_hosts_reviews_en_labeled['dominant_topic'],
    mode='markers',
    marker=dict(
        size=5,
        color=df['sentiment_label_class'],
        colorscale='Viridis',
        opacity=0.8,
    ),
)

data = [trace]
# The title names the data actually plotted (host reviews); the z axis is
# labelled explicitly because it is a topic value, not an embedding coordinate.
layout = go.Layout(
    title='UMAP Visualization of Host Reviews by Dominant Topic',
    scene=dict(
        xaxis=dict(title='UMAP 1'),
        yaxis=dict(title='UMAP 2'),
        zaxis=dict(title='Dominant topic'),
    ),
)
fig = go.Figure(data=data, layout=layout)
fig.show()


In [16]:
# Distinct neighbourhood names left after filtering, in order of first
# appearance.
df_hosts_reviews_en_labeled['neighbourhood'].drop_duplicates().tolist()

['Enskede-Årsta-Vantörs', 'Östermalms']

In [17]:
# Show the colours in plotly's qualitative "Prism" palette.
prism_palette = px.colors.qualitative.Prism
print(prism_palette)

['rgb(95, 70, 144)', 'rgb(29, 105, 150)', 'rgb(56, 166, 165)', 'rgb(15, 133, 84)', 'rgb(115, 175, 72)', 'rgb(237, 173, 8)', 'rgb(225, 124, 5)', 'rgb(204, 80, 62)', 'rgb(148, 52, 110)', 'rgb(111, 64, 112)', 'rgb(102, 102, 102)']


In [18]:
# Colour lookup keyed by neighbourhood name: three fixed entries, followed by
# one Prism colour for each neighbourhood present in the filtered reviews
# (a repeated key keeps its position and takes the Prism colour).
# NOTE(review): despite the variable name, the keys here are neighbourhoods,
# not room types.
present_neighbourhoods = df_hosts_reviews_en_labeled['neighbourhood'].unique().tolist()
room_type_colors = {
    'Hässelby-Vällingby': 'blue',
    'Rinkeby-Tensta': 'green',
    'Spånga-Tensta': 'red',
    **dict(zip(present_neighbourhoods, px.colors.qualitative.Prism)),
}

In [19]:
# Colour lookup keyed by room type.
room_type_colors = dict(
    zip(
        ('Entire home/apt', 'Private room', 'Shared room', 'Hotel room'),
        ('blue', 'green', 'red', 'yellow'),
    )
)

In [20]:
# 2-D scatter of the first two UMAP components, coloured by the
# 'sentiment_label_class' column of `df`.
# NOTE(review): `df` is loaded from a different parquet file than the frame
# that produced `umap_emb`. This assumes both frames contain the same rows in
# the same order — TODO confirm.
trace = go.Scatter(
    x=umap_emb[:, 0],
    y=umap_emb[:, 1],
    mode='markers',
    marker=dict(
        size=5,
        color=df['sentiment_label_class'],
        # The colorbar is labelled after the column that drives the colour.
        colorbar=dict(title='Sentiment label class'),
    ),
)

data = [trace]
# The title names the data actually plotted (host reviews) and the colouring.
layout = go.Layout(
    title='UMAP Visualization of Host Reviews by Sentiment',
    xaxis=dict(title='UMAP 1'),
    yaxis=dict(title='UMAP 2'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()