In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from langdetect import detect
import re
import plotly.express as px
import plotly.io as pio

In [2]:
pio.templates.default = 'plotly_white'
BASE_DIR = Path.cwd().parent

In [3]:
def create_df(file_names, cols, drop_cols, merge_col='id'):
  dfs = [pd.read_csv(BASE_DIR / 'row_data' / file) for file in file_names]
  df = dfs[0].merge(dfs[1][cols], on=merge_col, how='inner').drop(drop_cols, axis=1)
  return df

In [4]:
def missing_values(df):
    nan = pd.DataFrame({
    'Missing Values': 
        df.isnull().sum().sort_values(ascending=False),
    'Missing Values, %': 
        (df.isnull().sum()
            .sort_values(ascending=False)/len(df) * 100)
            .astype(int)
        })
    return nan[nan['Missing Values'] > 0]

In [4]:
file_names = ['listings_short.csv', 'listings.csv']
cols = ['id', 'description', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value']
drop_cols = ['license', 'neighbourhood_group', 'minimum_nights', 
             'calculated_host_listings_count', 'availability_365']
df_listing = create_df(file_names, cols, drop_cols)

In [27]:
df_listing.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,2798.0,3500.0,2798.0,2780.0,2780.0,2780.0,2780.0,2780.0,2780.0
mean,2.477073e+17,135140300.0,59.318811,18.031954,1739.584286,26.758857,1.054982,8.879143,4.755425,4.804396,4.736216,4.852939,4.857327,4.800428,4.714601
std,3.442714e+17,148979600.0,0.03,0.063833,11324.378729,57.409084,1.468412,19.706462,0.537539,0.37006,0.424896,0.353636,0.344294,0.321867,0.385781
min,32984.0,4457.0,59.23195,17.77311,0.0,0.0,0.01,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,23033410.0,21072600.0,59.300792,18.004033,714.0,1.0,0.19,0.0,4.72,4.76,4.67,4.83,4.85,4.73,4.62
50%,44029510.0,64917660.0,59.3179,18.04954,1100.0,6.0,0.45,2.0,4.89,4.91,4.86,4.96,4.98,4.895,4.8
75%,6.482125e+17,220860400.0,59.33787,18.076097,1714.0,24.0,1.16,8.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,8.580712e+17,507582600.0,59.417742,18.16607,480500.0,976.0,18.94,571.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [30]:
missing_values(df_listing)

Unnamed: 0,Missing Values,"Missing Values, %"
review_scores_value,684,20
review_scores_location,684,20
review_scores_communication,684,20
review_scores_checkin,684,20
review_scores_cleanliness,684,20
review_scores_accuracy,684,20
reviews_per_month,668,19
review_scores_rating,668,19
last_review,668,19


In [29]:
df_listing = df_listing.dropna(subset=['name', 'description'])

In [31]:
df_listing['decr_lang'] = [detect(comment) for comment in df_listing['description']]
df_listing['decr_lang'].value_counts()

en    2404
sv     975
no       4
es       3
it       2
tr       2
fr       2
ru       1
id       1
vi       1
nl       1
sw       1
Name: decr_lang, dtype: int64

In [36]:
df_listing = df_listing.drop([765, 1436, 3318])

In [37]:
df_listing[df_listing['name'].apply(lambda x: len(x.split())).sort_values() == 1]

  df_listing[df_listing['name'].apply(lambda x: len(x.split())).sort_values() == 1]


Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,...,number_of_reviews_ltm,description,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,decr_lang
175,3892223,City/Room20sqm/Bed4sqm/Balcony5sqm,6422311,Christian,Norrmalms,59.34011,18.03048,Private room,610,36,...,0,"When you stay here includes fresh linen, towel...",4.97,4.86,4.94,5.0,4.94,4.94,4.89,en
487,12635870,Stockholm,68620876,Inger,Norrmalms,59.34464,18.06084,Entire home/apt,3100,56,...,8,This designed apartment is located in the city...,5.0,5.0,5.0,5.0,4.98,4.88,4.88,en
919,23875622,Rågsved,73463598,Marlena,Enskede-Årsta-Vantörs,59.25919,18.02751,Private room,383,125,...,16,One room in a 3 room flat. The flat is located...,4.88,4.93,4.89,4.9,4.96,4.59,4.88,en
1146,30686801,Livsten,229695091,Alex,Spånga-Tensta,59.38002,17.90026,Entire home/apt,2586,5,...,0,Welcome to our Stockholm Villa in Spånga area,5.0,4.6,4.6,5.0,5.0,4.8,4.6,sv
1331,35077568,Mariaberget,262031746,Vera,Södermalms,59.31965,18.06476,Private room,750,86,...,34,Small apartment in a building from the 18th ce...,4.83,4.83,4.94,4.87,4.86,4.97,4.84,en
1412,36502178,Stockholm,160072992,Erwa,Hägersten-Liljeholmens,59.29459,17.99186,Private room,350,16,...,9,A beautiful furnished apartment with all faci...,4.75,4.69,4.69,4.75,4.5,4.44,4.81,en
1521,39811232,Cosy,306161349,Tuija,Bromma,59.3405,17.98433,Entire home/apt,807,4,...,1,Hemma Hos mig Kan Du laga mat och Njuta av tid...,4.75,5.0,4.75,5.0,5.0,4.75,4.5,sv
1522,39811489,Cosy,306161349,Tuija,Bromma,59.34006,17.98238,Entire home/apt,779,19,...,11,Mysig lägenhet Du kan laga mat och njuta av en...,4.83,4.89,4.72,4.94,5.0,4.67,4.78,sv
1582,40830201,Gröndal/liljeholmen,317621159,Alireza,Hägersten-Liljeholmens,59.31473,17.9982,Entire home/apt,836,4,...,0,Välkommen till en nybyggd fräsch lägenhet i Gr...,4.75,5.0,4.75,4.75,4.75,5.0,4.5,sv
1584,40857805,CozyWarhol,318023749,Maria,Kungsholmens,59.33113,18.02746,Entire home/apt,800,24,...,6,"It's a cozy, colourful artsy 24 Sqm studio wi...",4.96,5.0,4.79,5.0,5.0,4.92,4.92,en


In [43]:
df_listing.tail()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,...,description,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,decr_lang,name_lang
3495,857170901712752341,Underbart boende vid vattnet,122883893,Sandra,Hässelby-Vällingby,59.358236,17.834544,Entire home/apt,1061,0,...,Underbar lägenhet i Hässelby med härlig balkon...,,,,,,,,sv,da
3496,857283451869597188,Minimalist Chic 2-Bedroom,4386635,Ozan,Södermalms,59.314885,18.061956,Entire home/apt,1398,0,...,Are you looking for a sunny and stylish apartm...,,,,,,,,en,en
3497,857676534629895609,Rymlig 3:a,507582586,Jessica,Farsta,59.256137,18.114366,Entire home/apt,597,0,...,Hela gruppen kommer att ha enkel tillgång till...,,,,,,,,sv,cy
3498,857843172832279237,Place on Södermalm Stockholm city,465046893,Thomas,Södermalms,59.315926,18.062228,Entire home/apt,800,0,...,A nice one room apartment on 40m2 in the middl...,,,,,,,,en,sv
3499,858071154096647138,Grön oas i liten Studiolägenhet,187680673,Maria,Norrmalms,59.345556,18.029101,Entire home/apt,680,0,...,Välkommen till en centralt belägen mysig liten...,,,,,,,,sv,sv


In [41]:
df_listing[['name_lang', 'decr_lang']]

Unnamed: 0,name_lang,decr_lang
0,sv,en
1,en,en
2,en,en
3,en,en
4,nl,en
...,...,...
3495,da,sv
3496,en,en
3497,cy,sv
3498,sv,en


In [39]:
df_listing['name_lang'] = [detect(comment) for comment in df_listing['name']]
df_listing['name_lang'].value_counts()

en    2054
sv    1053
ro      45
nl      42
fr      33
da      29
no      27
it      17
af      17
ca      14
et      11
de      10
fi       9
es       7
sk       5
cy       4
pt       3
sq       3
lt       2
pl       2
id       2
vi       1
hu       1
tl       1
so       1
hr       1
Name: name_lang, dtype: int64

In [None]:
df_listing

In [307]:
df_canceled_visits = (df_host_reviews[df_host_reviews['comments']
                                      .str.contains('The host canceled this')])
df_canceled_visits.to_csv((BASE_DIR / 'staging_data' / 'canceled_visits.csv'), index=False)

In [308]:
df_host_reviews = (df_host_reviews[~df_host_reviews
                            .isin(df_canceled_visits)]
                            .dropna()
                            .reset_index(drop=True))

In [309]:
df_host_reviews.to_parquet(BASE_DIR / 'staging_data' / 'hosts_reviews_cleaned.parquet')

here!

In [None]:
def text_preproc():
    pass

In [232]:
df_host_reviews['host_description']

0        City Hall, Stockholm <b>The space</b><br />Bea...
1        City Hall, Stockholm <b>The space</b><br />Bea...
2        City Hall, Stockholm <b>The space</b><br />Bea...
3        City Hall, Stockholm <b>The space</b><br />Bea...
4        City Hall, Stockholm <b>The space</b><br />Bea...
                               ...                        
93035    Room2stay Lev det enkla livet i detta fridfull...
93036    Spacious & luxurious 2br Apt on Östermalms-tor...
93037    Spacious & luxurious 2br Apt on Östermalms-tor...
93038    Spacious & luxurious 2br Apt on Östermalms-tor...
93039    Spacious & luxurious 2br Apt on Östermalms-tor...
Name: host_description, Length: 93040, dtype: object

In [231]:
df_host_reviews['host_description'].str.replace('[!#&/,\U0001f600-\U0001f6ff]', '', regex=True)

0        City Hall Stockholm <b>The space<b><br >Beauti...
1        City Hall Stockholm <b>The space<b><br >Beauti...
2        City Hall Stockholm <b>The space<b><br >Beauti...
3        City Hall Stockholm <b>The space<b><br >Beauti...
4        City Hall Stockholm <b>The space<b><br >Beauti...
                               ...                        
93035    Room2stay Lev det enkla livet i detta fridfull...
93036    Spacious  luxurious 2br Apt on Östermalms-torg...
93037    Spacious  luxurious 2br Apt on Östermalms-torg...
93038    Spacious  luxurious 2br Apt on Östermalms-torg...
93039    Spacious  luxurious 2br Apt on Östermalms-torg...
Name: host_description, Length: 93040, dtype: object

In [None]:
df_host_reviews['host_description']

In [167]:
df_host_reviews['host_decr_lang'] = [detect(comment) for comment in df_host_reviews['host_description']]

In [277]:
missing_values(df_listing)

Unnamed: 0,Missing Values,"Missing Values, %"


In [262]:
df_listing = df_listing.dropna()

In [263]:
df_listing['decr_lang'] = [detect(comment) for comment in df_listing['description']]
df_listing['decr_lang'].value_counts()

en    2033
sv     669
es       2
tr       2
id       1
de       1
vi       1
it       1
nl       1
no       1
fr       1
Name: decr_lang, dtype: int64

In [278]:
df_listing['name_lang'] = [detect(comment) for comment in df_listing['name']]
df_listing['name_lang'].value_counts()

LangDetectException: No features in text.

In [173]:
df_host_reviews[~df_host_reviews['host_decr_lang'].isin(['en', 'sv'])]['host_description']

26501    2.5 rooms, own kitchen. Basement apt-ment, Älv...
26502    2.5 rooms, own kitchen. Basement apt-ment, Älv...
26503    2.5 rooms, own kitchen. Basement apt-ment, Älv...
26504    2.5 rooms, own kitchen. Basement apt-ment, Älv...
26505    2.5 rooms, own kitchen. Basement apt-ment, Älv...
                               ...                        
88028    Villa esprit scandinave près du lac et du cent...
88029    Villa esprit scandinave près du lac et du cent...
90068    Comfortable apartment(Stockholm/Vällingby) Bu ...
90806    Cómoda habitación, valor por persona 700kr. Ha...
90807    Cómoda habitación, valor por persona 700kr. Ha...
Name: host_description, Length: 313, dtype: object

In [None]:
df_host_reviews['comments']

In [112]:
df_host_reviews['comments_lang'] = [detect(comment) for comment in df_host_reviews['comments']]

LangDetectException: No features in text.

In [113]:
df_host_reviews['comments_lang'].value_counts()

KeyError: 'comments_lang'

In [158]:
df_host_reviews['comments'].apply(lambda x: len(x.split())).sort_values()

67119      1
24177      1
49302      1
84010      1
34439      1
        ... 
38749    674
72586    714
35770    722
12942    849
61048    999
Name: comments, Length: 93621, dtype: int64

In [162]:
detect(df_host_reviews['comments'][93517])

'af'

In [166]:
df_host_reviews[df_host_reviews['comments'].apply(lambda x: len(x.split())) == 1]['comments']

132      Lidia的公寓附近有地铁站<br/>和开往机场的火车站，距离都不到十分钟，关键是很容易找到...
263      Li是一個非常熱情和友善的房東，她為我們在斯德哥爾摩的行程提供了很多建議。房間非常整潔，設備...
340      房东很好，我们去前和她联系告诉到的时间，她就在楼下等我们。还告诉我们交通线路。下次到斯德哥尔...
386                                                      .
521      我们非常幸运，遇到Tina这么好的房东！房间干净整洁，每周房东还过来打扫卫生。对我们照顾的很...
                               ...                        
93310                                           Fantastic!
93322                                                 Good
93437                                                추천합니다
93517                                                   Ok
93585                        很不错<br/>向房东询问推荐的餐厅他们也很热情的告知了。
Name: comments, Length: 1651, dtype: object