# Data Cleaning

In [1]:
# import libraries
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from langdetect import detect
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
import plotly.io as pio

In [5]:
#run it once
#!python3 -m spacy download en_core_web_sm
#!python3 -m spacy download sv_core_news_sm
#!python3 -m spacy download xx_ent_wiki_sm

In [2]:
# default
pio.templates.default = 'plotly_white'
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
BASE_DIR = Path.cwd().parent # get base directory

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tatiana.ilyasova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# UDF to create a dataframe
def create_df(file_names, cols, drop_cols, merge_col='id'):
  dfs = [pd.read_csv(BASE_DIR / 'row_data' / file) for file in file_names]
  df = dfs[0].merge(dfs[1][cols], on=merge_col, how='inner').drop(drop_cols, axis=1)
  return df

In [4]:
# UDF to find missing values
def missing_values(df):
    nan = pd.DataFrame({
    'Missing Values': 
        df.isnull().sum().sort_values(ascending=False),
    'Missing Values, %': 
        (df.isnull().sum()
            .sort_values(ascending=False)/len(df) * 100)
            .astype(int)
        })
    return nan[nan['Missing Values'] > 0]

In [6]:
# read a row data and create a dataframe
file_names = ['listings_short.csv', 'listings.csv']
cols = ['id', 'description', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value']
drop_cols = ['license', 'neighbourhood_group', 'minimum_nights', 
             'calculated_host_listings_count', 'availability_365']
df_listing = create_df(file_names, cols, drop_cols)
df_listing.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,...,reviews_per_month,number_of_reviews_ltm,description,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,32984,"City Hall, Stockholm",53396,Chris,Kungsholmens,59.32779,18.04998,Entire home/apt,1350,24,...,0.16,0,<b>The space</b><br />Beautiful very bright 43...,4.23,4.43,3.81,4.29,4.52,4.81,4.2
1,75590,Amazing nature location by a lake,397766,Peter,Skarpnäcks,59.30117,18.12833,Entire home/apt,949,10,...,0.09,0,"Apartment on the top floor, overlooking a lake...",4.8,5.0,4.89,4.89,5.0,4.78,4.78
2,164448,Double room in central Stockholm with Wi-Fi,784312,Li,Södermalms,59.31389,18.06087,Private room,829,358,...,2.52,34,I am renting out a nice double room on the top...,4.85,4.87,4.81,4.96,4.97,4.83,4.77


In [7]:
# rename neighbourhood, remove -s at the end
new_neighbourhood = ['Kungsholmen',
                    'Skarpnäck',
                    'Södermalm',
                    'Norrmalm',
                    'Hägersten-Liljeholmen',
                    'Älvsjö',
                    'Enskede-Årsta-Vantör',
                    'Farsta',
                    'Östermalm',
                    'Bromma',
                    'Skärholmen',
                    'Hässelby-Vällingby',
                    'Rinkeby-Tensta',
                    'Spånga-Tensta']
rename_neighbourhood = dict(zip(df_listing['neighbourhood'].unique().tolist(), new_neighbourhood))
# apply the new values
df_listing['neighbourhood'] = df_listing['neighbourhood'].map(rename_neighbourhood)

Statistics

In [8]:
df_listing.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,2798.0,3500.0,2798.0,2780.0,2780.0,2780.0,2780.0,2780.0,2780.0
mean,2.477073e+17,135140300.0,59.318811,18.031954,1739.584286,26.758857,1.054982,8.879143,4.755425,4.804396,4.736216,4.852939,4.857327,4.800428,4.714601
std,3.442714e+17,148979600.0,0.03,0.063833,11324.378729,57.409084,1.468412,19.706462,0.537539,0.37006,0.424896,0.353636,0.344294,0.321867,0.385781
min,32984.0,4457.0,59.23195,17.77311,0.0,0.0,0.01,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,23033410.0,21072600.0,59.300792,18.004033,714.0,1.0,0.19,0.0,4.72,4.76,4.67,4.83,4.85,4.73,4.62
50%,44029510.0,64917660.0,59.3179,18.04954,1100.0,6.0,0.45,2.0,4.89,4.91,4.86,4.96,4.98,4.895,4.8
75%,6.482125e+17,220860400.0,59.33787,18.076097,1714.0,24.0,1.16,8.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,8.580712e+17,507582600.0,59.417742,18.16607,480500.0,976.0,18.94,571.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


Missing values

In [9]:
missing_values(df_listing)

Unnamed: 0,Missing Values,"Missing Values, %"
review_scores_value,720,20
review_scores_location,720,20
review_scores_communication,720,20
review_scores_checkin,720,20
review_scores_cleanliness,720,20
review_scores_accuracy,720,20
reviews_per_month,702,20
review_scores_rating,702,20
last_review,702,20
description,102,2


In [10]:
# concatenate the name of a listing and description to one column
df_listing[['name', 'description']] = df_listing[['name', 'description']].fillna('')
df_listing = df_listing.fillna(0)
df_listing['host_description'] = df_listing['name'] + ' ' + df_listing['description']
df_listing.drop(['name', 'description'], axis=1, inplace=True)

Variable "price"

In [11]:
fig = px.histogram(df_listing, x='price')
fig.update_layout(title='Distribution of Price is highly skewed to the right')
fig.update_yaxes(title='frequency')
fig

In [12]:
# find outliers in "price"
df_listing.query('price > 25000 or price == 0')

Unnamed: 0,id,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_description
755,19529551,137155339,Hillerodsgrand,Rinkeby-Tensta,59.39556,17.94929,Entire home/apt,480500,8,2019-08-01,0.12,0,5.0,5.0,5.0,4.71,4.71,4.71,4.86,Sunny apartment 27 mins to Centralen The entir...
1694,43012846,310620582,Brommavik Hotel,Bromma,59.35951,17.96011,Hotel room,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Brommavik Hotel
1796,45571018,368998845,Forenom,Bromma,59.33435,17.97866,Hotel room,0,3,2022-08-09,0.25,3,3.67,4.0,4.0,2.67,2.33,4.33,4.0,Forenom Aparthotel Stockholm Alvik
1797,45571095,368999421,Forenom,Bromma,59.3454,17.97001,Hotel room,0,4,2023-02-27,0.44,4,4.75,4.75,4.75,4.0,4.25,4.75,4.5,Forenom Aparthotel Stockholm Bromma
1803,45680148,370101820,Forenom,Norrmalm,59.33799,18.06619,Hotel room,0,8,2023-02-24,0.28,4,4.13,4.25,4.5,4.13,4.25,4.5,3.88,Forenom Serviced Apartments Stockholm Johannes...
1804,45730331,343919205,Forenom APH Stockholm Kista,Rinkeby-Tensta,59.40813,17.95183,Hotel room,0,5,2022-09-24,0.16,2,3.4,4.4,4.8,3.4,3.2,4.2,4.0,Forenom Aparthotel Stockholm Kista
2202,53198829,430664934,Camilla,Hässelby-Vällingby,59.37333,17.83749,Entire home/apt,450590,11,2022-12-11,0.73,5,5.0,5.0,5.0,5.0,5.0,4.64,5.0,2:a i Hässelby Fin liten lägenhet i lugnt områ...
3110,776549730665441181,46194379,Jakob,Kungsholmen,59.3238,18.0045,Entire home/apt,99999,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Helhet Take it easy at this unique and tranqui...
3453,849709777456195565,505907207,FaVillas,Södermalm,59.32347,18.07524,Entire home/apt,25427,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,stockss place with 1 bedroom stock''s place wi...
3454,849709953466213539,505907207,FaVillas,Södermalm,59.322931,18.073034,Entire home/apt,25427,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,stockss place with 1 bedroom stock''s place wi...


In [13]:
# load up the calendar data to replace the outliers
df_calendar = pd.read_csv(BASE_DIR / 'row_data' / 'calendar.csv')
df_calendar['price'] = df_calendar['price'].str.replace('[\$,]', '', regex=True)
df_calendar['price'].agg(['max', 'min'])

max    99999.00
min        0.00
Name: price, dtype: object

In [14]:
# replace the outliers
min_price_calend = df_calendar.groupby('listing_id')['price'].min().astype(float).to_dict()
df_listing['price'] = df_listing['id'].map(min_price_calend)

In [15]:
# get index of rows with values higher than 90000
df_listing.query('price > 90000')

Unnamed: 0,id,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_description
3110,776549730665441181,46194379,Jakob,Kungsholmen,59.3238,18.0045,Entire home/apt,99999.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Helhet Take it easy at this unique and tranqui...


In [16]:
# drop this row
df_listing.drop(3110, axis=0, inplace=True)
df_listing.fillna(0, inplace=True)

In [17]:
fig = px.histogram(df_listing, x='price')
fig.update_layout(title='Distribution of Price remains right-skewed but improved after updating values')
fig

In [19]:
# save a cleaned lising dataframe to staging_data
df_listing.to_csv((BASE_DIR / 'staging_data' / 'listing_cleaned.csv'), index=False)

In [10]:
# load up row reviews to clean up and merge with listings
df_reviews = pd.read_csv(Path(BASE_DIR / 'row_data' / 'reviews_2023_03.csv'), parse_dates=['date'])
df_host_reviews = (df_reviews.merge(df_listing, left_on='listing_id', right_on='id', how='inner')
                      .drop(['id_x', 'id_y'], axis=1))

In [21]:
# encode "listing_id", it can be useful while visualizing
le = LabelEncoder()
df_host_reviews['listing_id_encod'] = le.fit_transform(df_host_reviews['listing_id'])

In [22]:
# change the order of columns
df_host_reviews = df_host_reviews.iloc[:, np.r_[1, 24, 0, 2:7, 23, 7:23]]
df_host_reviews.head(1)

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2010-09-06,0,32984,200247,Keenan,Great little apartment in the perfect spot in ...,53396,Chris,"City Hall, Stockholm <b>The space</b><br />Bea...",Kungsholmen,...,2016-06-28,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.2


Statistics

In [23]:
df_host_reviews.describe()

Unnamed: 0,listing_id_encod,listing_id,reviewer_id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0
mean,970.009877,4.739066e+16,133369500.0,95301740.0,59.319395,18.04902,1261.341409,149.890472,2.797927,40.848947,4.75469,4.8109,4.750516,4.861461,4.838456,4.826085,4.694639
std,665.72111,1.710297e+17,134726200.0,116442900.0,0.022302,0.04854,1088.596961,158.725379,2.711734,64.173547,0.226551,0.19699,0.25334,0.165376,0.201699,0.188714,0.208919
min,0.0,32984.0,81.0,4457.0,59.23195,17.79731,0.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,394.0,10222510.0,26234100.0,8547277.0,59.31176,18.03108,730.0,41.0,0.83,7.0,4.64,4.74,4.66,4.81,4.78,4.75,4.59
50%,862.0,24074860.0,81293980.0,46720320.0,59.32294,18.06604,1035.0,103.0,2.29,29.0,4.81,4.86,4.81,4.9,4.9,4.88,4.72
75%,1442.0,40344030.0,203061500.0,138864700.0,59.33407,18.07506,1450.0,200.0,3.97,56.0,4.9,4.93,4.91,4.95,4.96,4.95,4.83
max,2797.0,8.478345e+17,507030200.0,505546100.0,59.41744,18.16607,22995.0,976.0,18.94,571.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [24]:
df_host_reviews.describe(include='object')

Unnamed: 0,reviewer_name,comments,host_name,host_description,neighbourhood,room_type,last_review
count,93656,93654,93656,93656,93656,93656,93656
unique,19810,91384,1067,2784,14,4,752
top,Anna,.,Hedvig,Studio Apartment Double bed These studio apart...,Södermalm,Entire home/apt,2023-03-26
freq,576,151,4189,976,42496,70334,8027


Missing values

In [25]:
missing_values(df_host_reviews)

Unnamed: 0,Missing Values,"Missing Values, %"
comments,2,0


In [26]:
# find observations containing missing values
df_host_reviews[df_host_reviews['comments'].isna()]

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
47701,2019-04-23,902,25051575,244070000,David,,26990407,Moa,Stor villa med sol hela dagen Stor villa på 17...,Skarpnäck,...,2023-01-03,0.42,7,5.0,4.88,4.96,4.96,4.92,4.83,4.79
62929,2019-05-30,1090,31301631,263373046,Deepak,,362541894,ApartDirect,Superior Studio Apartment with Sofa Bed Locate...,Älvsjö,...,2023-03-13,1.63,24,4.38,4.52,4.34,4.85,4.46,4.41,4.41


In [27]:
# drop NaN in "comments"
df_host_reviews = df_host_reviews.dropna(subset=['comments'])

In [28]:
missing_values(df_host_reviews)

Unnamed: 0,Missing Values,"Missing Values, %"


In [29]:
# Identify reviewers who have left more than one review
df_host_reviews['reviewer_id'].value_counts()

348992094    31
24622700     25
110641325    16
10805888     11
280949885    11
             ..
32023642      1
59702442      1
38577770      1
42229921      1
77261435      1
Name: reviewer_id, Length: 88204, dtype: int64

In [30]:
df_host_reviews.query('reviewer_id == 348992094').head()

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
53092,2020-11-30,1099,31347264,348992094,Roger,Bra läge och mycket smidig in och utcheckning,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmen,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53097,2021-02-12,1099,31347264,348992094,Roger,Bra läge och alltid bra kommunikation. Allt fu...,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmen,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53099,2021-03-20,1099,31347264,348992094,Roger,Bra läge och mycket bra boende,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmen,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53132,2021-12-01,1099,31347264,348992094,Roger,Rent och fint. Allt man behöver finns. Lätt at...,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmen,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
54397,2021-04-28,1062,30553531,348992094,Roger,Had yet another good stayover. Easy to find an...,8098890,Robin,# 3 Studiolägenhet (3 pers) Niro Hotel Apartme...,Kungsholmen,...,2023-03-19,4.26,72,4.78,4.85,4.86,4.91,4.87,4.8,4.78


We cannot be certain whether a reviewer actually stayed at a property or simply wrote a fake review, which is why we keep all the reviews

In [31]:
fig = px.histogram(df_host_reviews, x='listing_id_encod')
fig.update_layout(title='Distribution of Listings')
fig

In [32]:
# find the comments containng cancelled visits and save them to staging_data
df_canceled_visits = (df_host_reviews[df_host_reviews['comments']
                                      .str.contains('This is an automated posting')])
df_canceled_visits.to_csv((BASE_DIR / 'staging_data' / 'canceled_visits.csv'), index=False)

In [33]:
# remove the omments containng cancelled visits
df_host_reviews = (df_host_reviews[~df_host_reviews
                            .isin(df_canceled_visits)]
                            .dropna()
                            .reset_index(drop=True))

In [34]:
# check "comments"
df_host_reviews['comments']

0        Great little apartment in the perfect spot in ...
1        Nice flat in a great area. Chris sorted things...
2        Great location, Chris was responsive by email ...
3        Chris' place is super great and close to every...
4        This was a \very nice apartment in a GREAT loc...
                               ...                        
92993    Nous avons passé un super séjour chez Göran et...
92994    Beautiful apartment. Central. Everything funct...
92995    Bonjour<br/>Appartement conforme au descriptif...
92996    This apartment was absolutely beautiful! It wa...
92997    Me and my husband recently booked and stayed a...
Name: comments, Length: 92998, dtype: object

In [35]:
# replace host name with "host"
names = pd.melt((df_listing['host_name'].str.replace(rf'[&+()/程]', ' ', regex=True)
                        .str.split(' ', expand=True)))['value']
names_list = names.drop_duplicates().tolist()
not_names = ['And', 'Och', 'Apartment', 'Co-Host', 'Bromma', 'A', 'A.', 'AB', 'Ab', 
             'APH', 'ApartDirect', 'Aparthotel', 'B', '', 'Gamla', 'H', 'I', 'II', 'J', 'K', 
             'Man', 'My', 'O', 'P.', 'Ab', 'Sweden_92', 'Appartement', 'Hillerodsgrand', None, 'Ea', 
             'Your.Rentals', 'Brommavik', 'Old', 'Town', 'Hk9919', 'Ch', 'Co', 'L', 'V', 'Odd', 
             'Familjen', 'Mornington', 'Kvarteret', 'Generator', 'CC', 'Studio', 'E', 'Forenom', 
             'HOMEstate', 'Brf', 'M', 'UNITY', 'Lodge32', 'D', 'Residence', 'Марина', 'Love', 
             'City', "Yo'Av", 'Stockholm', 'Söder', 'Stan', 'TravelNest', 'Stay', 'Longstay', 
             'Lindberg', 'Hammarby', 'Fastighets'
            ]
names_to_find = {x for x in names_list if x not in not_names}
names_dict = dict(zip(names_to_find, ['host'] * len(names_to_find)))
repl = names_dict
pattern = '|'.join(rf'\b{k}\b' for k in repl.keys())

# Use the pattern and replacement dictionary to replace the matched strings
df_host_reviews['comments'] = df_host_reviews['comments'].str.replace(pattern, lambda m: repl[m.group()], regex=True).tolist()

In [36]:
# check host_description
df_host_reviews['host_description']

0        City Hall, Stockholm <b>The space</b><br />Bea...
1        City Hall, Stockholm <b>The space</b><br />Bea...
2        City Hall, Stockholm <b>The space</b><br />Bea...
3        City Hall, Stockholm <b>The space</b><br />Bea...
4        City Hall, Stockholm <b>The space</b><br />Bea...
                               ...                        
92993    Room2stay Lev det enkla livet i detta fridfull...
92994    Spacious & luxurious 2br Apt on Östermalms-tor...
92995    Spacious & luxurious 2br Apt on Östermalms-tor...
92996    Spacious & luxurious 2br Apt on Östermalms-tor...
92997    Spacious & luxurious 2br Apt on Östermalms-tor...
Name: host_description, Length: 92998, dtype: object

In [37]:
def text_preproc(df, subset):
# remove unnecessary symbols and emojis
    df[subset] = (df[subset].str.lower()
                            .str.replace('<b>the space</b><br />', ' ', regex=True)
                            .str.replace('\r<br/>', ' ', regex=True)
                            .str.replace('<br/>', ' ', regex=True)
                            .str.replace(r'[^\w\s]|[\U0001F600-\U0001F6FF]', '', regex=True)
                            )

In [38]:
text_preproc(df_host_reviews, 'comments')
text_preproc(df_host_reviews, 'host_description')

In [39]:
# check both text columns
df_host_reviews[['comments', 'host_description']]

Unnamed: 0,comments,host_description
0,great little apartment in the perfect spot in ...,city hall stockholm beautiful very bright 43 ...
1,nice flat in a great area host sorted things o...,city hall stockholm beautiful very bright 43 ...
2,great location host was responsive by email he...,city hall stockholm beautiful very bright 43 ...
3,host place is super great and close to everyth...,city hall stockholm beautiful very bright 43 ...
4,this was a very nice apartment in a great loca...,city hall stockholm beautiful very bright 43 ...
...,...,...
92993,nous avons passé un super séjour chez host et ...,room2stay lev det enkla livet i detta fridfull...
92994,beautiful apartment central everything functio...,spacious luxurious 2br apt on östermalmstorg ...
92995,bonjour appartement conforme au descriptif de ...,spacious luxurious 2br apt on östermalmstorg ...
92996,this apartment was absolutely beautiful it was...,spacious luxurious 2br apt on östermalmstorg ...


In [40]:
# save the cleaned reviews data set to staging_data
df_host_reviews.to_parquet(BASE_DIR / 'staging_data' / 'hosts_reviews_all_cleaned.parquet')

In [41]:
# Identify comments that are too short, as language identification cannot be reliably performed on such comments
def short_texts(df, subset):
    short_text_indexes = (df[df[subset]
            .apply(lambda x: len(x.split())) <= 1][subset]
            .index.tolist())
    print(f'Number of too short texts to define a language: {len(short_text_indexes)}')
    return df.drop(short_text_indexes, axis=0, inplace=True)

In [42]:
short_texts(df_host_reviews, 'comments')

Number of too short texts to define a language: 1653


In [43]:
short_texts(df_host_reviews, 'host_description')

Number of too short texts to define a language: 4


In [44]:
# language identification
def define_language(df, col, subset):
    df[col] = [detect(text) for text in df[subset]]

In [45]:
define_language(df_host_reviews, 'comments_language', 'comments')
define_language(df_host_reviews, 'description_language', 'host_description')

In [46]:
# check number of different languages
df_host_reviews['comments_language'].value_counts()

en       69631
fr        5789
sv        4719
de        3969
es        1505
it         958
no         862
da         740
nl         701
ru         441
ro         267
ko         259
fi         184
cs         174
ca         158
pl         156
pt         152
af         145
zh-cn       70
ja          55
tr          49
hr          45
el          43
so          40
cy          37
hu          30
uk          29
tl          23
et          22
sk          16
sl          13
id          12
lv           9
bg           7
sw           5
he           5
lt           5
ar           5
sq           4
zh-tw        3
vi           2
mk           1
th           1
Name: comments_language, dtype: int64

In [47]:
df_host_reviews = df_host_reviews.query('comments_language == "en"').reset_index()

In [48]:
# remove stop words
def remove_stop_words(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if not word in stop_words]
    return ' '.join(filtered_tokens)

# apply function to dataframe column
df_host_reviews['comments'] = df_host_reviews['comments'].apply(lambda x: remove_stop_words(x))
df_host_reviews['host_description'] = df_host_reviews['host_description'].apply(lambda x: remove_stop_words(x))

# Show updated dataframe
print(df_host_reviews[['comments', 'host_description']])

                                                comments  \
0      great little apartment perfect spot town nice ...   
1      nice flat great area host sorted things make s...   
2      great location host responsive email country t...   
3      host place super great close everything quite ...   
4      nice apartment great location great price host...   
...                                                  ...   
69626  host really great host place clean area stay g...   
69627  lucky meet host always said us make home reall...   
69628  beautiful apartment central everything functio...   
69629  apartment absolutely beautiful clean nice host...   
69630  husband recently booked stayed apartment child...   

                                        host_description  
0      city hall stockholm beautiful bright 43 sqm 1b...  
1      city hall stockholm beautiful bright 43 sqm 1b...  
2      city hall stockholm beautiful bright 43 sqm 1b...  
3      city hall stockholm beautiful bright

In [49]:
# find how many comments and host descriptions have the same language
df_host_reviews[['comments_language', 'description_language']].value_counts()

comments_language  description_language
en                 en                      60760
                   sv                       8624
                   es                         99
                   da                         58
                   vi                         53
                   nl                         28
                   af                          4
                   tr                          3
                   ca                          1
                   fr                          1
dtype: int64

In [50]:
# save the cleaned dataset containig only English comments
df_host_reviews.to_parquet(BASE_DIR / 'staging_data' / 'hosts_reviews_en_cleaned.parquet')