# Imports 

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

# Download NLTK resources (stop words, tokenizer)

In [2]:
# Stop-word lists read later through nltk.corpus.stopwords.
nltk.download('stopwords')
# Pickled Punkt models, loaded by word_tokenize on older NLTK releases.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead;
# without it the tokenizer raises LookupError on a fresh environment.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/svengerloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/svengerloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load data 
Data source: https://www.kaggle.com/datasets/jiashenliu/515k-hotel-reviews-data-in-europe [05.06.2024]
- Delete rows with missing lat and lng
- Select hotels from Vienna only

In [3]:
# Read the raw reviews and discard rows lacking either coordinate.
df = pd.read_csv('./data/hotels.csv').dropna(subset=['lat', 'lng'])
# Keep only rows whose address mentions "Vienna" (case-insensitive);
# rows with a missing address count as non-matching.
in_vienna = df['Hotel_Address'].str.contains('Vienna', case=False, na=False)
df = df[in_vienna]

# Define functions for data preparation

In [4]:
# Extra words filtered out in addition to NLTK's English stop-word list.
custom_stopwords = {'negative', 'positive', 'good', 'everything', 'hotel'}
stop_words_english = set(stopwords.words('english')) | custom_stopwords
# NLTK's German stop-word list, used unchanged.
stop_words_german = set(stopwords.words('german'))

In [5]:
def extract_postal_code(address):
    """Return the first standalone four-digit number found in ``address``.

    Parameters
    ----------
    address : str
        Free-text address to search.

    Returns
    -------
    str or None
        The four-digit match as a string, or ``None`` when ``address`` is not
        a string (e.g. ``None``/``NaN`` from a missing cell) or contains no
        four-digit number delimited by word boundaries.
    """
    # re.search raises TypeError on non-string input, so treat it as "no code".
    if not isinstance(address, str):
        return None
    match = re.search(r'\b\d{4}\b', address)
    return match.group(0) if match else None

In [6]:
def postal_code_to_district(postal_code):
    """Convert a postal code string to a district number.

    The district is read from the second and third characters of the code,
    e.g. ``'1030'`` -> ``3`` and ``'1220'`` -> ``22``.

    Parameters
    ----------
    postal_code : str or None
        Postal code as produced by ``extract_postal_code``; ``None`` or
        ``NaN`` marks a missing value.

    Returns
    -------
    int or None
        The district number, or ``None`` when ``postal_code`` is missing.

    Raises
    ------
    ValueError
        If the second and third characters are not digits.
    """
    # NaN is the only value unequal to itself; pandas uses it for missing cells,
    # and slicing it would raise TypeError.
    if postal_code is None or postal_code != postal_code:
        return None
    return int(postal_code[1:3])

In [7]:
def remove_stopwords(text):
    """Tokenize ``text``, lowercase every token, and drop stop words.

    A token is dropped when its lowercase form appears in either
    ``stop_words_english`` or ``stop_words_german``. The surviving lowercase
    tokens are returned as one space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words_english or lowered in stop_words_german:
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Apply methods of data preparation 

In [8]:
# Postal code per review row; kept in 'Bezirk', then mapped to a district number.
postal_codes = df['Hotel_Address'].apply(extract_postal_code)
df['Bezirk'] = postal_codes
df['District'] = postal_codes.apply(postal_code_to_district)

In [9]:
# Collapse the review rows into one row per hotel name: first value for the
# coordinates and district, mean score, largest review count, and all review
# texts concatenated with single spaces.
grouped_df = (
    df.groupby('Hotel_Name')
    .agg(
        lat=('lat', 'first'),
        lng=('lng', 'first'),
        Average_Score=('Average_Score', 'mean'),
        Total_Number_of_Reviews=('Total_Number_of_Reviews', 'max'),
        Positive_Review=('Positive_Review', ' '.join),
        Negative_Review=('Negative_Review', ' '.join),
        District=('District', 'first'),
    )
    .reset_index()
)

In [10]:
# Clean both concatenated review texts, then store them as new columns.
positive_prepared = grouped_df['Positive_Review'].apply(remove_stopwords)
negative_prepared = grouped_df['Negative_Review'].apply(remove_stopwords)
grouped_df['Positive_Review_prepared'] = positive_prepared
grouped_df['Negative_Review_prepared'] = negative_prepared

In [11]:
# Replace the raw review columns with their cleaned counterparts.
# The guard makes re-running this cell a no-op: without it, a second run would
# drop the already-renamed cleaned columns, and rename() would silently ignore
# the missing '*_prepared' labels, leaving no review text at all.
if {'Positive_Review_prepared', 'Negative_Review_prepared'} <= set(grouped_df.columns):
    grouped_df = (
        grouped_df
        .drop(columns=['Positive_Review', 'Negative_Review'])
        .rename(columns={'Positive_Review_prepared': 'Positive_Review', 'Negative_Review_prepared': 'Negative_Review'})
    )

In [12]:
# Preview the first five rows of the per-hotel table (head() defaults to 5).
grouped_df.head()

Unnamed: 0,Hotel_Name,lat,lng,Average_Score,Total_Number_of_Reviews,District,Positive_Review,Negative_Review
0,25hours Hotel beim MuseumsQuartier,48.206474,16.35463,8.8,4324,7,cool vintage style middle museum quarter metro...,breakfast included buffet really expensive bre...
1,ARCOTEL Kaiserwasser Superior,48.231915,16.417026,8.6,1257,22,great location vic meetings amazing value mone...,booked suites room double room asked connected...
2,ARCOTEL Wimberger,48.2006,16.338633,8.2,1886,7,close public transportation gentleman front de...,stuff canteen prepared tourist crowd clean fre...
3,AZIMUT Hotel Vienna,48.183479,16.376276,8.2,1060,10,comfortable room clean quiet friendly polite s...,excellent definitely security stayed one night...
4,Alma Boutique Hotel,48.211249,16.377652,8.7,730,1,could check couple hours earlier exhausting fl...,bread croissants old coffee mediocre breakfast...


# Save data

In [13]:
# Write the per-hotel table to disk without the row index.
grouped_df.to_csv(path_or_buf='./data/data_prepared.csv', index=False)