# Data Gathering

### 1. Extracting Hotel Names

In [1]:
import json

def filter_businesses_with_hotels(input_file, output_file):

    try:
        # Open the input file and output file with UTF-8 encoding
        with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            # Iterate through each line (assuming each business is a JSON object in a new line)
            for line in infile:
                business = json.loads(line)  # Parse JSON string
                categories = business.get("categories", "")

                # Check if 'Hotels' is in categories
                if categories and "Hotels" in categories:
                    outfile.write(json.dumps(business) + '\n')  # Write JSON object as a single line

            print(f"Filtered businesses have been written to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Input and output file paths
input_file_path = "datasets/yelp_academic_dataset_business.json"  # Replace with the correct input file path
output_file_path = "datasets/filtered_hotel_names.json"  # Replace with the desired output file path

filter_businesses_with_hotels(input_file_path, output_file_path)


Filtered businesses have been written to datasets/filtered_hotel_names.json


### 2. Filtering Hotel reviews 

In [2]:
import json

def filter_reviews_by_business_ids(filtered_business_file, reviews_file, output_file):
 
    try:
        # Step 1: Load business_ids from filtered hotels
        with open(filtered_business_file, 'r', encoding='utf-8') as f:
            business_ids = {json.loads(line)['business_id'] for line in f}

        # Step 2: Filter reviews by matching business_ids
        with open(reviews_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                review = json.loads(line)  # Parse review JSON
                if review['business_id'] in business_ids:  # Check if business_id matches
                    outfile.write(json.dumps(review) + '\n')  # Write review to output file

        print(f"Filtered reviews have been written to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

# File paths
filtered_business_file = "datasets/filtered_hotel_names.json"  # File with filtered hotels
reviews_file = "datasets/yelp_academic_dataset_review.json"  # File containing reviews
output_file = "datasets/filtered_hotel_reviews.json"  # Output file for filtered reviews

filter_reviews_by_business_ids(filtered_business_file, reviews_file, output_file)

Filtered reviews have been written to datasets/filtered_hotel_reviews.json


# Data Cleaning

### 1. Removing Non-English Reviews

In [3]:
import json
from langdetect import detect, DetectorFactory

# Ensure consistent results from the langdetect library
DetectorFactory.seed = 0

def remove_non_english_reviews(input_file, output_file):

    try:
        with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            count = 0
            filtered_count = 0

            for line in infile:
                count += 1
                review = json.loads(line)
                text = review.get('text', '')

                try:
                    # Detect language and keep only English reviews
                    if detect(text) == 'en':
                        json.dump(review, outfile)
                        outfile.write('\n')
                        filtered_count += 1
                except Exception as e:
                    # Skip reviews where language detection fails
                    print(f"Skipping review {count}: {e}")

                if count % 10000 == 0:  # Progress update every 10k reviews
                    print(f"Processed {count} reviews so far. Filtered {filtered_count} English reviews.")

        print(f"Completed. Filtered {filtered_count} English reviews out of {count} total reviews.")

    except Exception as e:
        print(f"An error occurred: {e}")

# File paths
input_file = "datasets/filtered_hotel_reviews.json"  # Input JSON file with reviews
output_file = "datasets/filtered_hotel_reviews_cleaned.json"  # Output JSON file with only English reviews

remove_non_english_reviews(input_file, output_file)

Processed 10000 reviews so far. Filtered 9984 English reviews.
Processed 20000 reviews so far. Filtered 19966 English reviews.
Processed 30000 reviews so far. Filtered 29957 English reviews.
Processed 40000 reviews so far. Filtered 39947 English reviews.
Processed 50000 reviews so far. Filtered 49930 English reviews.
Processed 60000 reviews so far. Filtered 59921 English reviews.
Processed 70000 reviews so far. Filtered 69914 English reviews.
Processed 80000 reviews so far. Filtered 79900 English reviews.
Processed 90000 reviews so far. Filtered 89885 English reviews.
Processed 100000 reviews so far. Filtered 99877 English reviews.
Processed 110000 reviews so far. Filtered 109853 English reviews.
Processed 120000 reviews so far. Filtered 119837 English reviews.
Processed 130000 reviews so far. Filtered 129824 English reviews.
Processed 140000 reviews so far. Filtered 139807 English reviews.
Skipping review 145345: No features in text.
Processed 150000 reviews so far. Filtered 149791 En

### 2. Cleaning Further

In [4]:
import json
import re
from nltk.corpus import stopwords
import nltk

# Download NLTK data (run once)
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

def clean_reviews(input_file, output_file):
    try:
        stop_words = set(stopwords.words('english'))
        
        # Read and clean reviews, then write them back
        with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                review_obj = json.loads(line)
                # Clean the review text
                review_obj['text'] = clean_text(review_obj['text'], stop_words)
                # Write the modified review object to output file
                outfile.write(json.dumps(review_obj) + '\n')
        
        print(f"Cleaned reviews saved to {output_file}")
        
    except Exception as e:
        print(f"An error occurred: {e}")

def clean_text(text, stop_words):
    """Clean text by removing URLs, HTML, special characters, and stop words."""
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove stop words
    words = text.split()
    text = ' '.join([word for word in words if word not in stop_words])
    
    return text

input_file = "datasets/filtered_hotel_reviews_cleaned.json"
output_file = "datasets/filtered_hotel_reviews_cleaned_2.json"
    
clean_reviews(input_file, output_file)

Cleaned reviews saved to datasets/filtered_hotel_reviews_cleaned_2.json
