In [24]:
import requests
import pandas as pd
from urllib.parse import urlencode
import time

def get_data(endpoint, params=None, timeout=10):
    """
    Fetch one Tour-Pedia API endpoint and return its decoded JSON payload.

    Args:
        endpoint (str): Endpoint name appended to the API base URL.
        params (dict, optional): Query parameters, URL-encoded into the request.
        timeout (float, optional): Seconds to wait for the server before giving
            up, so a stalled connection cannot hang the cell indefinitely.

    Returns:
        The parsed JSON body on HTTP 200; None on a non-200 status, on a
        network/timeout failure, or when the body is not valid JSON.
    """
    base_url = "http://tour-pedia.org/api"
    url = f"{base_url}/{endpoint}"
    if params:
        url += "?" + urlencode(params)

    print(f"Requesting: {url}")
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # errors: log and return None so callers' `if data:` checks still work.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A body that is not JSON surfaces as a ValueError (or a subclass).
        print(f"Invalid JSON from {url}: {exc}")
        return None

def fetch_all_data(locations=None, categories=None):
    """
    Collect places and their reviews for every location/category pair.

    Args:
        locations (iterable of str, optional): Cities to query. Defaults to the
            eight cities that were originally hard-coded here.
        categories (iterable of str, optional): Place categories to query.
            Defaults to accommodation, attraction, poi and restaurant.

    Returns:
        tuple: (places_df, reviews_df) as pandas DataFrames. Every review row
        carries a 'place_id' column holding the 'id' of the place it was
        requested for.
    """
    if locations is None:
        locations = ["Amsterdam", "Barcelona", "Berlin", "Dubai", "London", "Paris", "Rome", "Tuscany"]
    if categories is None:
        categories = ["accommodation", "attraction", "poi", "restaurant"]

    all_places = []
    all_reviews = []

    for location in locations:
        for category in categories:
            print(f"Fetching {category} in {location}")

            # Fetch places
            places = get_data("getPlaces", {"location": location, "category": category})
            if not places:
                continue
            all_places.extend(places)

            # Fetch reviews for each place through the per-place endpoint.
            # NOTE(review): the previous call used "getReviews" with
            # location/category/placeId; that endpoint presumably does not
            # filter by placeId — confirm against the Tour-Pedia API docs.
            for place in places:
                reviews = get_data("getReviewsByPlaceId", {"placeId": place['id']})
                if not reviews:
                    continue
                for review in reviews:
                    review['place_id'] = place['id']  # Add place_id to the review
                all_reviews.extend(reviews)

    # Create DataFrames
    places_df = pd.DataFrame(all_places)
    reviews_df = pd.DataFrame(all_reviews)

    return places_df, reviews_df

# Fetch the data
places_df, reviews_df = fetch_all_data()

# Display info about the DataFrames.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nPlaces DataFrame:")
places_df.info()
print("\nReviews DataFrame:")
reviews_df.info()

# Save to CSV (optional)
places_df.to_csv('places_data.csv', index=False)
reviews_df.to_csv('reviews_data.csv', index=False)

print("\nData fetching complete. DataFrames created and saved to CSV files.")

Fetching accommodation in Amsterdam
Requesting: http://tour-pedia.org/api/getPlaces?location=Amsterdam&category=accommodation


KeyboardInterrupt: 

In [10]:
import requests
import pandas as pd
from urllib.parse import urlencode
import time

def get_data(endpoint, params=None, timeout=10):
    """
    Request a Tour-Pedia API endpoint and decode the JSON reply.

    Args:
        endpoint (str): Endpoint name appended to the API base URL.
        params (dict, optional): Query parameters, URL-encoded into the request.
        timeout (float, optional): Seconds to wait on the server; bounds each
            request so one stalled call cannot block the whole crawl.

    Returns:
        The parsed JSON body on HTTP 200; None when the request fails, the
        status is not 200, or the body cannot be decoded as JSON.
    """
    base_url = "http://tour-pedia.org/api"
    url = f"{base_url}/{endpoint}"
    if params:
        url += "?" + urlencode(params)

    print(f"Requesting: {url}")
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) become a
        # None result, matching how HTTP errors are already reported.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 reply whose body is not JSON surfaces as a ValueError.
        print(f"Invalid JSON from {url}: {exc}")
        return None

def fetch_amsterdam_accommodations_with_reviews():
    """
    Download Amsterdam accommodations and attach review texts and details.

    Each place dict returned by "getPlaces" is updated in place with two keys:
    'reviews_text' (list of review 'text' values, '' where a review has none,
    [] when no reviews came back) and 'details_text' (the "getPlaceDetails"
    payload, or {} when none came back).

    Returns:
        pd.DataFrame or None: One row per accommodation, or None when the
        place listing could not be retrieved.
    """
    listing = get_data("getPlaces", {"location": "Amsterdam", "category": "accommodation"})
    if not listing:
        print("Failed to retrieve accommodation data")
        return None

    print(f"Retrieved {len(listing)} accommodations in Amsterdam")

    for entry in listing:
        place_id = entry['id']

        # Reviews first, then details: two sequential requests per place.
        review_payload = get_data("getReviewsByPlaceId", {"placeId": place_id})
        entry['reviews_text'] = (
            [item.get('text', '') for item in review_payload] if review_payload else []
        )

        detail_payload = get_data("getPlaceDetails", {"id": place_id})
        entry['details_text'] = detail_payload if detail_payload else {}

    return pd.DataFrame(listing)

# Fetch the data
accommodations_df = fetch_amsterdam_accommodations_with_reviews()

if accommodations_df is not None:
    # Display info about the DataFrame.
    # DataFrame.info() prints its own report and returns None, so it is not
    # wrapped in print(); doing so would emit a stray "None" line.
    print("\nAccommodations DataFrame:")
    accommodations_df.info()

    # Display the first few rows
    print("\nFirst few rows of the DataFrame:")
    print(accommodations_df.head())

    # Save to CSV (optional)
    accommodations_df.to_csv('amsterdam_accommodations_with_reviews.csv', index=False)
    print("\nData saved to 'amsterdam_accommodations_with_reviews.csv'")

    # Basic statistics
    print("\nBasic statistics of numerical columns:")
    print(accommodations_df.describe())

    # Check for missing values
    print("\nMissing values in each column:")
    print(accommodations_df.isnull().sum())

    # Sample of reviews
    print("\nSample of reviews for the first accommodation:")
    if len(accommodations_df) > 0 and len(accommodations_df.iloc[0]['reviews_text']) > 0:
        print(accommodations_df.iloc[0]['reviews_text'][0])
    else:
        print("No reviews available for the first accommodation.")

    # Sample of details
    print("\nSample of details for the first accommodation:")
    if len(accommodations_df) > 0:
        print(accommodations_df.iloc[0]['details_text'])
    else:
        print("No details available for the first accommodation.")
else:
    print("No data to analyze")

Requesting: http://tour-pedia.org/api/getPlaces?location=Amsterdam&category=accommodation
Retrieved 1393 accommodations in Amsterdam
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223771
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223771
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223776
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223776
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223778
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223778
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223779
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223779
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223793
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223793
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223818
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223818
Requesting: http://tour-pedia.org/api/getRevi

In [29]:
import requests
import pandas as pd
from urllib.parse import urlencode
import time
from concurrent.futures import ThreadPoolExecutor, as_completed  # Changed import
import sys

def get_data(endpoint, params=None):
    """
    Call a Tour-Pedia API endpoint and hand back the decoded JSON.

    Args:
        endpoint (str): Name of the endpoint, appended to the API base URL.
        params (dict, optional): Query parameters to URL-encode onto the call.

    Returns:
        dict or list: The parsed JSON reply, or None when the request raised a
        requests exception (connection problem, timeout, HTTP error status).
    """
    url_parts = [f"http://tour-pedia.org/api/{endpoint}"]
    if params:
        url_parts.append(urlencode(params))
    url = "?".join(url_parts)

    print(f"Requesting: {url}")
    try:
        reply = requests.get(url, timeout=10)  # bounded wait per request
        reply.raise_for_status()  # turn 4xx/5xx statuses into HTTPError
        payload = reply.json()
    except requests.exceptions.RequestException as err:
        print(f"Request failed: {err}")
        return None
    return payload

def process_accommodation(accommodation):
    """
    Enrich one accommodation record with its review texts and detail payload.

    The dict passed in is modified in place and also returned.

    Args:
        accommodation (dict): Accommodation data; its 'id' value (None when the
            key is absent) is sent to both API calls.

    Returns:
        dict: The same dict, now holding 'reviews_text' (list of review texts,
        [] when no reviews came back) and 'details_text' (the details payload,
        {} when none came back).
    """
    place_id = accommodation.get('id')

    # Reviews: keep only the text of each review, '' where a review has none.
    fetched_reviews = get_data("getReviewsByPlaceId", {"placeId": place_id})
    accommodation['reviews_text'] = (
        [entry.get('text', '') for entry in fetched_reviews] if fetched_reviews else []
    )

    # Details: stored as returned by the API.
    fetched_details = get_data("getPlaceDetails", {"id": place_id})
    accommodation['details_text'] = fetched_details or {}

    time.sleep(1)  # Be nice to the API
    return accommodation

def fetch_amsterdam_accommodations_with_reviews():
    """
    Retrieve Amsterdam accommodations and enrich them concurrently.

    The place listing is fetched once; each place is then handed to
    process_accommodation on a pool of 8 worker threads. Rows are collected in
    the order the workers finish, so row order can differ between runs.

    Returns:
        pd.DataFrame or None: The enriched accommodations, or None when the
        listing request produced nothing.
    """
    listing = get_data("getPlaces", {"location": "Amsterdam", "category": "accommodation"})
    if not listing:
        print("Failed to retrieve accommodation data")
        return None

    print(f"Retrieved {len(listing)} accommodations in Amsterdam")

    enriched = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        # Remember which place each future belongs to, for error reporting.
        pending = {}
        for item in listing:
            pending[pool.submit(process_accommodation, item)] = item

        for finished in as_completed(pending):
            try:
                outcome = finished.result()
                if outcome:
                    enriched.append(outcome)
            except Exception as err:
                source = pending[finished]
                print(f"Error processing accommodation ID {source.get('id')}: {err}", file=sys.stderr)

    return pd.DataFrame(enriched)

def main():
    """
    Fetch the Amsterdam accommodations, print a summary and save them to CSV.

    Prints "No data to analyze" and returns when the fetch yields None or an
    empty DataFrame. Otherwise prints the frame's info, head, describe() and
    null counts, writes 'amsterdam_accommodations_with_reviews.csv', and shows
    the first row's first review and its details.
    """
    accommodations_df = fetch_amsterdam_accommodations_with_reviews()

    if accommodations_df is None or accommodations_df.empty:
        print("No data to analyze")
        return

    # Display info about the DataFrame.
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; print(df.info()) would add a stray "None" line.
    print("\nAccommodations DataFrame:")
    accommodations_df.info()

    # Display the first few rows
    print("\nFirst few rows of the DataFrame:")
    print(accommodations_df.head())

    # Save to CSV
    accommodations_df.to_csv('amsterdam_accommodations_with_reviews.csv', index=False)
    print("\nData saved to 'amsterdam_accommodations_with_reviews.csv'")

    # Basic statistics
    print("\nBasic statistics of numerical columns:")
    print(accommodations_df.describe())

    # Check for missing values
    print("\nMissing values in each column:")
    print(accommodations_df.isnull().sum())

    # The frame is known to be non-empty here, so row 0 always exists.
    first_row = accommodations_df.iloc[0]

    # Sample of reviews
    print("\nSample of reviews for the first accommodation:")
    first_reviews = first_row.get('reviews_text')
    if first_reviews:
        print(first_reviews[0])
    else:
        print("No reviews available for the first accommodation.")

    # Sample of details
    print("\nSample of details for the first accommodation:")
    first_details = first_row.get('details_text')
    if first_details:
        print(first_details)
    else:
        print("No details available for the first accommodation.")

# Entry point: main() runs when this code executes as the top-level module
# (as it does when the cell is run) and is skipped when the code is imported.
if __name__ == '__main__':
    main()


Requesting: http://tour-pedia.org/api/getPlaces?location=Amsterdam&category=accommodation
Retrieved 1393 accommodations in Amsterdam
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223771
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223776
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223778
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223779
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223793
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223818
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223829
Requesting: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223835
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223835
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223818
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223829
Requesting: http://tour-pedia.org/api/getPlaceDetails?id=223771
Requesting: http://tour-ped

In [2]:
# script1.py

import requests
import pandas as pd
from urllib.parse import urlencode
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

# Define the list of locations and categories
LOCATIONS = ["Amsterdam"]
CATEGORIES = ["accommodation", "restaurant", "poi", "attraction"]
#  "Barcelona", "Berlin"
# API Configuration
BASE_URL = "http://tour-pedia.org/api"
GET_PLACES_ENDPOINT = "getPlaces"
GET_REVIEWS_ENDPOINT = "getReviewsByPlaceId"
GET_DETAILS_ENDPOINT = "getPlaceDetails"

# Number of threads (adjust as needed, up to 192)
MAX_WORKERS = 60

def get_data(endpoint, params=None):
    """
    Query one Tour-Pedia API endpoint and return the decoded JSON reply.

    Args:
        endpoint (str): Endpoint name, joined onto BASE_URL.
        params (dict, optional): Query parameters, URL-encoded onto the call.

    Returns:
        dict or list: The parsed JSON reply, or None when the request raised a
        requests exception; the failure is reported on stderr.
    """
    url_parts = [f"{BASE_URL}/{endpoint}"]
    if params:
        url_parts.append(urlencode(params))
    url = "?".join(url_parts)

    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        # Success is logged once the status check passes, before decoding.
        print(f"Success: {url}")
        return reply.json()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for URL: {url}\nError: {err}", file=sys.stderr)
    return None

def process_place(place, category, location):
    """
    Build the output record for one place from its reviews and details.

    Args:
        place (dict): Place data; 'id' (None if absent) and 'name' ('N/A' if
            absent) are read from it.
        category (str): Category the place was listed under.
        location (str): Location the place was listed under.

    Returns:
        dict: A new record with keys location, category, place_id, place_name,
        reviews_text (list of review texts, [] when none came back) and
        details_text (the details payload, {} when none came back).
    """
    identifier = place.get('id')
    record = {
        "location": location,
        "category": category,
        "place_id": identifier,
        "place_name": place.get('name', 'N/A'),
    }

    # Reviews: keep only each review's text, '' where a review has none.
    review_payload = get_data(GET_REVIEWS_ENDPOINT, {"placeId": identifier})
    record["reviews_text"] = (
        [item.get('text', '') for item in review_payload] if review_payload else []
    )

    # Details: stored as returned by the API.
    detail_payload = get_data(GET_DETAILS_ENDPOINT, {"id": identifier})
    record["details_text"] = detail_payload or {}

    # Short pause after each place's two requests.
    time.sleep(0.1)

    return record

def fetch_data_for_category(location, category):
    """
    Fetch and process every place for one location/category pair.

    The place listing is fetched once, then each place is passed to
    process_place on a pool of MAX_WORKERS threads. Results are gathered in
    listing order, so repeated runs yield rows in the same order regardless of
    which worker finishes first.

    Args:
        location (str): Location name.
        category (str): Category name.

    Returns:
        list: Processed place records in listing order; [] when the listing
        could not be retrieved. Places whose processing raised are skipped and
        reported on stderr together with their id.
    """
    places = get_data(GET_PLACES_ENDPOINT, {"location": location, "category": category})

    if not places:
        print(f"Failed to retrieve data for {location} - {category}", file=sys.stderr)
        return []

    print(f"Retrieved {len(places)} places for {location} - {category}")

    processed_places = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(process_place, place, category, location)
            for place in places
        ]
        # All tasks are already running concurrently; reading the futures in
        # submission order only fixes the order of the collected rows.
        for place, future in zip(places, futures):
            try:
                result = future.result()
            except Exception as e:
                print(
                    f"Error processing place ID {place.get('id')} in {location} - {category}: {e}",
                    file=sys.stderr,
                )
                continue
            if result:
                processed_places.append(result)

    return processed_places

def main():
    """
    Crawl every LOCATIONS x CATEGORIES pair, save the rows and print a summary.

    All records returned by fetch_data_for_category are combined into one
    DataFrame and written to 'amsterdam_barcelona_berlin_data.csv'. The frame's
    info, head, describe() and null counts are then printed, followed by the
    first row's first review and its details. Prints "No data was fetched."
    when nothing came back.
    """
    all_data = []

    for location in LOCATIONS:
        for category in CATEGORIES:
            all_data.extend(fetch_data_for_category(location, category))

    if not all_data:
        print("No data was fetched.")
        return

    df = pd.DataFrame(all_data)
    csv_filename = 'amsterdam_barcelona_berlin_data.csv'
    df.to_csv(csv_filename, index=False)
    print(f"\nData saved to '{csv_filename}'")

    # Optional: Display DataFrame info and sample.
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; print(df.info()) would add a stray "None" line.
    print("\nDataFrame Info:")
    df.info()

    print("\nFirst few rows of the DataFrame:")
    print(df.head())

    print("\nBasic statistics of numerical columns:")
    print(df.describe())

    print("\nMissing values in each column:")
    print(df.isnull().sum())

    # Sample of reviews and details
    if not df.empty:
        first_entry = df.iloc[0]
        print("\nSample of reviews for the first entry:")
        if first_entry['reviews_text']:
            print(first_entry['reviews_text'][0])
        else:
            print("No reviews available.")

        print("\nSample of details for the first entry:")
        if first_entry['details_text']:
            print(first_entry['details_text'])
        else:
            print("No details available.")

# Entry point: main() runs when this code executes as the top-level module
# (as it does when the cell is run) and is skipped when the code is imported.
if __name__ == '__main__':
    main()


Success: http://tour-pedia.org/api/getPlaces?location=Amsterdam&category=accommodation
Retrieved 1393 places for Amsterdam - accommodation
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223778
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223793
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223771
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223776
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223896
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223818
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223779
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223829
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223835
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223847
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223849
Success: http://tour-pedia.org/api/getReviewsByPlaceId?placeId=223861
Success: http://tour-