In [6]:
import os
import logging
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup

In [None]:
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

def convert_to_oyo_ids(directory):
    """
    Builds OYO hotel page URLs from the CSV file names found one level below a directory.

    Every immediate sub-folder of ``directory`` is scanned for entries whose name
    ends in ``.csv``. The file name without its extension is appended to
    ``https://www.oyorooms.com/`` to form the URL. Files placed directly in
    ``directory`` (not in a sub-folder) are ignored.

    Folders and files are visited in sorted order. ``os.listdir`` returns entries
    in arbitrary order, so sorting is what makes the result reproducible across
    runs and machines.

    Args:
        directory (str): The directory path containing the per-folder CSV files.

    Returns:
        pandas.DataFrame: A DataFrame with a single ``'OYO ID'`` column holding the
        URLs, or ``None`` if an error occurred (the error is logged).
    """
    try:
        oyo_ids = []  # URLs, in sorted folder order then sorted file order

        for folder in sorted(os.listdir(directory)):
            folder_path = os.path.join(directory, folder)
            if not os.path.isdir(folder_path):
                # Only sub-folders are scanned; top-level files are skipped.
                continue

            for entry in sorted(os.listdir(folder_path)):
                if not entry.endswith('.csv'):
                    continue
                file_name = os.path.splitext(entry)[0]  # name without the extension
                oyo_ids.append("https://www.oyorooms.com/{}".format(file_name))

        return pd.DataFrame({'OYO ID': oyo_ids})

    except Exception as e:
        # Callers rely on a None return (not an exception) to detect failure.
        logging.error("An error occurred while converting CSV file names to OYO IDs: %s", e)
        return None


# Example usage: build the URL table from the city-wise CSV dump and report the outcome.
directory_path = './oyo_reviews_city_wise_csv'
result_df = convert_to_oyo_ids(directory_path)
if result_df is None:
    # convert_to_oyo_ids signals failure by returning None.
    logging.warning("Conversion failed.")
else:
    logging.info("Conversion completed successfully.")
    print(result_df)


In [None]:
# Configure logging for this cell (INFO level and above).
# NOTE(review): the same basicConfig call already appears in an earlier cell. Per the
# standard-library docs, basicConfig does nothing when the root logger already has
# handlers configured, so this line only has an effect if this cell runs first.
logging.basicConfig(level=logging.INFO)

def _text_or_none(soup, tag, css_class):
    """Returns the stripped text of the first ``tag`` with ``css_class``, or None if absent."""
    element = soup.find(tag, {'class': css_class})
    return element.text.strip() if element is not None else None


def _parse_hotel_page(url, soup):
    """Builds one output row (a dict) from the parsed HTML of a single hotel page."""
    return {
        'Hotel ID': url,
        # A missing name element yields None instead of aborting the whole run.
        'Hotel Name': _text_or_none(soup, 'h1', 'c-1wj1luj'),
        'Is New': 1 if _text_or_none(soup, 'div', 'c-15duxhm') == 'NEW' else 0,
        'Total Rating': _text_or_none(soup, 'div', 'c-1qcdse5'),
        'Price': _text_or_none(
            soup, 'span', 'listingPrice__finalPrice listingPrice__finalPrice--black'),
        'Original Price': _text_or_none(soup, 'span', 'listingPrice__slashedPrice d-body-lg'),
        'Discount': _text_or_none(soup, 'span', 'listingPrice__percentage'),
    }


def scrape_oyo_hotel_data(urls, output_path='oyo_hotel_data.csv', timeout=30):
    """
    Scrapes OYO hotel data from a list of URLs and appends each row to a CSV file.

    Each successfully fetched page is parsed into one row and appended to
    ``output_path`` immediately, so rows already scraped survive an interrupted
    run. The file is written only by these per-row appends; it is not rewritten
    at the end, so every scraped row is stored exactly once.

    URLs that cannot be fetched (network error, timeout, or a non-200 status)
    are logged and skipped; they do not stop the run.

    Args:
        urls (iterable): URLs to scrape (a list or a pandas Series both work).
        output_path (str): CSV file that rows are appended to. A header is
            written only when the file does not exist yet or is empty.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame: Rows that were already in ``output_path`` before the
        call, followed by the rows scraped during this call. Returns ``None``
        if an unexpected error occurred (the error is logged).
    """
    # Constant for every request, so build it once rather than per URL.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        # Snapshot earlier rows up front; the loop below only appends to the file.
        existing_df = None
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            existing_df = pd.read_csv(output_path)

        data = []  # Rows scraped during this call

        for i, url in enumerate(urls):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # One unreachable page must not discard the rest of the run.
                logging.warning("Failed to fetch the data for URL: %s (%s)", url, e)
            else:
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    row = _parse_hotel_page(url, soup)
                    data.append(row)
                    logging.info("Scraped data for URL: %s", url)

                    # Persist right away; write a header only for a new or empty file.
                    write_header = (
                        not os.path.exists(output_path)
                        or os.path.getsize(output_path) == 0
                    )
                    pd.DataFrame([row]).to_csv(
                        output_path, mode='a', header=write_header, index=False)
                else:
                    logging.warning("Failed to fetch the data for URL: %s", url)

            # Pause briefly after every 500 URLs.
            if (i + 1) % 500 == 0:
                logging.info("Taking a 3-second break...")
                time.sleep(3)

        new_df = pd.DataFrame(data)
        if existing_df is None:
            return new_df
        if not data:
            return existing_df
        return pd.concat([existing_df, new_df], ignore_index=True)

    except Exception as e:
        # Callers rely on a None return (not an exception) to detect failure.
        logging.error("An error occurred while scraping OYO hotel data: %s", e)
        return None


# Example usage: scrape every URL produced by the conversion step above.
# result_df is None when that step failed; indexing None would raise TypeError,
# so the scrape is skipped in that case.
if result_df is None:
    logging.warning("Scraping skipped: no OYO IDs are available.")
    data = None
else:
    data = scrape_oyo_hotel_data(result_df['OYO ID'])
    if data is not None:
        logging.info("Scraping completed successfully.")
        print(data)
    else:
        logging.warning("Scraping failed.")
