In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Define headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Read the input CSV file containing URLs (output.csv)
input_file = "output.csv"  # Replace with your input CSV file name
output_file = "final_book_url.csv"  # The file where final book URLs will be saved

# Load the URLs into a DataFrame
df = pd.read_csv(input_file)

# Initialize the list to store unique book URLs
unique_book_urls = set()

# Function to scrape books from a given URL and page
def scrape_books(url):
    page = 1
    local_book_urls = set()  # Local set to store book URLs for this thread
    print(f"Processing URL: {url}")  # Log the current URL being processed
    while True:
        print(f"Entering page {page} for URL: {url}")  # Print the page number being processed
        
        # Construct the full URL with the current page
        full_url = f"{url}&page={page}&per_page=30"
        print(full_url)
        # Request the page content
        response = requests.get(full_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all book items on the page
        book_links = soup.find_all('a', {"class": "bookTitle"})
        
        if not book_links:  # If no book links found, stop processing
            break
        
        # Loop through each book link and extract the book URL
        for link in book_links:
            book_url = 'https://www.goodreads.com' + link['href']
            print(f"Found book URL: {book_url}")  # Print the book URL before adding
            local_book_urls.add(book_url)  # Add the URL to the local set
        
        page += 1  # Move to the next page
    
    return local_book_urls  # Return the set of URLs scraped by this thread

# Function to process each author URL and collect all unique book URLs
def process_author_urls():
    global unique_book_urls
    # Using ThreadPoolExecutor to run multiple threads for each author URL
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = executor.map(scrape_books, df['transformed_url'])
    
    # Combine results from all threads into the unique_book_urls set
    for result in results:
        unique_book_urls.update(result)

# Run the scraping process
process_author_urls()

# Convert the set of unique URLs into a list
final_book_urls = list(unique_book_urls)

# Convert the list into a pandas DataFrame
final_df = pd.DataFrame(final_book_urls, columns=["Book_URL"])

# Save the DataFrame to a CSV file
#final_df.to_csv(output_file, index=False)

# Print the number of unique book URLs scraped
print(f"Total unique book URLs scraped: {len(final_book_urls)}")


Processing URL: https://www.goodreads.com/author/list/6523270.Shahidullah_Kaiser?page=1&per_page=30
Entering page 1 for URL: https://www.goodreads.com/author/list/6523270.Shahidullah_Kaiser?page=1&per_page=30
https://www.goodreads.com/author/list/6523270.Shahidullah_Kaiser?page=1&per_page=30&page=1&per_page=30
Processing URL: https://www.goodreads.com/author/list/7032598.Anwar_Pasha?page=1&per_page=30
Entering page 1 for URL: https://www.goodreads.com/author/list/7032598.Anwar_Pasha?page=1&per_page=30
https://www.goodreads.com/author/list/7032598.Anwar_Pasha?page=1&per_page=30&page=1&per_page=30
Processing URL: https://www.goodreads.com/author/list/4243396.Jahanara_Imam?page=1&per_page=30
Entering page 1 for URL: https://www.goodreads.com/author/list/4243396.Jahanara_Imam?page=1&per_page=30
https://www.goodreads.com/author/list/4243396.Jahanara_Imam?page=1&per_page=30&page=1&per_page=30
Processing URL: https://www.goodreads.com/author/list/7359259.M_R_Akhtar_Mukul?page=1&per_page=30
En

In [11]:
import pandas as pd

# Read the final_book_url.csv file
input_file = "final_book_url.csv"  # Replace with your input CSV file name
output_file = "unique_final_book_url.csv"  # Output file for unique book URLs

# Load the CSV into a DataFrame
df = pd.read_csv(input_file)

# Ensure the 'Book_URL' column has unique values by using drop_duplicates
df_unique = df.drop_duplicates(subset=["Book_URL"])

# Save the DataFrame with unique URLs to a new CSV file
df_unique.to_csv(output_file, index=False)

# Print the number of unique book URLs
print(f"Total unique book URLs: {len(df_unique)}")
print(df_unique)

Total unique book URLs: 11418
                                                Book_URL
0      https://www.goodreads.com/book/show/20332546-k...
1           https://www.goodreads.com/book/show/20560971
2      https://www.goodreads.com/book/show/128319262-...
3           https://www.goodreads.com/book/show/38921982
4           https://www.goodreads.com/book/show/38894896
...                                                  ...
11413  https://www.goodreads.com/book/show/136088636-...
11414       https://www.goodreads.com/book/show/28512605
11415  https://www.goodreads.com/book/show/170123720-...
11416       https://www.goodreads.com/book/show/20746280
11417  https://www.goodreads.com/book/show/168162563-...

[11418 rows x 1 columns]


In [29]:
import requests
from bs4 import BeautifulSoup
import csv

# Define the single URL to scrape
url = "https://www.goodreads.com/author_followings?id=6523270&method=get&page=1"  
# output_file = "extracted_user_urls.csv"

# Open a CSV file for writing the extracted user URLs
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["User URLs"])  # Header for the output file

    try:
        # Fetch the page content
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract user URLs
        user_links = soup.find_all("a", {"rel": "acquaintance"})  # Adjust selector if needed
        for link in user_links:
            user_url = f"https://www.goodreads.com{link['href']}"
            writer.writerow([user_url])  # Write the extracted URL to the CSV
        print(f"Processed: {url}")
    
    except Exception as e:
        print(f"Error processing {url}: {e}")

print(f"User URLs extracted and saved to {output_file}.")


Processed: https://www.goodreads.com/author_followings?id=6523270&method=get&page=1
User URLs extracted and saved to extracted_user_urls.csv.


In [20]:
urls

'https://www.goodreads.com/author_followings?id=6523270&method=get&page=1'

In [28]:
import csv
import pandas as pd
from playwright.async_api import async_playwright
import nest_asyncio
import asyncio

# Allow nested event loops (required for Jupyter Notebooks)
nest_asyncio.apply()

# Function to scrape user URLs from a given page
async def scrape_user_urls(url, page):
    try:
        await page.goto(url, timeout=60000)  # Open the URL with a timeout of 60 seconds
        await page.wait_for_selector("a[rel='acquaintance']", timeout=10000)  # Wait for user URLs to load

        # Extract all user URLs from the page
        user_links = await page.query_selector_all("a[rel='acquaintance']")
        user_urls = []
        for link in user_links:
            href = await link.get_attribute("href")
            if href:
                user_urls.append(f"https://www.goodreads.com{href}")

        return user_urls
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []

# Main function to process all URLs
async def main():
    # Step 1: Read the URLs from the input CSV file
    input_file = 'output2.csv'  # Update with your actual file name if necessary
    urls_df = pd.read_csv(input_file, header=None)
    urls_df = urls_df.drop_duplicates()  # Remove duplicate URLs
    urls = urls_df[0].tolist()  # Replace 0 if the URL column index differs

    # Step 2: Prepare an output CSV file to save extracted user URLs
    output_file = 'user_urls.csv'
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['User URLs'])  # Header row

        # Step 3: Start Playwright for web scraping
        async with async_playwright() as p:
            # Launch a browser
            browser = await p.chromium.launch(headless=False)  # Use headless=True for headless browsing
            context = await browser.new_context()
            page = await context.new_page()

            # Log in to Goodreads
            login_url = "https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in&siteState=eyJyZXR1cm5fdXJsIjoiaHR0cHM6Ly93d3cuZ29vZHJlYWRzLmNvbS8ifQ%3D%3D"
            await page.goto(login_url)
            await page.wait_for_selector("input[name='email']", timeout=10000)
            await page.fill("input[name='email']", "robiulriyadh66@gmail.com")  # Replace with your email
            await page.fill("input[name='password']", "V!MZUhP8TL39-jH")        # Replace with your password
            await page.click("input[type='submit']")
            await page.wait_for_selector(".siteHeader__topLevelLink", timeout=30000)  # Wait for login to complete

            # Step 4: Process each URL and extract user URLs
            for url in urls:
                print(f"Processing {url}...")
                user_urls = await scrape_user_urls(url, page)
                if user_urls:
                    # Write each user URL in a new row
                    for user_url in user_urls:
                        writer.writerow([user_url])
                else:
                    writer.writerow(["No user URLs found"])
                print(f"Scraped {len(user_urls)} user URLs from {url}")

            # Close the browser
            await browser.close()

    print(f"User URLs saved to {output_file}.")

# Run the async function
await main()


Processing transformed_url...
Error scraping transformed_url: Page.goto: Protocol error (Page.navigate): Cannot navigate to invalid URL
Call log:
navigating to "transformed_url", waiting until "load"

Scraped 0 user URLs from transformed_url
Processing https://www.goodreads.com/author_followings?id=6523270&method=get&page=1...
Scraped 30 user URLs from https://www.goodreads.com/author_followings?id=6523270&method=get&page=1
Processing https://www.goodreads.com/author_followings?id=7032598&method=get&page=1...
Scraped 18 user URLs from https://www.goodreads.com/author_followings?id=7032598&method=get&page=1
Processing https://www.goodreads.com/author_followings?id=4243396&method=get&page=1...
Scraped 30 user URLs from https://www.goodreads.com/author_followings?id=4243396&method=get&page=1
Processing https://www.goodreads.com/author_followings?id=7359259&method=get&page=1...
Scraped 13 user URLs from https://www.goodreads.com/author_followings?id=7359259&method=get&page=1
Processing htt

In [31]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

# Allow nested event loops (required for Jupyter Notebooks)
nest_asyncio.apply()

# Function to scrape user URLs from the given URL
async def scrape_user_urls(url, page):
    try:
        await page.goto(url, timeout=60000)  # Open the URL with a timeout of 60 seconds
        await page.wait_for_selector("a[rel='acquaintance']", timeout=30000)  # Increased timeout to 30 seconds

        # Extract all user URLs from the page
        user_links = await page.query_selector_all("a[rel='acquaintance']")
        user_urls = []
        for link in user_links:
            href = await link.get_attribute("href")
            if href:
                user_urls.append(f"https://www.goodreads.com{href}")

        return user_urls
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []

# Main function to process the specific URL
async def main():
    # Step 1: Start Playwright for web scraping
    async with async_playwright() as p:
        # Launch a browser
        browser = await p.chromium.launch(headless=False)  # Use headless=True for headless browsing
        context = await browser.new_context()
        page = await context.new_page()

        # Log in to Goodreads
        login_url = "https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in&siteState=eyJyZXR1cm5fdXJsIjoiaHR0cHM6Ly93d3cuZ29vZHJlYWRzLmNvbS8ifQ%3D%3D"
        await page.goto(login_url)
        await page.wait_for_selector("input[name='email']", timeout=10000)
        await page.fill("input[name='email']", "robiulriyadh66@gmail.com")  # Replace with your email
        await page.fill("input[name='password']", "V!MZUhP8TL39-jH")        # Replace with your password
        await page.click("input[type='submit']")
        await page.wait_for_selector(".siteHeader__topLevelLink", timeout=30000)  # Wait for login to complete
        logged_in = await page.query_selector(".siteHeader__topLevelLink")
        if not logged_in:
            print("Login failed!")
            return

        # Step 2: Loop through pages from 1 to 100
        for page_num in range(1, 101):
            url = f"https://www.goodreads.com/author_followings?id=6983656&method=get&page={page_num}"  # Change the page number dynamically
            print(f"Processing {url}...")

            # Step 3: Scrape user URLs from the current page
            user_urls = await scrape_user_urls(url, page)
            
            # Print the results
            if user_urls:
                for user_url in user_urls:
                    print(f"Scraped User URL: {user_url}")
            else:
                print("No user URLs found.")

        # Close the browser
        await browser.close()

# Run the async function
await main()


Processing https://www.goodreads.com/author_followings?id=6983656&method=get&page=1...
Scraped User URL: https://www.goodreads.com/user/show/139637-anik
Scraped User URL: https://www.goodreads.com/user/show/797827-sabila-enun
Scraped User URL: https://www.goodreads.com/user/show/798147-sarina
Scraped User URL: https://www.goodreads.com/user/show/830221-ashik-uzzaman
Scraped User URL: https://www.goodreads.com/user/show/1144887-brianna
Scraped User URL: https://www.goodreads.com/user/show/1208065-tariqul-ponir
Scraped User URL: https://www.goodreads.com/user/show/1547368-orin
Scraped User URL: https://www.goodreads.com/user/show/1970621-saminadossani
Scraped User URL: https://www.goodreads.com/user/show/2070502-tapendu
Scraped User URL: https://www.goodreads.com/user/show/2448670-asif-tariq
Scraped User URL: https://www.goodreads.com/user/show/2794691-ash
Scraped User URL: https://www.goodreads.com/user/show/2993169-krishnendu
Scraped User URL: https://www.goodreads.com/user/show/315739

  self._nodes_by_line[lineno].append(node)


CancelledError: 

In [32]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
import csv
import pandas as pd

# Allow nested event loops (required for Jupyter Notebooks)
nest_asyncio.apply()

# Function to scrape user URLs from the given URL
async def scrape_user_urls(url, page):
    try:
        await page.goto(url, timeout=60000)  # Open the URL with a timeout of 60 seconds
        await page.wait_for_selector("a[rel='acquaintance']", timeout=30000)  # Increased timeout to 30 seconds

        # Extract all user URLs from the page
        user_links = await page.query_selector_all("a[rel='acquaintance']")
        user_urls = []
        for link in user_links:
            href = await link.get_attribute("href")
            if href:
                user_urls.append(f"https://www.goodreads.com{href}")

        return user_urls
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []

# Function to read existing URLs from the output CSV to avoid duplicates
def read_existing_urls(output_file):
    try:
        # Read existing URLs from the final_user_url.csv to avoid duplicates
        existing_urls_df = pd.read_csv(output_file)
        return set(existing_urls_df['User URLs'].tolist())  # Return as a set for fast lookup
    except FileNotFoundError:
        return set()  # If file doesn't exist, return an empty set

# Main function to process the URLs from the CSV
async def main():
    # Step 1: Read the URLs from the input CSV (output2.csv)
    input_file = 'output2.csv'  # Input file containing URLs
    output_file = 'final_user_url.csv'  # Output file to store unique user URLs

    # Read existing user URLs from the final_user_url.csv to avoid duplicates
    existing_urls = read_existing_urls(output_file)

    # Step 2: Start Playwright for web scraping
    async with async_playwright() as p:
        # Launch a browser
        browser = await p.chromium.launch(headless=False)  # Use headless=True for headless browsing
        context = await browser.new_context()
        page = await context.new_page()

        # Log in to Goodreads
        login_url = "https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in&siteState=eyJyZXR1cm5fdXJsIjoiaHR0cHM6Ly93d3cuZ29vZHJlYWRzLmNvbS8ifQ%3D%3D"
        await page.goto(login_url)
        await page.wait_for_selector("input[name='email']", timeout=10000)
        await page.fill("input[name='email']", "robiulriyadh66@gmail.com")  # Replace with your email
        await page.fill("input[name='password']", "V!MZUhP8TL39-jH")        # Replace with your password
        await page.click("input[type='submit']")
        await page.wait_for_selector(".siteHeader__topLevelLink", timeout=30000)  # Wait for login to complete
        logged_in = await page.query_selector(".siteHeader__topLevelLink")
        if not logged_in:
            print("Login failed!")
            return

        # Step 3: Read the URLs from output2.csv (access the 'transformed_url' column)
        urls_df = pd.read_csv(input_file)  # Read the entire CSV
        if 'transformed_url' not in urls_df.columns:
            print("Error: 'transformed_url' column not found in the input CSV.")
            return
        
        urls = urls_df['transformed_url'].drop_duplicates().tolist()  # Get unique URLs from 'transformed_url' column

        # Open the output file to append new user URLs
        with open(output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            
            # Write header if the file is empty
            if file.tell() == 0:
                writer.writerow(['User URLs'])

            # Step 4: Loop through the URLs from transformed_url column
            for url in urls:
                print(f"Processing {url}...")

                # Step 5: Loop through pages 1 to 100 for each URL
                for page_num in range(1, 101):
                    page_url = f"{url}&page={page_num}"
                    print(f"Scraping page: {page_url}")

                    # Scrape user URLs from the current page
                    user_urls = await scrape_user_urls(page_url, page)

                    # Step 6: Write new user URLs to the CSV if they are not already in the existing set
                    for user_url in user_urls:
                        if user_url not in existing_urls:
                            writer.writerow([user_url])
                            existing_urls.add(user_url)  # Add to the existing set to prevent duplicates
                            print(f"Scraped and saved User URL: {user_url}")
                        else:
                            print(f"User URL already exists: {user_url}")

                    # Check if we reached the last page (no user URLs found)
                    if not user_urls:
                        print(f"No user URLs found on page {page_num}.")
                        break  # Stop if there are no more user URLs on the current page

        # Close the browser
        await browser.close()

# Run the async function
await main()


Processing https://www.goodreads.com/author_followings?id=6523270&method=get&page=1...
Scraping page: https://www.goodreads.com/author_followings?id=6523270&method=get&page=1&page=1
Scraped and saved User URL: https://www.goodreads.com/user/show/2794691-ash
Scraped and saved User URL: https://www.goodreads.com/user/show/10790225-rene
Scraped and saved User URL: https://www.goodreads.com/user/show/12896226-kuhu-mannan
Scraped and saved User URL: https://www.goodreads.com/user/show/22291480-sajana-orthy
Scraped and saved User URL: https://www.goodreads.com/user/show/31414585-anan
Scraped and saved User URL: https://www.goodreads.com/user/show/31540933-tasnim-shahriar
Scraped and saved User URL: https://www.goodreads.com/user/show/34177801-shafat
Scraped and saved User URL: https://www.goodreads.com/user/show/35149384-farzana-raisa
Scraped and saved User URL: https://www.goodreads.com/user/show/41316013-s-m-hridoy
Scraped and saved User URL: https://www.goodreads.com/user/show/43890943-mu

CancelledError: 