In [None]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import time

# Sequential scraper: visit every book page listed in 'final_book_url.csv'
# (column 'Book_URL') and collect the reviewer profile links found on it.
books_with_ids = pd.read_csv('final_book_url.csv')
temp = books_with_ids['Book_URL'].tolist()

# One shared session so connection pooling and the retry policy below apply
# to every request.
session = requests.Session()

# Retry policy: up to 5 retries for the listed 5xx statuses. urllib3 spaces
# the retries with an exponential backoff scaled by `backoff_factor` (it is
# not a fixed 1-second pause). With raise_on_status=False the final bad
# response is handed back instead of raising, so the status is checked
# explicitly after each request below.
retries = Retry(total=5,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504],
                raise_on_status=False)

# Mount the retry adapter for both schemes.
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

m_urls = []  # Reviewer profile links, in first-seen order
visited = set()  # Reviewer profile links already stored in m_urls

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# dict.fromkeys() drops repeated book URLs while keeping their original
# order, so the same page is never downloaded twice.
for x in dict.fromkeys(temp):
    try:
        # Use the session to make requests with retry logic
        response = session.get(x, headers=headers, timeout=10)

        # Treat 4xx / exhausted-retry 5xx answers as failures instead of
        # silently parsing an error page. HTTPError is a RequestException,
        # so it is reported by the handler below.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        tt = soup.find_all('div', class_='ReviewerProfile__name')

        # Each matching div is expected to wrap an <a> pointing at the
        # reviewer's profile; keep only links not collected before.
        for div in tt:
            a_tag = div.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href = a_tag['href']
                if href not in visited:
                    m_urls.append(href)
                    visited.add(href)

    except requests.exceptions.RequestException as e:
        print(f"Request failed for: {x}, error: {e}")

    finally:
        # Pause between requests even when the previous one failed, so a run
        # of errors does not turn into a burst of back-to-back requests.
        time.sleep(1)

# Save the extracted URLs to a new CSV file called 'bengali_users.csv'
df = pd.DataFrame(m_urls, columns=['URLs'])
df.to_csv('bengali_users.csv', index=False)

print(f"Total URLs extracted: {len(m_urls)}")
print("Scraping completed. Data saved to 'bengali_users.csv'.")


In [1]:
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# --- Inputs -----------------------------------------------------------------
# Book page URLs come from the 'Book_URL' column of 'final_book_url.csv'.
books_with_ids = pd.read_csv('final_book_url.csv')
temp = books_with_ids['Book_URL'].tolist()

# --- Shared result containers -------------------------------------------------
m_urls = []      # every reviewer profile link collected so far
visited = set()  # links already collected, used to filter out repeats

# --- HTTP configuration -------------------------------------------------------
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Retry up to 5 times on the listed 5xx statuses; urllib3 spaces the attempts
# with an exponential backoff scaled by `backoff_factor`. Because
# raise_on_status is False, the last response is returned rather than raising
# once the retries are used up.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504],
    raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries)

# A single session carries the retry adapter for both URL schemes.
session = requests.Session()
session.mount('http://', adapter)
session.mount('https://', adapter)

# Serializes the check-then-add on the shared `visited` set. scrape_url runs
# on several worker threads at once; without the lock two threads can both
# see the same href as "new" and both return it, producing duplicate rows.
_visited_lock = threading.Lock()


def scrape_url(x):
    """Fetch one book page and return the reviewer profile links new to this run.

    Args:
        x: URL of the book page to download.

    Returns:
        list: href values taken from the <a> inside each
        ``div.ReviewerProfile__name`` element that were not already present
        in the module-level ``visited`` set. Each returned href is added to
        ``visited`` as a side effect. The list is empty (or partial) when the
        request fails.

    Request errors, including HTTP error statuses, are printed rather than
    raised so that one bad page does not abort the whole run.
    """
    local_m_urls = []
    try:
        # Use the shared session so the mounted retry policy applies.
        response = session.get(x, headers=headers, timeout=10)

        # The retry policy returns the last response instead of raising, so
        # check the status here. HTTPError is a RequestException and is
        # reported by the handler below rather than parsed as a valid page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        tt = soup.find_all('div', class_='ReviewerProfile__name')

        # Loop through each div and extract the href from the <a> tag
        for div in tt:
            a_tag = div.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href = a_tag['href']
                # Membership test and insertion must happen atomically.
                with _visited_lock:
                    if href not in visited:
                        visited.add(href)
                        local_m_urls.append(href)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for: {x}, error: {e}")
    return local_m_urls

# Fan the book pages out over a pool of 10 worker threads and gather the
# reviewer links as each page finishes (completion order, not input order).
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map every submitted future back to its book URL for error reporting.
    futures = dict((executor.submit(scrape_url, url), url) for url in temp)

    for future in as_completed(futures):
        try:
            result = future.result()
        except Exception as e:
            print(f"Error processing URL: {futures[future]}, error: {e}")
        else:
            m_urls.extend(result)

# Write the collected links to 'bengali_users.csv' under a single 'URLs' column.
df = pd.DataFrame({'URLs': m_urls})
df.to_csv('bengali_users.csv', index=False)

print(f"Total URLs extracted: {len(m_urls)}")
print("Scraping completed. Data saved to 'bengali_users.csv'.")


Request failed for: https://www.goodreads.com/book/show/75613852-anapus, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/18490745, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/58089567, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/205818432---bangabandhu-muktijuddho, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/170004951, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/53857617, error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out.
Request failed for: https://www.goodreads.com/book/show/168364474-smritikatha-o-anyanya, 