In [1]:
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Browser-like headers sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Retry policy: at most 5 attempts, backoff factor of 1, applied to
# rate-limit (429) and server-error (5xx) responses
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry)

# One shared session; the same retrying adapter serves both URL schemes
session = requests.Session()
for scheme in ('https://', 'http://'):
    session.mount(scheme, adapter)

# Accumulates the book records returned by scrape_page
master_list = []

# Function to scrape a single page
def scrape_page(i):
    """Scrape one page of the Goodreads "bengali" shelf.

    The page is fetched through the module-level ``session`` using the
    module-level ``headers``; every ``<a class="bookTitle">`` link on it
    becomes one record.

    Args:
        i: Page number placed in the ``page`` query parameter.

    Returns:
        A list of ``{'Book_ID': str, 'URL': str}`` dicts in document order.
        An empty list is returned (and the error printed) when the request
        fails with a ``requests`` exception.
    """
    try:
        url = f"https://www.goodreads.com/shelf/show/bengali?page={i}"
        response = session.get(url, headers=headers, timeout=10)  # 10 second timeout
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        page_books = []

        for link in soup.find_all('a', {"class": "bookTitle"}):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot yield a URL or an ID
                continue

            # Last path segment, ignoring any query string or trailing slash
            slug = href.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]

            # Goodreads slugs look like "12345.Title" or "12345-title"; the ID
            # is the leading run of digits. Fall back to the text before the
            # first '.' when the slug does not start with a digit.
            id_match = re.match(r'\d+', slug)
            book_id = id_match.group() if id_match else slug.split('.')[0]

            page_books.append({
                'Book_ID': book_id,
                'URL': 'https://www.goodreads.com' + href
            })

        return page_books
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while scraping page {i}: {e}")
        return []

# Scrape shelf pages 1-25 concurrently with 5 worker threads (adjust as needed)
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(scrape_page, i) for i in range(1, 25+1)]  # Adjust range as needed

    # Collect results in submission (page) order rather than completion
    # order, so master_list and the CSV are identical from run to run
    # instead of depending on which thread happened to finish first.
    for future in futures:
        master_list.extend(future.result())

# Pin the columns so that a run in which every page failed still writes a
# CSV with the expected header instead of an empty, unreadable file.
df = pd.DataFrame(master_list, columns=['Book_ID', 'URL'])

# Save the data to a CSV file
df.to_csv('books_with_ids.csv', index=False)

# Check the number of books scraped
print(len(master_list))


1250


In [2]:
# Pull the page URL out of every scraped book record, keeping list order
temp = [book['URL'] for book in master_list]

print(len(temp))

1250


In [3]:
# for x in temp:  
#     response = requests.get(x, headers=headers)
#     soup     = BeautifulSoup(response.content, 'html.parser')
#     tt = soup.find_all('div','ReviewerProfile__name')
#     print(tt)

In [4]:
# import time
# m_urls = []

# for x in temp:
#     try:
#         response = requests.get(x, headers=headers, timeout=10)  # Setting a timeout
#         soup = BeautifulSoup(response.content, 'html.parser')
#         tt = soup.find_all('div', class_='ReviewerProfile__name')
        
#         # List to store the URLs
#         urls = []
        
#         # Loop through each div and extract the href from the <a> tag
#         for div in tt:
#             a_tag = div.find('a')
#             if a_tag and 'href' in a_tag.attrs:
#                 urls.append(a_tag['href'])
        
#         # Append URLs to the master list
#         m_urls.extend(urls)

#         # Introduce a delay of 1 second between requests
#         time.sleep(1)
    
#     except requests.exceptions.Timeout:
#         print(f"Request timed out for: {x}")


In [5]:
# Convert the urls into a pandas DataFrame
# df = pd.DataFrame(m_urls)
# df.to_csv('User_Url1.csv', index=False)

In [6]:
# len(m_urls)

In [8]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time

# Create a session object
session = requests.Session()

# Configure retries: up to 5 attempts with backoff on rate limiting (429)
# and server errors (5xx). raise_on_status=False makes the adapter hand back
# the last response once retries are exhausted; raise_for_status() in the
# loop below then reports it as a failure.
retries = Retry(total=5,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504],
                raise_on_status=False)

# Mount the retry adapter to the session
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

m_urls = []  # Reviewer profile hrefs collected from every book page

for book_url in temp:
    try:
        # Use the session to make requests with retry logic
        response = session.get(book_url, headers=headers, timeout=10)
        # Without this check an error page (4xx, or 5xx after the retries
        # ran out) would be parsed like a real book page and silently
        # contribute zero URLs.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each reviewer name sits in a div wrapping an <a> to the profile
        for div in soup.find_all('div', class_='ReviewerProfile__name'):
            a_tag = div.find('a')
            if a_tag and 'href' in a_tag.attrs:
                m_urls.append(a_tag['href'])

    except requests.exceptions.RequestException as e:
        print(f"Request failed for: {book_url}, error: {e}")

    # Pause 1 second between requests, after failures as well, so a failing
    # server is not hit in a tight loop
    time.sleep(1)

# After the loop, convert the URLs into a pandas DataFrame and save to a CSV
df = pd.DataFrame(m_urls)
df.to_csv('User_Url.csv', index=False)

print(f"Total URLs extracted: {len(m_urls)}")


Total URLs extracted: 37175


In [2]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import time

# Read the book page URLs back from the 'URL' column of `books_with_ids.csv`
books_with_ids = pd.read_csv('books_with_ids.csv')
temp = books_with_ids['URL'].tolist()

# Create a session object
session = requests.Session()

# Configure retries: up to 5 attempts with backoff on rate limiting (429)
# and server errors (5xx). raise_on_status=False makes the adapter hand back
# the last response once retries are exhausted; raise_for_status() in the
# loop below then reports it as a failure.
retries = Retry(total=5,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504],
                raise_on_status=False)

# Mount the retry adapter to the session
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

m_urls = []  # Unique reviewer profile hrefs, in first-seen order
visited = set()  # Reviewer hrefs already recorded in m_urls

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

try:
    # dict.fromkeys drops repeated book URLs while keeping their order, so
    # each book page is requested at most once
    for x in dict.fromkeys(temp):
        try:
            # Use the session to make requests with retry logic
            response = session.get(x, headers=headers, timeout=10)
            # Without this check an error page (4xx, or 5xx after the
            # retries ran out) would be parsed like a real book page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Each reviewer name sits in a div wrapping an <a> to the profile
            for div in soup.find_all('div', class_='ReviewerProfile__name'):
                a_tag = div.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    href = a_tag['href']
                    if href not in visited:  # Record each reviewer only once
                        m_urls.append(href)
                        visited.add(href)

        except requests.exceptions.RequestException as e:
            print(f"Request failed for: {x}, error: {e}")

        # Pause 1 second between requests, after failures as well
        time.sleep(1)
finally:
    # Always write what has been collected: interrupting this long loop
    # (e.g. with KeyboardInterrupt) would otherwise discard every result.
    df = pd.DataFrame(m_urls, columns=['URLs'])
    df.to_csv('bengali_users.csv', index=False)
    print(f"Total URLs extracted: {len(m_urls)}")

print("Scraping completed. Data saved to 'bengali_users.csv'.")


KeyboardInterrupt: 