In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Read the `temp` list from the `books_with_ids.csv` file
books_with_ids = pd.read_csv('final_book_url.csv')
temp = books_with_ids['Book_URL'].tolist()  # Assuming the file has a column named 'urls'

# Create a session object
session = requests.Session()

# Configure retries
retries = Retry(total=5,  # Retry up to 5 times
                backoff_factor=1,  # Wait 1 second between retries
                status_forcelist=[500, 502, 503, 504],  # Retry on specific status codes
                raise_on_status=False)

# Mount the retry adapter to the session
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

m_urls = []  # List to store all URLs
visited = set()  # Set to track visited URLs
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def scrape_url(x):
    """Scrape a single URL and extract relevant data."""
    local_m_urls = []
    try:
        # Use the session to make requests with retry logic
        response = session.get(x, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        tt = soup.find_all('div', class_='ReviewerProfile__name')

        # Loop through each div and extract the href from the <a> tag
        for div in tt:
            a_tag = div.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href = a_tag['href']
                if href not in visited:  # Check if the href is already visited
                    local_m_urls.append(href)
                    visited.add(href)  # Mark this href as visited
                    print(f"Unique User Found: {href}")  # Print the unique user
    except requests.exceptions.RequestException as e:
        print(f"Request failed for: {x}, error: {e}")
    return local_m_urls

# Use ThreadPoolExecutor for multithreading
with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers as needed
    futures = {executor.submit(scrape_url, url): url for url in temp}
    for future in as_completed(futures):
        try:
            result = future.result()
            m_urls.extend(result)
        except Exception as e:
            print(f"Error processing URL: {futures[future]}, error: {e}")

# Save the extracted URLs to a new CSV file called 'bengali_users.csv'
df = pd.DataFrame(m_urls, columns=['URLs'])
df.to_csv('bengali_users.csv', index=False)

print(f"Total URLs extracted: {len(m_urls)}")
print("Scraping completed. Data saved to 'bengali_users.csv'.")


Unique User Found: https://www.goodreads.com/user/show/61070444-shampa-paul
Unique User Found: https://www.goodreads.com/user/show/2736147-riju-ganguly
Unique User Found: https://www.goodreads.com/user/show/107928784-shariful-sadaf
Unique User Found: https://www.goodreads.com/user/show/24175715
Unique User Found: https://www.goodreads.com/user/show/64297599-fahad-ahammed
Unique User Found: https://www.goodreads.com/user/show/47956390-rakibul-dolon
Unique User Found: https://www.goodreads.com/user/show/57452924-niloy-gourh
Unique User Found: https://www.goodreads.com/user/show/107646496-afrin-sultana-moutusi
Unique User Found: https://www.goodreads.com/user/show/127384375-amjad-hossain
Unique User Found: https://www.goodreads.com/user/show/115105316-john-milton
Unique User Found: https://www.goodreads.com/user/show/64441420
Unique User Found: https://www.goodreads.com/user/show/131284579-sa796
Unique User Found: https://www.goodreads.com/user/show/136257679-klinton-saha
Unique User Foun

In [1]:
import pandas as pd

# Load the two CSV files into DataFrames
bengali_users_df = pd.read_csv('bengali_users.csv')
final_user_url_df = pd.read_csv('final_user_url.csv')

# Merge the DataFrames
# Use `pd.concat` to combine them row-wise since both likely contain URLs.
merged_df = pd.concat([bengali_users_df, final_user_url_df], ignore_index=True)

# Drop duplicate URLs if needed (to ensure uniqueness)
merged_df = merged_df.drop_duplicates()

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('bengali_final_user.csv', index=False)

print(f"Successfully merged files. Total unique URLs: {len(merged_df)}")
print("Merged data saved to 'bengali_final_user.csv'.")


Successfully merged files. Total unique URLs: 18044
Merged data saved to 'bengali_final_user.csv'.


In [2]:
import pandas as pd

# Read the final_book_url.csv file
input_file = "bengali_final_user.csv"  # Replace with your input CSV file name
output_file = "bengali_final_user.csv"  # Output file for unique book URLs

# Load the CSV into a DataFrame
df = pd.read_csv(input_file)

# Ensure the 'Book_URL' column has unique values by using drop_duplicates
df_unique = df.drop_duplicates(subset=["URLs"])

# Save the DataFrame with unique URLs to a new CSV file
df_unique.to_csv(output_file, index=False)

# Print the number of unique book URLs
print(f"Total unique book URLs: {len(df_unique)}")
print(df_unique)

Total unique book URLs: 9385
                                                   URLs  \
0     https://www.goodreads.com/user/show/61070444-s...   
1     https://www.goodreads.com/user/show/2736147-ri...   
2     https://www.goodreads.com/user/show/107928784-...   
3          https://www.goodreads.com/user/show/24175715   
4     https://www.goodreads.com/user/show/64297599-f...   
...                                                 ...   
9380  https://www.goodreads.com/user/show/22190014-t...   
9381  https://www.goodreads.com/user/show/104537430-...   
9382     https://www.goodreads.com/user/show/5974362-db   
9383  https://www.goodreads.com/user/show/51582156-r...   
9384                                                NaN   

                                            User URLs  
0                                                 NaN  
1                                                 NaN  
2                                                 NaN  
3                                     