# FBN Scraping 

## Converting From PDF to Text

In [1]:
#Importing the needed library to convert PDF to text
import pdfplumber

#extracting the PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\21.1.pdf"

#variable containing text called extracted_text
extracted_text = extract_text_from_pdf(pdf_path)


In [2]:
#Downloading the text file 

with open('21.1_Issue_Text.txt', 'w', encoding='utf-8') as file:
    file.write(extracted_text)

## ISBNS

### Parsing ISBNs

In [3]:
#extracting ISBNs from thext 
#importing the library
import re

#defining the regex used 
def extract_isbns(text):
    # Define a regex pattern to match strings of numbers with dashes
    isbn_pattern = r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])"

    # Find all matches of the ISBN pattern in the text
    isbns = re.findall(isbn_pattern, text)

    return isbns

# Extract ISBNs from the example text
isbns_found = extract_isbns(extracted_text)

# Extract only the portions with numbers and dashes
isbns_found = [isbn[0] for isbn in isbns_found]

#print("Cleaned ISBNs:", isbns_found)


### ISBN Dataframe

In [4]:
#creating a dataframe that has ISBNs and the page numbers where found 

#importing necessary libaries
import pdfplumber
import pandas as pd
import re

def extract_isbns_from_text(text):
    # Define the ISBN regex pattern
    isbn_pattern = re.compile(r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])")

    # Find all ISBNs in the text
    isbns_found = isbn_pattern.findall(text)
    return isbns_found

def extract_text_from_pdf(pdf_path):
    data = {'PageNumber': [], 'Text': [], 'ISBNs': []}
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            # Add page number and corresponding text to the data dictionary
            data['PageNumber'].append(i)
            text = page.extract_text()
            data['Text'].append(text)
            
            # Extract ISBNs from the text
            isbns_found = extract_isbns_from_text(text)
            isbns_found = [isbn[0] for isbn in isbns_found]
            data['ISBNs'].append(isbns_found)

    return pd.DataFrame(data)

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\21.1.pdf"
df = extract_text_from_pdf(pdf_path)


In [5]:
df.head()

Unnamed: 0,PageNumber,Text,ISBNs
0,1,Feminist Bookstore News\nSource: Reveal Digita...,[]
1,2,Feminist\nBookstore\nNews HAli\nvé\nA lot of d...,[]
2,3,“Harriet Lerner pioneers on behalf of womens\n...,[]
3,4,Feminist Bookstore News\nWelcome to.FBN’s Spri...,[]
4,5,"Feminist Bookstore News\n55% discount, postage...",[]


### Downloading List of ISBNs and Page Numbers

In [6]:
#change issue number
df.to_csv('21.1_text.csv', index=False)

## API

In [None]:
import requests
import csv
import time

api_key = '52084_9ddf62aeaa4a39b5485d9b7fb69dd5a8'
isbns_to_process = isbns_found  # Replace with the original list of ISBNs you want to process


#Removing Duplicates
isbns_to_process = list(set(isbns_to_process))

base_url = 'https://api2.isbndb.com/book/'

with open('book_information.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['ISBN', 'Title', 'Author', 'Publisher', 'Pages', 'Date Published', 'Subjects', 'Binding', 'Synopsis', 'Language', 'Edition', 'Dimensions', 'MSRP', 'Image', 'Status']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for isbn in isbns_to_process:
        url = f'{base_url}{isbn}'
        headers = {'Authorization': api_key}

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            book_info = response.json().get('book', {})
            writer.writerow({
                'ISBN': isbn,
                'Title': book_info.get('title', ''),
                'Author': ', '.join(book_info.get('authors', [])),
                'Publisher': book_info.get('publisher', ''),
                'Pages': book_info.get('pages', ''),
                'Date Published': book_info.get('date_published', ''),
                'Binding': book_info.get('binding', ''),
                'Synopsis': book_info.get('synopsis', ''),
                'Language': book_info.get('language', ''),
                'Edition': book_info.get('edition', ''),
                'Dimensions': book_info.get('dimensions', ''),
                'MSRP': book_info.get('msrp', ''),
                'Image': book_info.get('image', ''),
                'Status': 'Success'
            })
        else:
            print(f"Error for ISBN {isbn}: {response.status_code}")
            writer.writerow({
                'ISBN': isbn,
                'Status': 'Error',
                'Title': '',
                'Author': '',
                'Publisher': '',
                'Pages': '',
                'Date Published': '',
                'Binding': '',
                'Synopsis': '',
                'Language': '',
                'Edition': '',
                'Dimensions': '',
                'MSRP': '',
                'Image': ''
            })

        time.sleep(1)  # Pause for 1 second before making the next API call

print("CSV file created successfully.")

Error for ISBN 95 1-889330-1: 404
Error for ISBN 95 1-878610-6: 404
Error for ISBN 95 0-930100-7: 404
Error for ISBN 00 1-886913-1: 404
Error for ISBN 95 1-883513-0: 404
Error for ISBN 8 1-800-243-0: 404
Error for ISBN 72 3- 94 16 0: 404
Error for ISBN 1162 202-62 0: 404
Error for ISBN 192 9- 6268-7: 404
Error for ISBN 29 -0 52 6- 6: 404
Error for ISBN 5 -95 01 31 5: 404
Error for ISBN 0 90 62-268-7: 404
Error for ISBN 0-880000-60-1: 404
Error for ISBN 0-689-00449-4: 404
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN 0-485-80345-3: 404
Error for ISBN 0-384-80344-5: 404
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN : 403
Error for ISBN 0-689-00462-1: 404
Error for ISBN 1-56280-224-0: 404
Error for ISBN 1-896705-24-3: 404
Error for ISBN 0-921411-77-4: 404
Error for ISBN 19-884-0710 x: 404
Error for ISBN : 403
Error for ISBN 15-974-8985 x: 404
Error for ISBN 1-56280-225-9: 404
Error for ISBN 1-56280-224-0:

## Joining Dataframes

### Renaming to ISBN

In [None]:
import pandas as pd

# Assuming 'ISBNs' is the current column name and 'ISBN' is the desired column name
df.rename(columns={'ISBNs': 'ISBN'}, inplace=True)

### Adding Page Numbers

In [None]:
page = df[['PageNumber', 'ISBN']]
page.head()

In [None]:
page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')
page.head()

In [None]:
# Split values on comma and explode to create new rows
page['ISBN'] = page['ISBN'].str.split(', ')
page = page.explode('ISBN')

# Display the updated DataFrame
page.head()

### Opening Scraped Dataset

In [None]:


# Specify the file path
file_path = r'C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\notebooks\book_information.csv'

# Read the CSV file into a DataFrame
info = pd.read_csv(file_path)

info.head()

### Joining it With Pages

In [None]:
merge = pd.merge(info, page, on='ISBN', how='left')
merge

### Adding a Column For Issue and Volume 

In [None]:
merge['Volume & Issue'] = '21.1'

### Removing Duplicates

In [None]:
# Remove all duplicate rows from the DataFrame
merge = merge.drop_duplicates(keep=False)

# Reset the index after dropping duplicates
merge.reset_index(drop=True, inplace=True)

### Downloading Dataframe

In [None]:
# Assuming 'merge' is your DataFrame
merge.to_csv('21.1_booklist.csv', index=False)