# FBN Scraping 

## Converting From PDF to Text

In [1]:
#Importing the needed library to convert PDF to text
import pdfplumber

#extracting the PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\22.3.4.pdf"

#variable containing text called extracted_text
extracted_text = extract_text_from_pdf(pdf_path)


In [2]:
#Downloading the text file 

with open('Issue_Text.txt', 'w', encoding='utf-8') as file:
    file.write(extracted_text)

## ISBNS

### Parsing ISBNs

In [3]:
#extracting ISBNs from thext 
#importing the library
import re

#defining the regex used 
def extract_isbns(text):
    # Define a regex pattern to match strings of numbers with dashes
    isbn_pattern = r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])"

    # Find all matches of the ISBN pattern in the text
    isbns = re.findall(isbn_pattern, text)

    return isbns

# Extract ISBNs from the example text
isbns_found = extract_isbns(extracted_text)

# Extract only the portions with numbers and dashes
isbns_found = [isbn[0] for isbn in isbns_found]

#print("Cleaned ISBNs:", isbns_found)


### ISBN Dataframe

In [4]:
#creating a dataframe that has ISBNs and the page numbers where found 

#importing necessary libaries
import pdfplumber
import pandas as pd
import re

def extract_isbns_from_text(text):
    # Define the ISBN regex pattern
    isbn_pattern = re.compile(r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])")

    # Find all ISBNs in the text
    isbns_found = isbn_pattern.findall(text)
    return isbns_found

def extract_text_from_pdf(pdf_path):
    data = {'PageNumber': [], 'Text': [], 'ISBNs': []}
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            # Add page number and corresponding text to the data dictionary
            data['PageNumber'].append(i)
            text = page.extract_text()
            data['Text'].append(text)
            
            # Extract ISBNs from the text
            isbns_found = extract_isbns_from_text(text)
            isbns_found = [isbn[0] for isbn in isbns_found]
            data['ISBNs'].append(isbns_found)

    return pd.DataFrame(data)

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\22.3.4.pdf"
df = extract_text_from_pdf(pdf_path)


In [5]:
df.head()

Unnamed: 0,PageNumber,Text,ISBNs
0,1,Feminist Bookstore News\nSource: Reveal Digita...,[]
1,2,Feminist\nBookstore\nNews IAs\nuSN vA n e\nN `...,[]
2,3,Z PRESS GANG\nZ\n(A PUBLISHING\nALLIANCE\nWork...,"[1-896095-55-0, 0-88974-061-5]"
3,4,Feminist Bookstore News\nIt’s difficult being ...,[]
4,5,Feminist Bookstore News\nThis new format could...,[]


### Downloading List of ISBNs and Page Numbers

In [6]:
#change issue number
df.to_csv('22.3.4_text.csv', index=False)

## API

In [7]:
import requests
import csv
import time

api_key = '52084_9ddf62aeaa4a39b5485d9b7fb69dd5a8'
isbns_to_process = isbns_found  # Replace with the original list of ISBNs you want to process

base_url = 'https://api2.isbndb.com/book/'

with open('book_information.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['ISBN', 'Title', 'Author', 'Publisher', 'Pages', 'Date Published', 'Binding', 'Synopsis', 'Language', 'Edition', 'Dimensions', 'MSRP', 'Image', 'Status']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for isbn in isbns_to_process:
        url = f'{base_url}{isbn}'
        headers = {'Authorization': api_key}

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            book_info = response.json().get('book', {})
            writer.writerow({
                'ISBN': isbn,
                'Title': book_info.get('title', ''),
                'Author': ', '.join(book_info.get('authors', [])),
                'Publisher': book_info.get('publisher', ''),
                'Pages': book_info.get('pages', ''),
                'Date Published': book_info.get('date_published', ''),
                'Binding': book_info.get('binding', ''),
                'Synopsis': book_info.get('synopsis', ''),
                'Language': book_info.get('language', ''),
                'Edition': book_info.get('edition', ''),
                'Dimensions': book_info.get('dimensions', ''),
                'MSRP': book_info.get('msrp', ''),
                'Image': book_info.get('image', ''),
                'Status': 'Success'
            })
        else:
            print(f"Error for ISBN {isbn}: {response.status_code}")
            writer.writerow({
                'ISBN': isbn,
                'Status': 'Error',
                'Title': '',
                'Author': '',
                'Publisher': '',
                'Pages': '',
                'Date Published': '',
                'Binding': '',
                'Synopsis': '',
                'Language': '',
                'Edition': '',
                'Dimensions': '',
                'MSRP': '',
                'Image': ''
            })

        time.sleep(1)  # Pause for 1 second before making the next API call

print("CSV file created successfully.")

Error for ISBN 1- -5 56 63 3: 404
Error for ISBN 41 1- -1 11 1: 404
Error for ISBN 1-892514-54-0: 404
Error for ISBN 0-609-80476-6: 404
Error for ISBN 11 11 -0 70 2: 404
Error for ISBN 27-0-11-489-9: 404
Error for ISBN 27-0-11-489-9: 404
Error for ISBN 03853 1-888-6: 404
Error for ISBN 15-974-8985 x: 404
Error for ISBN : 403
Error for ISBN 95 1-896764-1: 404
Error for ISBN 8 49 -0 793 1: 404
Error for ISBN 999 1-55583-5: 404
Error for ISBN 1-55701-086-9: 404
Error for ISBN 1-882723-01-8: 404
Error for ISBN 0-9682959-2-0: 404
Error for ISBN 0-8020-1571-4: 404
Error for ISBN 0-8143-2822-2: 404
Error for ISBN 0-380-80902-8: 403
Error for ISBN 0-8109-4381-6: 403
Error for ISBN 0-8109-4107-4: 403
Error for ISBN 0-380-79402-0: 403
Error for ISBN 0-380-97648-X: 403
Error for ISBN 0-380-97703-6: 403
Error for ISBN 0-380-81130-8: 403
Error for ISBN 0-380-97698-6: 403
Error for ISBN 0-380-80318-6: 403
Error for ISBN 0-380-80632-0: 403
Error for ISBN 0-375-40560-7: 403
Error for ISBN 0-345-42348-

## Joining Dataframes

### Renaming to ISBN

In [8]:
import pandas as pd

# Assuming 'ISBNs' is the current column name and 'ISBN' is the desired column name
df.rename(columns={'ISBNs': 'ISBN'}, inplace=True)

### Adding Page Numbers

In [9]:
page = df[['PageNumber', 'ISBN']]
page.head()

Unnamed: 0,PageNumber,ISBN
0,1,[]
1,2,[]
2,3,"[1-896095-55-0, 0-88974-061-5]"
3,4,[]
4,5,[]


In [10]:
page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')
page.head()

  page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')


Unnamed: 0,PageNumber,ISBN
0,1,
1,2,
2,3,"1-896095-55-0, 0-88974-061-5"
3,4,
4,5,


In [11]:
# Split values on comma and explode to create new rows
page['ISBN'] = page['ISBN'].str.split(', ')
page = page.explode('ISBN')

# Display the updated DataFrame
page.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['ISBN'] = page['ISBN'].str.split(', ')


Unnamed: 0,PageNumber,ISBN
0,1,
1,2,
2,3,1-896095-55-0
2,3,0-88974-061-5
3,4,


### Opening Scraped Dataset

In [12]:


# Specify the file path
file_path = r'C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\notebooks\book_information.csv'

# Read the CSV file into a DataFrame
info = pd.read_csv(file_path)

info.head()

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status
0,1-896095-55-0,Between Gardens,"Graham Chudley, Carol, Field, Dorothy","Raincoast Books, Polestar",240.0,2002-05-29,Paperback,When good friends and avid gardeners Dorothy F...,en,First Edition,"Height: 8.25 Inches, Length: 6.25 Inches, Weig...",18.95,https://images.isbndb.com/covers/55/54/9781896...,Success
1,0-88974-061-5,Sojourners and Sundogs: First Nations Fiction,"Maracle, Lee","Raincoast Books, Press Gang Publishers",352.0,2002-05-08,Paperback,Stories About Modern Indians In Canada. The St...,en,,"Height: 7.75 Inches, Length: 5.75 Inches, Weig...",16.95,https://images.isbndb.com/covers/06/17/9780889...,Success
2,1- -5 56 63 3,,,,,,,,,,,,,Error
3,41 1- -1 11 1,,,,,,,,,,,,,Error
4,1-56341-116-4,This Is What Lesbian Looks Like: Dyke Activist...,,Firebrand Books,256.0,1999-11-01,Paperback,Twenty-six Lesbian Grassroots Activists -- Som...,en,First Edition,"Height: 8.5 Inches, Length: 5.75 Inches, Weigh...",18.95,https://images.isbndb.com/covers/11/68/9781563...,Success


### Joining it With Pages

In [13]:
merge = pd.merge(info, page, on='ISBN', how='left')
merge

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status,PageNumber
0,1-896095-55-0,Between Gardens,"Graham Chudley, Carol, Field, Dorothy","Raincoast Books, Polestar",240.0,2002-05-29,Paperback,When good friends and avid gardeners Dorothy F...,en,First Edition,"Height: 8.25 Inches, Length: 6.25 Inches, Weig...",18.95,https://images.isbndb.com/covers/55/54/9781896...,Success,3.0
1,1-896095-55-0,Between Gardens,"Graham Chudley, Carol, Field, Dorothy","Raincoast Books, Polestar",240.0,2002-05-29,Paperback,When good friends and avid gardeners Dorothy F...,en,First Edition,"Height: 8.25 Inches, Length: 6.25 Inches, Weig...",18.95,https://images.isbndb.com/covers/55/54/9781896...,Success,99.0
2,0-88974-061-5,Sojourners and Sundogs: First Nations Fiction,"Maracle, Lee","Raincoast Books, Press Gang Publishers",352.0,2002-05-08,Paperback,Stories About Modern Indians In Canada. The St...,en,,"Height: 7.75 Inches, Length: 5.75 Inches, Weig...",16.95,https://images.isbndb.com/covers/06/17/9780889...,Success,3.0
3,0-88974-061-5,Sojourners and Sundogs: First Nations Fiction,"Maracle, Lee","Raincoast Books, Press Gang Publishers",352.0,2002-05-08,Paperback,Stories About Modern Indians In Canada. The St...,en,,"Height: 7.75 Inches, Length: 5.75 Inches, Weig...",16.95,https://images.isbndb.com/covers/06/17/9780889...,Success,100.0
4,1- -5 56 63 3,,,,,,,,,,,,,Error,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0-06-251630-2,,,,,,,,,,,,,Error,160.0
708,0-06-251630-2,,,,,,,,,,,,,Error,162.0
709,0-06-251554-3,,,,,,,,,,,,,Error,141.0
710,0-06-251554-3,,,,,,,,,,,,,Error,160.0


In [14]:
duplicate = pd.read_csv(r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\notebooks\22.3.4_booklist.csv")
duplicate

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\salom\\OneDrive\\Área de Trabalho\\FBN New\\FBN\\notebooks\\22.3.4_booklist.csv'

In [None]:
duplicate = duplicate.drop_duplicates(subset=['ISBN'])
duplicate

### Adding a Column For Issue and Volume 

In [None]:
duplicate['Volume & Issue'] = '22.3.4'

### Downloading Dataframe

In [None]:
# Assuming 'merge' is your DataFrame
duplicate.to_csv('22.3.4_booklist.csv', index=False)