# FBN Scraping 

## Converting From PDF to Text

In [57]:
#Importing the needed library to convert PDF to text
import pdfplumber

#extracting the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: The extracted text of all pages joined with no separator. A page
        for which no text is extracted contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Guard against extract_text() yielding None for a page (e.g. a page
        # with no text layer): `or ''` keeps the join from raising TypeError.
        return ''.join(page.extract_text() or '' for page in pdf.pages)

# Path of the issue PDF to process.
# NOTE: this is an absolute path on one specific Windows machine; point it at
# wherever the issue PDF lives before running the notebook elsewhere.
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\22.1.pdf"

# Text of the whole PDF (all pages concatenated); used by the cells below
extracted_text = extract_text_from_pdf(pdf_path)


In [58]:
# Save the extracted text as a UTF-8 text file in the working directory
with open('Issue_Text.txt', mode='w', encoding='utf-8') as text_file:
    text_file.write(extracted_text)

## ISBNs

### Parsing ISBNs

In [59]:
#extracting ISBNs from thext 
#importing the librart
import re

#defining the regex used 
def extract_isbns(text):
    """Find ISBN-like strings in ``text``.

    The pattern has two alternatives: a number written with dash / en-dash /
    space separators, or an unseparated run of ten characters (nine digits
    plus a digit or X), each optionally preceded by ``978``.

    Args:
        text: The string to search.

    Returns:
        list[tuple[str, str, str, str]]: One tuple of the pattern's four
        capture groups per match, as produced by ``findall``:
        ``(separated_isbn, its_978_prefix, unseparated_isbn, its_978_prefix)``.
        Groups that did not take part in a match are empty strings.
    """
    matcher = re.compile(r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])")
    return matcher.findall(text)

# Extract ISBNs from the example text
isbn_matches = extract_isbns(extracted_text)

# Each match is a tuple of regex groups: index 0 holds an ISBN written with
# separators, index 2 one written as an unseparated run (only one of the two
# is non-empty for a given match). Taking index 0 alone turned every
# unseparated ISBN into an empty string, so fall back to index 2.
isbns_found = [match[0] or match[2] for match in isbn_matches]

print("Cleaned ISBNs:", isbns_found)


Cleaned ISBNs: ['0-06-251426-1', '1-883523-30-3', '1-883523-32-X', '0-934971-66-8', '0-934971-67-6', '0-375-40747-2', '0-517-70666-0', '0-609-60411-2', '0-609-80262-3', '0-609-80416-2', '0-609-80435-9', '0-609-80316-6', '0-446-67221-1', '0-395-95637-4', '0-395-97771-1', '1-892514-15-X', '0-88974-086-0', '0-8263-1843-6', '1-896095-18-6', '0-671-79388-8', '0-393-02749-X', '0-609-80384-0', '0-465-08364-1', '0-395-85010-X', '1-58005-013-1', '0-316-28526-9', '1-56341-100-8', '15-974-8985 x', '0-465-02485-8', '1-56280-239-9', '1-56280-238-0', '1-56280-240-2', '0-89239-157-X', '0-89239-159-6', '0-500-28104-1', '0-89239-158-8', '0-7611-1360-6', '0-85170-666-5', '0-85170-665-7', '1-57806-132-6', '0-415-06700-6', '1-55670-888-2', '0-500-28098-3', '0-262-19409-0', '0-393-73027-1', '3-908247-02-0', '1-55595-156-2', '0-14-056219-2', '0-698-11774-3', '0-8037-2446-2', '0-399-23141-2', '1-55861-217-3', '0-8037-2326-1', '1-55861-197-5', '0-88776-385-5', '1-55861-199-1', '1-55861-201-7', '0-374-33551-0'

### ISBN Dataframe

In [60]:
#creating a dataframe that has ISBNs and the page numbers where found 

#importing necessary libaries
import pdfplumber
import pandas as pd
import re

def extract_isbns_from_text(text):
    """Return every ISBN-like match found in ``text``.

    Args:
        text: The string to search.

    Returns:
        list[tuple[str, str, str, str]]: One tuple per match holding the
        pattern's four capture groups. Index 0 is a separator-delimited ISBN
        and index 2 an unseparated one; unused groups are empty strings.
    """
    # Same ISBN pattern as before: separated form | unseparated form
    pattern = r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])"
    return re.findall(pattern, text)

def extract_text_from_pdf(pdf_path):
    """Build a per-page table of the PDF's text and the ISBNs found on each page.

    NOTE: this re-uses the name of the earlier ``extract_text_from_pdf`` (which
    returns one string); once this cell runs, the name refers to this version.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        pandas.DataFrame: One row per page with the columns ``PageNumber``
        (1-based), ``Text`` (the page's extracted text, ``''`` when none was
        extracted) and ``ISBNs`` (list of ISBN strings matched on that page).
    """
    data = {'PageNumber': [], 'Text': [], 'ISBNs': []}
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Guard against extract_text() yielding None, which would make
            # the regex search below raise TypeError.
            text = page.extract_text() or ''

            data['PageNumber'].append(page_number)
            data['Text'].append(text)

            # Each match is a tuple of regex groups: index 0 is an ISBN with
            # separators, index 2 one without. Using index 0 alone recorded
            # unseparated ISBNs as '', so fall back to index 2.
            matches = extract_isbns_from_text(text)
            data['ISBNs'].append([match[0] or match[2] for match in matches])

    return pd.DataFrame(data)

# Path of the issue PDF to process.
# NOTE: absolute path on one specific Windows machine; adjust before running elsewhere.
pdf_path = r"C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\Issues\22.1.pdf"
# One row per PDF page: PageNumber, Text, ISBNs
df = extract_text_from_pdf(pdf_path)


In [61]:
# Preview the first rows of the per-page DataFrame
df.head()

Unnamed: 0,PageNumber,Text,ISBNs
0,1,Feminist Bookstore News\nSource: Reveal Digita...,[]
1,2,"Гетілізі\nВооК<іюге\nГОМТ МОРРУ, ЈЕТАММА ЛНІЎ\...",[]
2,3,BLACK ANGEL CARDS\nEarthlyn Manuel\n0-06-25161...,[0-06-251426-1]
3,4,Feminist Bookstore News\nAs we go to press...\...,[]
4,5,"Feminist Bookstore News\nbattles, its case wil...",[]


### Downloading List of ISBNs and Page Numbers

In [62]:
# Save the per-page table to CSV without the index column.
# Change the issue number in the file name when processing a different issue.
df.to_csv('22.1_text.csv', index=False)

## API

In [None]:
#calling libraries
import csv
import os
from getpass import getpass

import requests

# ISBNdb API key. It is read from the ISBNDB_API_KEY environment variable, or
# prompted for when that variable is not set, so the secret is never stored in
# the notebook itself.
api_key = os.environ.get('ISBNDB_API_KEY') or getpass('ISBNdb API key: ')
isbns_to_process = isbns_found  # Replace with the original list of ISBNs you want to process

# Construct the base URL
base_url = 'https://api2.isbndb.com/book/'

# The same Authorization header is sent with every request
headers = {'Authorization': api_key}

# Seconds to wait for a response before the ISBN is recorded as an error;
# without a timeout a stalled connection would hang the cell indefinitely.
request_timeout = 30

# Create a CSV file and write headers
with open('book_information.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['ISBN', 'Title', 'Author', 'Publisher', 'Pages', 'Date Published', 'Binding', 'Synopsis', 'Language', 'Edition', 'Dimensions', 'MSRP', 'Image', 'Status']
    # restval='' leaves every column that a row does not set empty, so error
    # rows only need to provide the ISBN and the status.
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval='')

    writer.writeheader()

    # Make requests for each ISBN in the list
    for isbn in isbns_to_process:
        url = f'{base_url}{isbn}'

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            # `or {}` covers a response whose 'book' entry is missing or null
            book_info = (response.json().get('book') or {}) if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as exc:
            # A network failure or an unparsable body affects this ISBN only:
            # record it as an error and carry on with the remaining ISBNs.
            print(f"Error for ISBN {isbn}: {exc}")
            writer.writerow({'ISBN': isbn, 'Status': 'Error'})
            continue

        if book_info is None:
            print(f"Error for ISBN {isbn}: {response.status_code}")

            # Write a row with the ISBN and the error status; other fields stay empty
            writer.writerow({'ISBN': isbn, 'Status': 'Error'})
            continue

        # Write information to CSV
        writer.writerow({
            'ISBN': isbn,
            'Title': book_info.get('title', ''),
            # `or []` keeps join() from failing when 'authors' is present but null
            'Author': ', '.join(book_info.get('authors') or []),
            'Publisher': book_info.get('publisher', ''),
            'Pages': book_info.get('pages', ''),
            'Date Published': book_info.get('date_published', ''),
            'Binding': book_info.get('binding', ''),
            'Synopsis': book_info.get('synopsis', ''),
            'Language': book_info.get('language', ''),
            'Edition': book_info.get('edition', ''),
            'Dimensions': book_info.get('dimensions', ''),
            'MSRP': book_info.get('msrp', ''),
            'Image': book_info.get('image', ''),
            'Status': 'Success'
        })

print("CSV file created successfully.")

Error for ISBN 15-974-8985 x: 404
Error for ISBN 0-374-33551-0: 404
Error for ISBN 0-699-89181-1: 404
Error for ISBN 0-77101-438-X: 404
Error for ISBN 0-934278-98-7: 404
Error for ISBN 0-892-92288-2: 404
Error for ISBN 0-466-67366-8: 404
Error for ISBN 800-626- 41-3: 404
Error for ISBN 1-56947-131-7: 404


## Joining Dataframes

In [None]:
import pandas as pd

# Rename the 'ISBNs' column to 'ISBN', the column name used as the join key later on
df = df.rename(columns={'ISBNs': 'ISBN'})

In [None]:
# Keep only the page number and the ISBN list of each page.
# .copy() gives `page` its own data, so the column assignments in the next
# cells modify `page` alone and do not trigger pandas' SettingWithCopyWarning.
page = df[['PageNumber', 'ISBN']].copy()
page.head()

In [None]:
# Turn each ISBN list into a plain comma-separated string by removing the list
# brackets and quote characters from its string form.
# regex=True is required: in pandas 2.0+ str.replace treats the pattern as a
# literal string by default, so the character class would match nothing.
# The raw string avoids the invalid "\[" / "\]" escapes of a normal literal.
page['ISBN'] = page['ISBN'].astype(str).str.replace(r"[\[\]'\"]", '', regex=True)
page.head()

In [None]:
# Break each comma-separated ISBN string into a list ...
isbn_lists = page['ISBN'].str.split(', ')
page['ISBN'] = isbn_lists

# ... then give every ISBN in a list a row of its own
page = page.explode('ISBN')

# Show the first ten rows of the result
page.head(10)

In [50]:
import pandas as pd

# Specify the file path
# NOTE: absolute path on one specific Windows machine; adjust before running elsewhere.
file_path = r'C:\Users\salom\OneDrive\Área de Trabalho\FBN New\FBN\notebooks\book_information.csv'

# Read the CSV file into a DataFrame.
# ISBN is an identifier, not a quantity: force it to be read as text so an
# all-digit ISBN is not parsed as a number (which would drop leading zeros and
# stop it from matching the string ISBNs in `page` when the tables are merged).
info = pd.read_csv(file_path, dtype={'ISBN': str})

info

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status
0,0-06-251426-1,Divine Daughters: Liberating the Power and Pas...,"Bagby, Rachel L.",Harper San Francisco,288.0,1999,Hardcover,"An extraordinary vocal artist, storyteller, an...",en,First Edition,"Height: 8 Inches, Length: 5.75 Inches, Weight:...",22.00,https://images.isbndb.com/covers/42/64/9780062...,Success
1,1-883523-30-3,Conferences Are Murder: The Fourth Lindsay Gor...,Val McDermid,Spinsters Ink,236.0,1999,Paperback,,en,Reprint,"Height: 8.5 Inches, Length: 5.5 Inches, Weight...",12.00,https://images.isbndb.com/covers/33/05/9781883...,Success
2,1-883523-32-X,Sugarland,"Rodgers, Joni",Spinster Ink,346.0,1999-01-01,Paperback,"In Texas, Two Married Women Who In Their Youth...",en,First Edition,"Height: 8.75 Inches, Length: 5.5 Inches, Weigh...",0.00,https://images.isbndb.com/covers/33/29/9781883...,Success
3,0-934971-66-8,"End Of The Class War, The","Brady, Catherine",Calyx Books,241.0,1999-06-15,Paperback,<p>...the author succeeds in breathing new lif...,en,First Edition,"Height: 9 Inches, Length: 6 Inches, Weight: 1 ...",13.95,https://images.isbndb.com/covers/16/69/9780934...,Success
4,0-934971-67-6,"End Of The Class War, The","Brady, Catherine",Calyx Books,239.0,1999,Hardcover,A Collection Of Stories On The Irish Immigrant...,en,,"Height: 9.2 Inches, Length: 6.3 Inches, Weight...",27.95,https://images.isbndb.com/covers/16/76/9780934...,Success
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,0-671-01386-6,,,,,,,,,,,,,Error
436,0-671-00957-5,,,,,,,,,,,,,Error
437,0-275-96297-0,,,,,,,,,,,,,Error
438,0-7879-4513-7,,,,,,,,,,,,,Error


In [51]:
# Attach page numbers to the looked-up books: left join on ISBN, so every row
# of `info` is kept and repeated once per matching row of `page`
merge = info.merge(page, how='left', on='ISBN')
merge

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status,PageNumber
0,0-06-251426-1,Divine Daughters: Liberating the Power and Pas...,"Bagby, Rachel L.",Harper San Francisco,288.0,1999,Hardcover,"An extraordinary vocal artist, storyteller, an...",en,First Edition,"Height: 8 Inches, Length: 5.75 Inches, Weight:...",22.0,https://images.isbndb.com/covers/42/64/9780062...,Success,3
1,0-06-251426-1,Divine Daughters: Liberating the Power and Pas...,"Bagby, Rachel L.",Harper San Francisco,288.0,1999,Hardcover,"An extraordinary vocal artist, storyteller, an...",en,First Edition,"Height: 8 Inches, Length: 5.75 Inches, Weight:...",22.0,https://images.isbndb.com/covers/42/64/9780062...,Success,99
2,1-883523-30-3,Conferences Are Murder: The Fourth Lindsay Gor...,Val McDermid,Spinsters Ink,236.0,1999,Paperback,,en,Reprint,"Height: 8.5 Inches, Length: 5.5 Inches, Weight...",12.0,https://images.isbndb.com/covers/33/05/9781883...,Success,7
3,1-883523-30-3,Conferences Are Murder: The Fourth Lindsay Gor...,Val McDermid,Spinsters Ink,236.0,1999,Paperback,,en,Reprint,"Height: 8.5 Inches, Length: 5.5 Inches, Weight...",12.0,https://images.isbndb.com/covers/33/05/9781883...,Success,55
4,1-883523-32-X,Sugarland,"Rodgers, Joni",Spinster Ink,346.0,1999-01-01,Paperback,"In Texas, Two Married Women Who In Their Youth...",en,First Edition,"Height: 8.75 Inches, Length: 5.5 Inches, Weigh...",0.0,https://images.isbndb.com/covers/33/29/9781883...,Success,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,0-671-01386-6,,,,,,,,,,,,,Error,110
462,0-671-00957-5,,,,,,,,,,,,,Error,110
463,0-275-96297-0,,,,,,,,,,,,,Error,112
464,0-7879-4513-7,,,,,,,,,,,,,Error,113


In [52]:
# Drop rows that have no Title AND whose ISBN value has one of these digit
# layouts (presumably phone numbers, ZIP codes and similar — not real ISBNs)
non_isbn_pattern = r'^\d{1}-\d{3}-\d{3}-\d{4}$|^\d{3}-\d{3}-\d{4}$|^\d{5}-\d{4}$|^\d{5}$|^\d{6}$'
title_missing = merge['Title'].isna()
looks_non_isbn = merge['ISBN'].str.contains(non_isbn_pattern)
merge = merge[~(title_missing & looks_non_isbn)]

# Display the updated DataFrame
merge

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status,PageNumber
0,0-06-251426-1,Divine Daughters: Liberating the Power and Pas...,"Bagby, Rachel L.",Harper San Francisco,288.0,1999,Hardcover,"An extraordinary vocal artist, storyteller, an...",en,First Edition,"Height: 8 Inches, Length: 5.75 Inches, Weight:...",22.0,https://images.isbndb.com/covers/42/64/9780062...,Success,3
1,0-06-251426-1,Divine Daughters: Liberating the Power and Pas...,"Bagby, Rachel L.",Harper San Francisco,288.0,1999,Hardcover,"An extraordinary vocal artist, storyteller, an...",en,First Edition,"Height: 8 Inches, Length: 5.75 Inches, Weight:...",22.0,https://images.isbndb.com/covers/42/64/9780062...,Success,99
2,1-883523-30-3,Conferences Are Murder: The Fourth Lindsay Gor...,Val McDermid,Spinsters Ink,236.0,1999,Paperback,,en,Reprint,"Height: 8.5 Inches, Length: 5.5 Inches, Weight...",12.0,https://images.isbndb.com/covers/33/05/9781883...,Success,7
3,1-883523-30-3,Conferences Are Murder: The Fourth Lindsay Gor...,Val McDermid,Spinsters Ink,236.0,1999,Paperback,,en,Reprint,"Height: 8.5 Inches, Length: 5.5 Inches, Weight...",12.0,https://images.isbndb.com/covers/33/05/9781883...,Success,55
4,1-883523-32-X,Sugarland,"Rodgers, Joni",Spinster Ink,346.0,1999-01-01,Paperback,"In Texas, Two Married Women Who In Their Youth...",en,First Edition,"Height: 8.75 Inches, Length: 5.5 Inches, Weigh...",0.0,https://images.isbndb.com/covers/33/29/9781883...,Success,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,0-671-01386-6,,,,,,,,,,,,,Error,110
462,0-671-00957-5,,,,,,,,,,,,,Error,110
463,0-275-96297-0,,,,,,,,,,,,,Error,112
464,0-7879-4513-7,,,,,,,,,,,,,Error,113


In [53]:
import pandas as pd
# Load a previously saved booklist CSV.
# NOTE(review): this absolute, machine-specific path points at issue 20.4, while
# the PDF processed earlier in this notebook is issue 22.1 — confirm that this
# is the intended input file.
duplicate = pd.read_csv(r"C:\Users\salom\OneDrive\Área de Trabalho\FBN\data\20.4\20.4_booklist.csv")
duplicate

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status,PageNumber
0,0-87286-332-8,Close to the Machine: Technophilia and Its Dis...,"Ullman, Ellen",City Lights Publishers,189.0,1/1/2001,Paperback,"Ullman Tries To Balance Her Life, Close To The...",en,(2nd),"Height: 8 Inches, Length: 5 Inches, Weight: 0....",12.95,https://images.isbndb.com/covers/33/23/9780872...,Success,31.0
1,1-880913-16-X,Raising Peaceful Children In A Violent World,"Cecil, Nancy Lee, Roberts, Patricia L.",Publishing/Editing Network,253.0,1997,Paperback,,en,,"Height: 10.25 Inches, Length: 7.25 Inches, Wei...",16.95,https://images.isbndb.com/covers/31/61/9781880...,Success,42.0
2,1-55074-113-6,The Science Book for Girls: and Other Intellig...,"Wyatt, Valerie",Kids Can Press,80.0,8/1/1993,Paperback,<p>Studies show that many girls' interest in s...,en,,"Height: 9.5 Inches, Length: 8.5 Inches, Weight...",1.99,https://images.isbndb.com/covers/11/31/9781550...,Success,42.0
3,0-670-87367-5,Monkey Bridge,"Cao, Lan",Viking Adult,260.0,7/1/1997,Hardcover,A Memoir Of A Vietnamese Refugee. The Narrator...,en,,"Height: 9.25 Inches, Length: 5.75 Inches, Weig...",23.95,https://images.isbndb.com/covers/36/78/9780670...,Success,42.0
4,0-452-27621-7,Storming Heaven's Gate: An Anthology of Spirit...,"Vecchione, Patricia, Sumrall, Amber Coverdale",Plume,496.0,7/1/1997,Paperback,In This Powerful Collection Of Modern Prose An...,en,First Plume Printed Edition,"Height: 8.25 Inches, Length: 5.5 Inches, Weigh...",14.95,https://images.isbndb.com/covers/62/15/9780452...,Success,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,0-804-11558-3,,,,,,,,,,,,,Error,138.0
697,0-553-57715-8,,,,,,,,,,,,,Error,138.0
698,0-553-57489-2,,,,,,,,,,,,,Error,138.0
699,0-385-48692-8,,,,,,,,,,,,,Error,138.0


In [54]:
# Keep only the first row for each ISBN; later rows with the same ISBN are dropped
duplicate = duplicate.drop_duplicates(subset='ISBN', keep='first')
duplicate

Unnamed: 0,ISBN,Title,Author,Publisher,Pages,Date Published,Binding,Synopsis,Language,Edition,Dimensions,MSRP,Image,Status,PageNumber
0,0-87286-332-8,Close to the Machine: Technophilia and Its Dis...,"Ullman, Ellen",City Lights Publishers,189.0,1/1/2001,Paperback,"Ullman Tries To Balance Her Life, Close To The...",en,(2nd),"Height: 8 Inches, Length: 5 Inches, Weight: 0....",12.95,https://images.isbndb.com/covers/33/23/9780872...,Success,31.0
1,1-880913-16-X,Raising Peaceful Children In A Violent World,"Cecil, Nancy Lee, Roberts, Patricia L.",Publishing/Editing Network,253.0,1997,Paperback,,en,,"Height: 10.25 Inches, Length: 7.25 Inches, Wei...",16.95,https://images.isbndb.com/covers/31/61/9781880...,Success,42.0
2,1-55074-113-6,The Science Book for Girls: and Other Intellig...,"Wyatt, Valerie",Kids Can Press,80.0,8/1/1993,Paperback,<p>Studies show that many girls' interest in s...,en,,"Height: 9.5 Inches, Length: 8.5 Inches, Weight...",1.99,https://images.isbndb.com/covers/11/31/9781550...,Success,42.0
3,0-670-87367-5,Monkey Bridge,"Cao, Lan",Viking Adult,260.0,7/1/1997,Hardcover,A Memoir Of A Vietnamese Refugee. The Narrator...,en,,"Height: 9.25 Inches, Length: 5.75 Inches, Weig...",23.95,https://images.isbndb.com/covers/36/78/9780670...,Success,42.0
4,0-452-27621-7,Storming Heaven's Gate: An Anthology of Spirit...,"Vecchione, Patricia, Sumrall, Amber Coverdale",Plume,496.0,7/1/1997,Paperback,In This Powerful Collection Of Modern Prose An...,en,First Plume Printed Edition,"Height: 8.25 Inches, Length: 5.5 Inches, Weigh...",14.95,https://images.isbndb.com/covers/62/15/9780452...,Success,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,0-804-11558-3,,,,,,,,,,,,,Error,138.0
697,0-553-57715-8,,,,,,,,,,,,,Error,138.0
698,0-553-57489-2,,,,,,,,,,,,,Error,138.0
699,0-385-48692-8,,,,,,,,,,,,,Error,138.0


In [56]:
# Write the de-duplicated booklist to CSV without the index column.
# NOTE(review): `duplicate` was loaded from the 20.4 booklist file but is saved
# under a 22.1 file name, and the filtered `merge` frame is not written by any
# visible cell — confirm which frame / issue this export is meant to contain.
duplicate.to_csv('22.1_booklist.csv', index=False)