# FBN Scraping 

## Converting From PDF to Text

In [1]:
#Importing the needed library to convert PDF to text
import pdfplumber

#extracting the PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r"C:\Users\SalomeGrasland\Desktop\FBN\FBN PDFs-20240319T133812Z-001\FBN PDFs\Volume 21\21.5.pdf"

#variable containing text called extracted_text
extracted_text = extract_text_from_pdf(pdf_path)


In [None]:
#Downloading the text file 

with open('Issue_Text.txt', 'w', encoding='utf-8') as file:
    file.write(extracted_text)

## ISBNS

### Parsing ISBNs

In [3]:
#extracting ISBNs from thext 
#importing the librart
import re

#defining the regex used 
def extract_isbns(text):
    # Define a regex pattern to match strings of numbers with dashes
    isbn_pattern = r'\b\d{1,5}-?\d{1,7}-?\d{1,7}-?\d{1,7}-?\w\b'

    # Find all matches of the ISBN pattern in the text
    isbns = re.findall(isbn_pattern, text)

    return isbns

# Extract ISBNs from the example text
isbns_found = extract_isbns(extracted_text)

# Print the list of ISBNs
print("ISBNs found:", isbns_found)

ISBNs found: ['01-01-1999', '28036388', '0741-6555', '94110', '882554', '94188-2554', '415-642-9993', '415-642-9995', '882554', '94188', '94110', '0-9659521-3-4', '1-883523-28-1', '0-9653800-3-3', '1-800-593', '1-800-948', '27334', '87125', '01010', '0-88961-233-1', '1-800-626-4330', '1-800-565-9523', '10005', '21303', '212-650-7925', '10031', '02130', '617-524-0415', '1-800-243-0138', '1-800-626-4330', '1-800-334-3892', '415-642-9993', '202-462-7924', '213-237-7321', '90053', '11590', '516-338-6312', '516-333-0689', '800-637-0037', '94037', '650-728-1783', '02116', '617-262-6969', '44110', '1-896705-14-6', '565-9253', '60607', '1-55670-888-2', '0-689-81071-7', '0-689-71798-9', '0-14-055782-2', '0-670-86733-0', '0-689-81593-X', '0-02-747838-6', '0307128733', '0-6060-2973-7', '0-803-71040-2', '0-698114-36-1', '0-688-12533-6', '0-689-80668-X', '0-940975-21-1', '0-027090-35-3', '0-679-89445-4', '0-30-716167-6', '0-6060-8410-X', '0694005908', '0-531071-02-2', '0-152012-88-5', '0-7868-1306-

### ISBN Dataframe

In [4]:
#creating a dataframe that has ISBNs and the page numbers where found 

#importing necessary libaries
import pdfplumber
import pandas as pd
import re

def extract_isbns_from_text(text):
    # Define the ISBN regex pattern
    isbn_pattern = re.compile(r'\b\d{1,5}-?\d{1,7}-?\d{1,7}-?\d{1,7}-?\w\b')

    # Find all ISBNs in the text
    isbns_found = isbn_pattern.findall(text)
    return isbns_found

def extract_text_from_pdf(pdf_path):
    data = {'PageNumber': [], 'Text': [], 'ISBNs': []}
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            # Add page number and corresponding text to the data dictionary
            data['PageNumber'].append(i)
            text = page.extract_text()
            data['Text'].append(text)
            
            # Extract ISBNs from the text
            isbns_found = extract_isbns_from_text(text)
            data['ISBNs'].append(isbns_found)

    return pd.DataFrame(data)

# Replace 'your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = r'C:\Users\SalomeGrasland\Desktop\FBN\FBN PDFs-20240319T133812Z-001\FBN PDFs\Volume 21\21.5.pdf'
df = extract_text_from_pdf(pdf_path)


In [5]:
df.head()

Unnamed: 0,PageNumber,Text,ISBNs
0,1,Feminist Bookstore News\nSource: Reveal Digita...,"[01-01-1999, 28036388]"
1,2,Feminist\nBookstore\nNews RivAlzhti\nGET READY...,[]
2,3,This content downloaded from\n(cid:0)(cid:0)(c...,[]
3,4,Feminist Bookstore News\nItis truly amazing ho...,[]
4,5,Feminist Bookstore News\nPaison. (“That’s FEH ...,"[0741-6555, 94110, 882554, 94188-2554]"


### Downloading ISBN Dataframe

In [6]:
df.to_csv('21.5_text.csv', index=False)

## API

In [7]:
#calling libraries
import requests
import csv

#api key
api_key = '52084_9ddf62aeaa4a39b5485d9b7fb69dd5a8'
isbns_to_process = isbns_found  # Replace with the original list of ISBNs you want to process

# Construct the base URL
base_url = 'https://api2.isbndb.com/book/'

# Create a CSV file and write headers
with open('book_information.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['ISBN', 'Title', 'Author', 'Publisher', 'Pages', 'Date Published', 'Binding', 'Synopsis', 'Language', 'Edition', 'Dimensions', 'MSRP', 'Image', 'Status']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()

    # Make requests for each ISBN in the list
    for isbn in isbns_to_process:
        url = f'{base_url}{isbn}'
        headers = {'Authorization': api_key}

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            book_info = response.json().get('book', {})
            
            # Write information to CSV
            writer.writerow({
                'ISBN': isbn,
                'Title': book_info.get('title', ''),
                'Author': ', '.join(book_info.get('authors', [])),
                'Publisher': book_info.get('publisher', ''),
                'Pages': book_info.get('pages', ''),
                'Date Published': book_info.get('date_published', ''),
                'Binding': book_info.get('binding', ''),
                'Synopsis': book_info.get('synopsis', ''),
                'Language': book_info.get('language', ''),
                'Edition': book_info.get('edition', ''),
                'Dimensions': book_info.get('dimensions', ''),
                'MSRP': book_info.get('msrp', ''),
                'Image': book_info.get('image', ''),
                'Status': 'Success'
            })
        else:
            print(f"Error for ISBN {isbn}: {response.status_code}")

            # Write a row with the ISBN, an indication of the error status, and empty fields for other information
            writer.writerow({
                'ISBN': isbn,
                'Status': 'Error',
                'Title': '',
                'Author': '',
                'Publisher': '',
                'Pages': '',
                'Date Published': '',
                'Binding': '',
                'Synopsis': '',
                'Language': '',
                'Edition': '',
                'Dimensions': '',
                'MSRP': '',
                'Image': ''
            })

print("CSV file created successfully.")

Error for ISBN 01-01-1999: 404
Error for ISBN 28036388: 404
Error for ISBN 0741-6555: 404
Error for ISBN 94110: 404
Error for ISBN 882554: 404
Error for ISBN 94188-2554: 404
Error for ISBN 415-642-9993: 404
Error for ISBN 415-642-9995: 404
Error for ISBN 882554: 404
Error for ISBN 94188: 404
Error for ISBN 94110: 404
Error for ISBN 1-800-593: 404
Error for ISBN 1-800-948: 404
Error for ISBN 27334: 404
Error for ISBN 87125: 404
Error for ISBN 01010: 404
Error for ISBN 1-800-626-4330: 404
Error for ISBN 1-800-565-9523: 404
Error for ISBN 10005: 404
Error for ISBN 21303: 404
Error for ISBN 212-650-7925: 404
Error for ISBN 10031: 404
Error for ISBN 02130: 404
Error for ISBN 617-524-0415: 404
Error for ISBN 1-800-243-0138: 404
Error for ISBN 1-800-626-4330: 404
Error for ISBN 1-800-334-3892: 404
Error for ISBN 415-642-9993: 404
Error for ISBN 202-462-7924: 404
Error for ISBN 213-237-7321: 404
Error for ISBN 90053: 404
Error for ISBN 11590: 404
Error for ISBN 516-338-6312: 404
Error for ISBN

In [8]:
import pandas as pd

# Assuming 'ISBNs' is the current column name and 'ISBN' is the desired column name
df.rename(columns={'ISBNs': 'ISBN'}, inplace=True)

In [9]:
page = df[['PageNumber', 'ISBN']]
page.head()

Unnamed: 0,PageNumber,ISBN
0,1,"[01-01-1999, 28036388]"
1,2,[]
2,3,[]
3,4,[]
4,5,"[0741-6555, 94110, 882554, 94188-2554]"


In [10]:
page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')
page.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['ISBN'] = page['ISBN'].astype(str).str.replace('[\[\]\'\"]', '')


Unnamed: 0,PageNumber,ISBN
0,1,"['01-01-1999', '28036388']"
1,2,[]
2,3,[]
3,4,[]
4,5,"['0741-6555', '94110', '882554', '94188-2554']"


In [11]:
# Split values on comma and explode to create new rows
page['ISBN'] = page['ISBN'].str.split(', ')
page = page.explode('ISBN')

# Display the updated DataFrame
page.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['ISBN'] = page['ISBN'].str.split(', ')


Unnamed: 0,PageNumber,ISBN
0,1,['01-01-1999'
0,1,'28036388']
1,2,[]
2,3,[]
3,4,[]
4,5,['0741-6555'
4,5,'94110'
4,5,'882554'
4,5,'94188-2554']
5,6,['415-642-9993'


In [12]:
import pandas as pd

# Specify the file path
file_path = r'C:\Users\salom\OneDrive\Área de Trabalho\FBN-\data\21.5_book_information.csv'

# Read the CSV file into a DataFrame
info = pd.read_csv(file_path)

info

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\salom\\OneDrive\\Área de Trabalho\\FBN-\\data\\21.5_book_information.csv'

In [None]:
merge = pd.merge(info, page, on='ISBN', how='left')
merge

In [None]:
# Assuming 'merge' is your DataFrame
# Drop rows with NaN in 'Title' and specific ISBN formats
merge = merge[~(merge['Title'].isna() & merge['ISBN'].str.contains(r'^\d{1}-\d{3}-\d{3}-\d{4}$|^\d{3}-\d{3}-\d{4}$|^\d{5}-\d{4}$|^\d{5}$|^\d{6}$'))]

# Display the updated DataFrame
merge

In [None]:
import pandas as pd
duplicate = pd.read_csv(r"C:\Users\salom\OneDrive\Área de Trabalho\FBN\data\20.4\20.4_booklist.csv")
duplicate

In [None]:
duplicate = duplicate.drop_duplicates(subset=['ISBN'])
duplicate

In [None]:
# Assuming 'merge' is your DataFrame
duplicate.to_csv('20.4_booklist.csv', index=False)