# Using Python to Scrape ISBNs from a Text Document

## Generating a text document with random words, numbers, and ISBNs

In [5]:
import random
from faker import Faker

# Build a small sample document that mixes ISBNs with other text (words,
# zip codes, dates) so the ISBN extraction further down has distractors.

# Initialize Faker. No seed is set here, so the generated values differ
# between runs; call Faker.seed(<int>) before this line for repeatable output.
fake = Faker()

# Generate random words
words = [fake.word() for _ in range(10)]

# Generate random ISBNs (13-digit form)
isbns = [fake.isbn13() for _ in range(10)]

# Generate random zip codes and dates
zip_codes = [fake.zipcode() for _ in range(10)]
dates = [fake.date() for _ in range(10)]

# Combine lists into a single string with newlines.
# Section headers start with "\n" so each section is preceded by a blank line.
random_text = "\n".join([
    "Random Words:",
    *words,
    "\nRandom ISBNs:",
    *isbns,
    "\nRandom Zip Codes:",
    *zip_codes,
    "\nRandom Dates:",
    *dates
])

# Write to a text file.
# The text built above is `random_text`; the previous `content` name was never
# defined in this notebook and raised NameError on a fresh kernel.
with open("random_data.txt", "w") as file:
    file.write(random_text)

print("File created: random_data.txt")

File created: random_data.txt


## Parsing ISBNs

#### Using RegEx we can distinguish the ISBNs from the random text

In [8]:
# Extracting ISBNs from the text
#importing the library
import re

#defining the regex used 
def extract_isbns(text):
    """Find ISBN-like substrings in ``text``.

    The pattern has two alternatives:

    * a separated form: an optional ``978`` prefix plus separator, then digits
      mixed with hyphens, en dashes or spaces, ending in a separator followed
      by a digit or ``x``/``X``;
    * a compact form: an optional ``978`` followed by nine digits and a final
      digit or ``X``/``x``, with no separators.

    The pattern contains four capture groups, so ``findall`` yields one
    4-tuple per match rather than a plain string: index 0 holds the full
    separated-form match, index 2 holds the full compact-form match, and
    groups that did not take part in the match are empty strings.

    Args:
        text: The string to search.

    Returns:
        A list of 4-tuples of strings, one per match, in order of appearance.
    """
    pattern = re.compile(
        r"((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])"
    )
    return pattern.findall(text)

# Extract ISBNs from the example text.
# extract_isbns returns one tuple of regex capture groups per match.
isbn_matches = extract_isbns(random_text)

# Keep the full matched ISBN from each tuple.
# Index 0 is the separated form (e.g. 978-1-72445-406-5) and index 2 is the
# compact form without separators; exactly one of them is non-empty per match.
# Taking only index 0 would turn every compact-form ISBN into an empty string.
isbns_found = [isbn[0] or isbn[2] for isbn in isbn_matches]

print("Cleaned ISBNs:", isbns_found)

Cleaned ISBNs: ['978-1-72445-406-5', '978-1-5137-1207-9', '978-0-309-00547-0', '978-0-10-759455-8', '978-0-7177-4640-8', '978-0-621-37483-4', '978-0-360-97199-8', '978-1-352-75516-9', '978-1-116-25368-9', '978-0-491-65269-8']
