# Q2: Parallel Web Scraping

### Step0: Import all dependencies needed

In [3]:
import boto3
import json
import requests
import dataset
import re
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

### Step1: Set up RDS instance

In [4]:
rds_client = boto3.client('rds')
iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='LabRole')

# Identifier and master credentials of the database instance. They are bound
# up front so that later cells can use them even when the instance already
# exists and the create call below is rejected.
# NOTE(review): the credentials are hardcoded and are also copied into the
# Lambda environment further down; prefer os.environ / getpass or a secrets
# manager.
DBID = 'relational-db'
USERNAME = 'username'
PASSWORD = 'password'

# Create the RDS instance (or reuse an existing one), then look up its endpoint
try:
    try:
        response = rds_client.create_db_instance(
            DBInstanceIdentifier=DBID,
            DBName='books',
            MasterUsername=USERNAME,
            MasterUserPassword=PASSWORD,
            DBInstanceClass='db.t3.micro',
            Engine='MySQL',
            AllocatedStorage=5
        )
        print("RDS instance created successfully:", response)
    except rds_client.exceptions.DBInstanceAlreadyExistsFault:
        # Re-running the cell must not leave ENDPOINT/PORT undefined
        print("RDS instance", DBID, "already exists; reusing it.")

    # Wait until DB is available to continue
    rds_client.get_waiter('db_instance_available').wait(DBInstanceIdentifier=DBID)

    # Describe *this* instance only; an unfiltered call returns every instance
    # in the account/region and [0] is not guaranteed to be ours.
    db = rds_client.describe_db_instances(DBInstanceIdentifier=DBID)['DBInstances'][0]
    ENDPOINT = db['Endpoint']['Address']
    PORT = db['Endpoint']['Port']

    print(DBID,
        "is available at", ENDPOINT,
        "on Port", PORT,
        )
except Exception as e:
    print("Error creating RDS instance:", e)

RDS instance created successfully: {'DBInstance': {'DBInstanceIdentifier': 'relational-db', 'DBInstanceClass': 'db.t3.micro', 'Engine': 'mysql', 'DBInstanceStatus': 'creating', 'MasterUsername': 'username', 'DBName': 'books', 'AllocatedStorage': 5, 'PreferredBackupWindow': '05:56-06:26', 'BackupRetentionPeriod': 1, 'DBSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-0ebcfd71f93ad1599', 'Status': 'active'}], 'DBParameterGroups': [{'DBParameterGroupName': 'default.mysql8.0', 'ParameterApplyStatus': 'in-sync'}], 'DBSubnetGroup': {'DBSubnetGroupName': 'default', 'DBSubnetGroupDescription': 'default', 'VpcId': 'vpc-03fe4a0e60b95c95e', 'SubnetGroupStatus': 'Complete', 'Subnets': [{'SubnetIdentifier': 'subnet-06c5f1ed345b03ce5', 'SubnetAvailabilityZone': {'Name': 'us-east-1d'}, 'SubnetOutpost': {}, 'SubnetStatus': 'Active'}, {'SubnetIdentifier': 'subnet-023920fe191952e92', 'SubnetAvailabilityZone': {'Name': 'us-east-1a'}, 'SubnetOutpost': {}, 'SubnetStatus': 'Active'}, {'

In [5]:
# ID of the first VPC security group attached to the RDS instance
# (the variable is called SGNAME, but the value stored is the group ID)
SGNAME = db['VpcSecurityGroups'][0]['VpcSecurityGroupId']

# The client is created before the try block because the except clause below
# refers to `ec2`; if it were bound inside the try and client creation failed,
# evaluating the handler would raise NameError and hide the real error.
ec2 = boto3.client('ec2')

# Open the database port on that security group so the instance is reachable.
# If the rule is already present, just report that.
# NOTE(review): 0.0.0.0/0 allows connections from any address; restrict the
# CIDR range for anything beyond a throwaway lab database.
try:
    data = ec2.authorize_security_group_ingress(
            GroupId=SGNAME,
            IpPermissions=[
                {'IpProtocol': 'tcp',
                 'FromPort': PORT,
                 'ToPort': PORT,
                 'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}
            ]
    )
    print("Permissions adjusted successfully.")

except ec2.exceptions.ClientError as e:
    if e.response["Error"]["Code"] == 'InvalidPermission.Duplicate':
        print("Permissions already adjusted.")
    else:
        print(e)

Permissions already adjusted.


### Step2: Get list of books to scrape

In [6]:
# Connection URL handed to the dataset package (MySQL via mysqlconnector)
db_url = f'mysql+mysqlconnector://{USERNAME}:{PASSWORD}@{ENDPOINT}:{PORT}/books'

# Open the database handle; stop the cell if that is not possible
try:
    db = dataset.connect(db_url)
except Exception as e:
    print("Failed to connect to the database:", e)
    exit(1)

# Root page of the site whose catalogue is scraped
base_url = 'http://books.toscrape.com/'


def scrape_books(html_soup, url):
    """Store every book listed on one catalogue page in the ``books`` table.

    Args:
        html_soup: Parsed page; each ``article.product_pod`` element is
            treated as one book.
        url: Address the page was fetched from, used to resolve the book's
            relative link.

    The book id is the element at index 2 of the absolute URL's path split
    on ``/``. Each book is upserted on ``book_id`` with the current time as
    ``last_seen``; a failing upsert is printed and the loop continues.
    """
    for pod in html_soup.select('article.product_pod'):
        # Only the link is needed at this stage
        link = pod.find('h3').find('a')
        absolute_url = urljoin(url, link.get('href'))
        segments = urlparse(absolute_url).path.split('/')
        book_id = segments[2]
        # upsert: update the row matching book_id, insert it when absent
        try:
            row = {'book_id': book_id, 'last_seen': datetime.now()}
            db['books'].upsert(row, ['book_id'])
        except Exception as e:
            print(f"Failed to upsert book {book_id}:", e)
        

def robust_request(url, max_retries=5, delay=5, timeout=30):
    """GET ``url``, retrying on request errors.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts.
        delay: Seconds to sleep between two attempts.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection counts as a failed attempt instead of blocking
            indefinitely.

    Returns:
        The response of the first attempt that succeeds and passes
        ``raise_for_status()``, or ``None`` when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Sleeping only makes sense when another attempt follows
            if attempt < max_retries - 1:
                time.sleep(delay)
    print("Failed to fetch data after several attempts.")
    return None


# Walk the catalogue page by page, following the "next" link until there is none
url = base_url
inp = input('Do you wish to re-scrape the catalogue (y/n)? ')
while inp == 'y':
    print('Now scraping page:', url)
    r = robust_request(url)
    if r is None:
        # All retries failed, so stop crawling
        break
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_books(html_soup, url)
    # Look for the pagination link to the following page
    next_a = html_soup.select('li.next > a')
    next_href = next_a[0].get('href') if next_a else None
    if not next_href:
        break
    url = urljoin(url, next_href)
    

Now scraping page: http://books.toscrape.com/
Now scraping page: http://books.toscrape.com/catalogue/page-2.html
Now scraping page: http://books.toscrape.com/catalogue/page-3.html
Now scraping page: http://books.toscrape.com/catalogue/page-4.html
Now scraping page: http://books.toscrape.com/catalogue/page-5.html
Now scraping page: http://books.toscrape.com/catalogue/page-6.html
Now scraping page: http://books.toscrape.com/catalogue/page-7.html
Now scraping page: http://books.toscrape.com/catalogue/page-8.html
Now scraping page: http://books.toscrape.com/catalogue/page-9.html
Now scraping page: http://books.toscrape.com/catalogue/page-10.html
Now scraping page: http://books.toscrape.com/catalogue/page-11.html
Now scraping page: http://books.toscrape.com/catalogue/page-12.html
Now scraping page: http://books.toscrape.com/catalogue/page-13.html
Now scraping page: http://books.toscrape.com/catalogue/page-14.html
Now scraping page: http://books.toscrape.com/catalogue/page-15.html
Now scrapi

### Step3: Deploy Lambda Functions

In [7]:
aws_lambda = boto3.client('lambda')

# Read the Lambda deployment package
lambda_zip_path = 'q2-deployment-package.zip'
try:
    with open(lambda_zip_path, 'rb') as f:
        lambda_zip = f.read()
except FileNotFoundError:
    print(f"Failed to find the file {lambda_zip_path}. Ensure it's in the correct directory.")
    exit(1)

# Database connection settings handed to the function as environment variables
# NOTE(review): the API responses printed below echo these variables,
# including DB_PASSWORD, into the notebook output; clear or redact the output
# before sharing.
env_variables = {
    'Variables': {
        'DB_ENDPOINT': ENDPOINT,
        'DB_USER': USERNAME,
        'DB_PASSWORD': PASSWORD,
        'DB_PORT': str(PORT),
    }
}

# Create the Lambda function, or refresh code and configuration if it exists
try:
    response = aws_lambda.create_function(
        FunctionName='a2q2_lambda',
        Runtime='python3.9',
        Role=role['Role']['Arn'],
        Handler='lambda_function.lambda_handler',
        Code=dict(ZipFile=lambda_zip),
        Timeout=300,
        Environment=env_variables  # Add environment variables here
    )
    print("Lambda function created successfully:", response)
except aws_lambda.exceptions.ResourceConflictException:
    # Update the function code
    response = aws_lambda.update_function_code(
        FunctionName='a2q2_lambda',
        ZipFile=lambda_zip
    )
    print("Lambda function updated successfully:", response)

    # A code update is applied asynchronously; changing the configuration
    # while it is still in progress is rejected with ResourceConflictException,
    # so block until the update has finished.
    aws_lambda.get_waiter('function_updated').wait(FunctionName='a2q2_lambda')

    # Update the environment variables if needed
    response = aws_lambda.update_function_configuration(
        FunctionName='a2q2_lambda',
        Environment=env_variables
    )
    print("Lambda function configuration updated successfully:", response)

# Set the concurrency configuration
try:
    response = aws_lambda.put_function_concurrency(
        FunctionName='a2q2_lambda',
        ReservedConcurrentExecutions=10
    )
    print("Function concurrency set successfully:", response)
except Exception as e:
    print(f"Failed to set function concurrency: {e}")

Lambda function created successfully: {'ResponseMetadata': {'RequestId': '67be9178-9b94-48b3-9fe1-398748d1de03', 'HTTPStatusCode': 201, 'HTTPHeaders': {'date': 'Thu, 02 May 2024 21:44:45 GMT', 'content-type': 'application/json', 'content-length': '1475', 'connection': 'keep-alive', 'x-amzn-requestid': '67be9178-9b94-48b3-9fe1-398748d1de03'}, 'RetryAttempts': 0}, 'FunctionName': 'a2q2_lambda', 'FunctionArn': 'arn:aws:lambda:us-east-1:102168828713:function:a2q2_lambda', 'Runtime': 'python3.9', 'Role': 'arn:aws:iam::102168828713:role/LabRole', 'Handler': 'lambda_function.lambda_handler', 'CodeSize': 45449856, 'Description': '', 'Timeout': 300, 'MemorySize': 128, 'LastModified': '2024-05-02T21:44:44.877+0000', 'CodeSha256': 'mLHwLeE8Sm7Mvdsyc6eXje349HMsGRua0VbuyL5nVu4=', 'Version': '$LATEST', 'Environment': {'Variables': {'DB_PORT': '3306', 'DB_ENDPOINT': 'relational-db.cfxyluhsb7bh.us-east-1.rds.amazonaws.com', 'DB_USER': 'username', 'DB_PASSWORD': 'password'}}, 'TracingConfig': {'Mode': 

### Step4: Set up Step Functions

In [8]:
def make_def(lambda_arn, max_concurrency=10):
    """Build the state machine definition that fans a list out to a Lambda.

    The definition consists of a single ``Map`` state. For every element of
    the execution input it runs one ``Lambda Invoke`` task that passes the
    element as the payload and keeps only the ``Payload`` part of the result.
    Listed Lambda/task errors are retried up to 6 times, starting at a
    2-second interval with a backoff rate of 2.

    Args:
        lambda_arn: ARN of the Lambda function to invoke for each element.
        max_concurrency: Value of the Map state's ``MaxConcurrency``
            (defaults to 10).

    Returns:
        The definition as a plain ``dict``, ready for ``json.dumps``.
    """
    retry_policy = {
        "ErrorEquals": [
            "Lambda.ServiceException",
            "Lambda.AWSLambdaException",
            "Lambda.SdkClientException",
            "Lambda.TooManyRequestsException",
            "States.TaskFailed",
            "Lambda.Unknown"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
    }

    invoke_task = {
        "Type": "Task",
        "Resource": "arn:aws:states:::lambda:invoke",
        "OutputPath": "$.Payload",
        "Parameters": {
            "Payload.$": "$",
            "FunctionName": lambda_arn
        },
        "Retry": [retry_policy],
        "End": True
    }

    definition = {
        "Comment": "Q2 State Machine",
        "StartAt": "Map",
        "States": {
            "Map": {
                "Type": "Map",
                "End": True,
                "MaxConcurrency": max_concurrency,
                "Iterator": {
                    "StartAt": "Lambda Invoke",
                    "States": {
                        "Lambda Invoke": invoke_task
                    }
                }
            }
        }
    }
    return definition

if __name__ == '__main__':
    iam = boto3.client('iam')
    sfn = boto3.client('stepfunctions')
    aws_lambda = boto3.client('lambda')
    role = iam.get_role(RoleName='LabRole')

    lambda_function_name = "a2q2_lambda"

    # Look the function up directly by name (it must already exist).
    # list_functions() returns results in pages, so scanning a single
    # response can miss the function when the account has many of them.
    lambda_arn = aws_lambda.get_function(
        FunctionName=lambda_function_name
    )['Configuration']['FunctionArn']

    # Throttle concurrent executions to 10
    response = aws_lambda.put_function_concurrency(
            FunctionName=lambda_function_name,
            ReservedConcurrentExecutions=10
        )

    sfn_function_name = "a2q2_stepfunctions"

    # Use Lambda ARN to create State Machine Definition
    sf_def = make_def(lambda_arn)

    # Create the state machine, or update it in place when one with this name
    # already exists. state_machine_arn is bound on both paths.
    try:
        response = sfn.create_state_machine(
            name=sfn_function_name,
            definition=json.dumps(sf_def),
            roleArn=role['Role']['Arn'],
            type='EXPRESS'
        )
        state_machine_arn = response['stateMachineArn']
    except sfn.exceptions.StateMachineAlreadyExists:
        # Walk every page of the listing rather than only the first one
        state_machine_arn = [sm['stateMachineArn']
                             for page in sfn.get_paginator('list_state_machines').paginate()
                             for sm in page['stateMachines']
                             if sm['name'] == sfn_function_name][0]
        response = sfn.update_state_machine(
            stateMachineArn=state_machine_arn,
            definition=json.dumps(sf_def),
            roleArn=role['Role']['Arn']
        )

In [9]:
# Open a database handle for reading the list of books
try:
    db = dataset.connect(db_url)
except Exception as e:
    print(f"Failed to connect to database: {e}")
    exit(1)

# Find the ARN of the state machine whose name matches sfn_function_name
response = sfn.list_state_machines()
state_machine_arn = list(
    sm['stateMachineArn']
    for sm in response['stateMachines']
    if sm['name'] == sfn_function_name
)[0]

# Build the execution input: one {"book_id": ...} object per row of 'books'
try:
    records = list(db['books'].find())
    data = []
    for record in records:
        data.append({'book_id': record['book_id']})
    json_input = json.dumps(data)
    print("Input JSON prepared successfully:", json_input)
except Exception as e:
    print(f"Error preparing input data: {e}")
    exit(1)

Input JSON prepared successfully: [{"book_id": "a-light-in-the-attic_1000"}, {"book_id": "tipping-the-velvet_999"}, {"book_id": "soumission_998"}, {"book_id": "sharp-objects_997"}, {"book_id": "sapiens-a-brief-history-of-humankind_996"}, {"book_id": "the-requiem-red_995"}, {"book_id": "the-dirty-little-secrets-of-getting-your-dream-job_994"}, {"book_id": "the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993"}, {"book_id": "the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992"}, {"book_id": "the-black-maria_991"}, {"book_id": "starving-hearts-triangular-trade-trilogy-1_990"}, {"book_id": "shakespeares-sonnets_989"}, {"book_id": "set-me-free_988"}, {"book_id": "scott-pilgrims-precious-little-life-scott-pilgrim-1_987"}, {"book_id": "rip-it-up-and-start-again_986"}, {"book_id": "our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985"}, {"book_id": "olio_984"}, {"book_id": "mesaer

### Step5: Trigger Step Functions

In [10]:
try:
    # Start synchronous execution of the state machine
    response = sfn.start_sync_execution(
        stateMachineArn=state_machine_arn,
        input=json_input
    )
    print("State machine execution response:", response)

    if response['status'] == 'SUCCEEDED':
        print("Execution succeeded!")
    elif response['status'] == 'FAILED':
        print("Execution failed!")
        # Surface the failure details when the response carries them
        print("Error:", response.get('error'), "Cause:", response.get('cause'))
    else:
        print("Execution status:", response['status'])

# The client's exceptions namespace has no `SfnException`; looking that name
# up raises AttributeError while the except clause is evaluated, which would
# replace the real error. ClientError is the base class for service errors.
except sfn.exceptions.ClientError as e:
    print(f"An error occurred while executing the state machine: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

State machine execution response: {'executionArn': 'arn:aws:states:us-east-1:102168828713:express:a2q2_stepfunctions:118970a9-cb45-42ab-b1ee-7602652ccd86:33ef24b7-be4e-4762-9208-ba29253a3d59', 'stateMachineArn': 'arn:aws:states:us-east-1:102168828713:stateMachine:a2q2_stepfunctions', 'name': '118970a9-cb45-42ab-b1ee-7602652ccd86', 'startDate': datetime.datetime(2024, 5, 2, 16, 44, 58, 758000, tzinfo=tzlocal()), 'stopDate': datetime.datetime(2024, 5, 2, 16, 45, 48, 676000, tzinfo=tzlocal()), 'status': 'SUCCEEDED', 'input': '[{"book_id": "a-light-in-the-attic_1000"}, {"book_id": "tipping-the-velvet_999"}, {"book_id": "soumission_998"}, {"book_id": "sharp-objects_997"}, {"book_id": "sapiens-a-brief-history-of-humankind_996"}, {"book_id": "the-requiem-red_995"}, {"book_id": "the-dirty-little-secrets-of-getting-your-dream-job_994"}, {"book_id": "the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993"}, {"book_id": "the-boys-in-the-boat-nine-americans-and-t

### Step6: Check the scraped data

In [11]:
# NOTE(review): the `dataset` package does not appear to export a
# `DatabaseError` name (TODO confirm against the installed version). An
# `except dataset.DatabaseError` clause would then raise AttributeError while
# handling any failure and hide the original error, so only names known to
# exist are used in the handlers here.
try:
    # Connect to the database
    db = dataset.connect(db_url)
except Exception as e:
    print("Database connection failed:", e)
else:
    try:
        tables = db.tables
        print("Tables in the database:", tables)
    except Exception as e:
        print("An unexpected error occurred:", e)


Tables in the database: ['book_info', 'books']


In [12]:
try:
    # Ask the 'books' table for at most ten rows
    table = db['books']
    records = table.find(_limit=10)

    # Print each row and remember whether anything came back at all
    records_found = False
    for record in records:
        print(record)
        records_found = True

    if records_found is False:
        print("No records found in the 'books' table.")

except Exception as e:
    print("An error occurred while fetching records:", e)

OrderedDict([('id', 1), ('book_id', 'a-light-in-the-attic_1000'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 2), ('book_id', 'tipping-the-velvet_999'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 3), ('book_id', 'soumission_998'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 4), ('book_id', 'sharp-objects_997'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 5), ('book_id', 'sapiens-a-brief-history-of-humankind_996'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 3))])
OrderedDict([('id', 6), ('book_id', 'the-requiem-red_995'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 7), ('book_id', 'the-dirty-little-secrets-of-getting-your-dream-job_994'), ('last_seen', datetime.datetime(2024, 5, 2, 21, 45, 5))])
OrderedDict([('id', 8), ('book_id', 'the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-wo

### Ending: Delete RDS instance

In [16]:
# Tear down the RDS instance identified by DBID; no final snapshot is taken
try:
    response = rds_client.delete_db_instance(DBInstanceIdentifier=DBID, SkipFinalSnapshot=True)
    print("Deletion initiated:", response)
except Exception as e:
    print("Error deleting RDS instance:", e)


Deletion initiated: {'DBInstance': {'DBInstanceIdentifier': 'relational-db', 'DBInstanceClass': 'db.t3.micro', 'Engine': 'mysql', 'DBInstanceStatus': 'deleting', 'MasterUsername': 'username', 'DBName': 'books', 'Endpoint': {'Address': 'relational-db.cfxyluhsb7bh.us-east-1.rds.amazonaws.com', 'Port': 3306, 'HostedZoneId': 'Z2R2ITUGPM61AM'}, 'AllocatedStorage': 5, 'InstanceCreateTime': datetime.datetime(2024, 5, 2, 16, 26, 37, 832000, tzinfo=tzutc()), 'PreferredBackupWindow': '09:01-09:31', 'BackupRetentionPeriod': 1, 'DBSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-0ebcfd71f93ad1599', 'Status': 'active'}], 'DBParameterGroups': [{'DBParameterGroupName': 'default.mysql8.0', 'ParameterApplyStatus': 'in-sync'}], 'AvailabilityZone': 'us-east-1c', 'DBSubnetGroup': {'DBSubnetGroupName': 'default', 'DBSubnetGroupDescription': 'default', 'VpcId': 'vpc-03fe4a0e60b95c95e', 'SubnetGroupStatus': 'Complete', 'Subnets': [{'SubnetIdentifier': 'subnet-06c5f1ed345b03ce5', 'SubnetA