In [17]:
# help me make a scraper for MFN news data from the website using AWS Lambda
# https://www.mfn.se/nyheter/

import requests
from bs4 import BeautifulSoup
import json
import boto3
import os
import datetime
import logging

# set up logging: getLogger() with no name returns the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# set up the s3 client
s3 = boto3.client('s3')

# set up the bucket name
# The bucket is read from the environment so it is never hard-coded. A missing
# variable still raises KeyError, but with a message that says how to fix it.
try:
    bucket_name = os.environ['BUCKET_NAME']
except KeyError:
    raise KeyError(
        "BUCKET_NAME environment variable is not set; define it in the Lambda "
        "configuration, or run `%env BUCKET_NAME=<your-bucket>` before this "
        "cell when working in a notebook"
    ) from None

# set up the url of the news listing page to scrape
url = 'https://www.mfn.se/nyheter/'

# set up the headers sent with the page request (browser-like User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# set up the function
def lambda_handler(event, context):
    """Scrape the news listing page and store the extracted items in S3.

    Downloads the module-level ``url`` using ``headers``, collects the title,
    link and date of every ``<div class="news-item">`` element, and writes the
    resulting list as JSON to the key ``news_data.json`` in ``bucket_name``.

    Args:
        event: Invocation event; not used.
        context: Invocation context; not used.

    Returns:
        dict: ``{'statusCode': 200, 'body': <JSON-encoded confirmation string>}``.

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled connection blocks until the function itself
    # is killed; (connect, read) limits in seconds keep the failure explicit.
    response = requests.get(url, headers=headers, timeout=(5, 20))
    # Stop on HTTP errors instead of parsing an error page and overwriting the
    # previously stored data with an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # get the news data
    news_data = []
    for news in soup.find_all('div', class_='news-item'):
        anchor = news.find('a')
        # find() returns None when the element is absent; skip malformed items
        # rather than aborting the whole run.
        if anchor is None or not anchor.get('href'):
            logger.warning('Skipping news item without a link')
            continue
        date_tag = news.find('span', class_='date')
        news_data.append({
            'title': anchor.get_text(strip=True),
            'link': anchor['href'],
            'date': date_tag.get_text(strip=True) if date_tag is not None else None
        })

    if not news_data:
        # Nothing matched the selectors; the page markup may have changed.
        logger.warning('No news items found at %s', url)

    # save the news data to s3
    file_name = 'news_data.json'
    s3.put_object(
        Bucket=bucket_name,
        Key=file_name,
        Body=json.dumps(news_data),
        ContentType='application/json',
    )
    # log the success
    logger.info('News data saved to s3')
    return {
        'statusCode': 200,
        'body': json.dumps('News data saved to s3')
    }

# test the function
# Guarded so the scrape and the S3 read-back only run when this code is
# executed directly (a notebook kernel or `python file.py`, where __name__ is
# '__main__'), not every time the module is imported as a Lambda handler.
if __name__ == '__main__':
    lambda_handler(None, None)

    # check the s3 bucket: read back the object the handler just wrote
    response = s3.get_object(Bucket=bucket_name, Key='news_data.json')
    news_data = json.loads(response['Body'].read())
    print(news_data)

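# check the logs (after the function has run in AWS Lambda):
#   1. go to CloudWatch Logs in the AWS Management Console
#   2. click on the log group of the Lambda function (/aws/lambda/<function-name>)
#   3. click on the most recent log stream
#   4. click on a log event to expand it
#   5. check the log messages, e.g. 'News data saved to s3'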

'''

## Step 5: Schedule the Lambda Function

Now that we have the Lambda function, we can schedule it to run at regular intervals. This will allow us to scrape the news data from the website automatically.

Here's how you can schedule the Lambda function:

1. Go to the AWS Management Console and navigate to the Lambda service.
2. Click on the Lambda function that you created earlier.
3. Click on the "Add trigger" button.
4. Select "CloudWatch Events/EventBridge" as the trigger type.
5. Click on the "Create a new rule" button.
6. Enter a name for the rule, such as "ScrapeNewsData".
7. Set the schedule expression to the desired interval, e.g., `rate(1 day)`.
8. Click on the "Add" button to add the trigger.
9. Click on the "Save" button to save the changes.

Now the Lambda function will run at the specified interval and scrape the news data from the website.

## Conclusion

In this tutorial, we have learned how to scrape news data from a website using AWS Lambda. We have set up a Lambda function that scrapes the news data from the website and saves it to an S3 bucket. We have also scheduled the Lambda function to run at regular intervals using CloudWatch Events/EventBridge.

This approach allows us to automate the process of scraping news data from the website and store it in a centralized location for further analysis. It can be useful for monitoring news updates, tracking trends, and generating insights from the data.
'''


KeyError: 'BUCKET_NAME'

In [None]:
pip install bs4

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
