## Importing and Scraping the Content

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the webpage content
url = 'https://aps-india.de/news/'
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, timeout=30)
# Stop on an HTTP error (4xx/5xx) so an error page is never parsed as if it
# were the news page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

# Find all strong and span tags
tags = soup.find_all(['strong', 'span'])

# Keep the stripped text of every tag that contains the upper-case marker
# 'UPDATE' (the match is case-sensitive).
tag_texts = (tag.get_text(strip=True) for tag in tags)
new_data = [text for text in tag_texts if 'UPDATE' in text]

# Convert the new scraped data to a DataFrame with a single 'Text' column
new_df = pd.DataFrame(new_data, columns=['Text'])

#new_df = new_df.iloc[1:, :] 

#new_df.to_csv('extracted_updates.csv', index = False)

In [2]:
# Display the scraped rows for a visual check
new_df

Unnamed: 0,Text
0,UPDATE 10 JULY 2024:We would like to inform yo...
1,UPDATE 10 JULY 2024:
2,UPDATE 03 JUNE 2024:
3,UPDATE 14 MAY 2024:
4,UPDATE 31 OCT 2023:
5,UPDATE 21 JUN 2023:
6,UPDATE 26 APR 2023:
7,UPDATE 21 APR 2023:
8,UPDATE 07 MAR 2023:
9,UPDATE 08 FEB 2023:


In [3]:
# Look up the 'Text' value of the row labelled 20
new_df.loc[20, 'Text']

'UPDATE'

## Comparing with the Previous Output to Detect Updates

In [4]:
# Load the old CSV file with the previously scraped data.
# On a first run the baseline file does not exist yet; start from an empty
# frame with the same 'Text' column so every scraped row counts as new.
try:
    old_df = pd.read_csv('extracted_updates.csv')
except FileNotFoundError:
    old_df = pd.DataFrame(columns=['Text'])

In [5]:
# Remove repeats inside each snapshot first. drop_duplicates(keep=False) below
# discards every copy of a repeated row, so a text that occurs twice in the
# fresh scrape alone would otherwise be thrown away instead of being reported.
old_unique_df = old_df.drop_duplicates()
new_unique_df = new_df.drop_duplicates()

# Concatenate the old and new data
combined_df = pd.concat([old_unique_df, new_unique_df])

# Rows present in both snapshots now appear exactly twice and are removed;
# rows present in only one snapshot (new or unmatched) remain.
new_unmatched_df = combined_df.drop_duplicates(keep=False)

# Check if there are any new/unmatched rows
if not new_unmatched_df.empty:
    print("There are new updates or unmatched rows!")
else:
    print("No new updates found.")

There are new updates or unmatched rows!


In [6]:
# Write the new/unmatched rows to their own CSV file, without the index column
new_unmatched_df.to_csv(path_or_buf='new_updates.csv', index=False)

In [8]:
import yagmail
import os

def send_email(subject, body, to_email):
    """Send an email through yagmail using credentials from the environment.

    The sender address and password are read from the EMAIL_USER and
    EMAIL_PASSWORD environment variables.

    Args:
        subject: Subject line of the message.
        body: Message contents, passed to yagmail as ``contents``.
        to_email: Recipient address, or a list of addresses.
    """
    # Get email and password from environment variables
    email_user = os.getenv('EMAIL_USER')
    email_password = os.getenv('EMAIL_PASSWORD')

    # The context manager closes the SMTP connection when the block exits,
    # even if sending raises.
    with yagmail.SMTP(email_user, email_password) as yag:
        yag.send(to=to_email, subject=subject, contents=body)

# Convert new/unmatched rows to string for the email body
if not new_unmatched_df.empty:
    df_string = new_unmatched_df.to_string(index=False)

    email_body = (
        "This is an automated mail. Here are the results for the scheduled trigger:\n\n"
        + df_string  # Include DataFrame content
        + "\n\nYou can review further details at the following URL:\n"
        + "https://aps-india.de/news/"  # Add your URL
    )

    # EMAIL_RECIPIENTS is a comma-separated list. Default to '' so an unset
    # variable does not fail on None.split, and drop padding / empty entries.
    recipient_emails = [
        address.strip()
        for address in os.getenv('EMAIL_RECIPIENTS', '').split(',')
        if address.strip()
    ]
    if not recipient_emails:
        # Fail loudly with an actionable message rather than an AttributeError.
        raise RuntimeError(
            "EMAIL_RECIPIENTS is not set or contains no addresses; "
            "cannot send the update notification."
        )

    send_email('APS News Updates', email_body, recipient_emails)
else:
    print("No new updates to send via email.")




In [9]:
# Persist the current scrape as the baseline for the next comparison
new_df.to_csv(path_or_buf='extracted_updates.csv', index=False)