In [94]:
import requests
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from urllib.parse import urlparse

# Step 1: Fetch metadata from a URL
def fetch_metadata(url, timeout=10):
    """Download a web page and extract its title, description and keywords.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        A dict with the keys ``"title"``, ``"description"`` and ``"keywords"``
        (each a string, with a placeholder when the page does not provide the
        value), or ``None`` if the request failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx answers as failures
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # get_text() copes with an empty <title> or one that contains nested tags,
    # both of which make `.string` evaluate to None.
    title = soup.title.get_text(strip=True) if soup.title else ""

    return {
        "title": title or "No title",
        "description": _meta_content(soup, "description", "No description"),
        "keywords": _meta_content(soup, "keywords", "N/A"),
    }


def _meta_content(soup, name, default):
    """Return the stripped ``content`` of ``<meta name=...>`` or *default*."""
    tag = soup.find("meta", attrs={"name": name})
    if tag is None:
        return default
    # A <meta> tag may lack the content attribute entirely; .get() avoids the
    # KeyError that subscripting would raise in that case.
    content = tag.get("content")
    if not content:
        return default
    return content.strip() or default

# Step 2: Analyze metadata for flagged keywords
def analyze_data(metadata):
    """Return the flagged keywords that occur in the metadata description.

    Args:
        metadata: Dict that may hold a ``"description"`` entry, or a falsy
            value (``None`` / empty dict) when nothing was fetched.

    Returns:
        A list of the flagged keywords found in the description, in the order
        they are listed below. Empty when there is no metadata or no
        description text.
    """
    if not metadata:
        return []

    # Suspicious or dangerous keywords to flag
    flagged_keywords = [
        "malware", "attack", "breach", "phishing", "ransomware", "spyware",
        "hacking", "vulnerability", "exploit", "trojan", "scam", "botnet",
        "data leak",
    ]

    # A missing or None description is treated as empty text instead of
    # raising KeyError / AttributeError.
    description = str(metadata.get("description") or "").lower()

    # Plain substring match, case-insensitive: "attack" also matches
    # "attacker".
    return [kw for kw in flagged_keywords if kw in description]

# Step 3: Generate the report and save it as a PDF
def generate_report(data, flagged_keywords, filename="osint_report.pdf"):
    """Write the collected metadata and flagged keywords to a PDF file.

    Each field is wrapped to the printable width of the page, and a new page
    is started when the text reaches the bottom margin, so long descriptions
    or keyword lists are not cut off at the page edge.

    Args:
        data: Dict with the keys ``"title"``, ``"description"`` and
            ``"keywords"``.
        flagged_keywords: Iterable of flagged keyword strings; may be empty.
        filename: Path of the PDF file to create.
    """
    # Local import: only this function needs the text-wrapping helper.
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left, top, bottom, right_margin = 100, 750, 72, 72
    line_height = 20
    page_width = letter[0]
    max_width = page_width - left - right_margin

    c = canvas.Canvas(filename, pagesize=letter)
    c.setFont(font_name, font_size)

    flagged_text = ", ".join(flagged_keywords) if flagged_keywords else "None"
    fields = [
        f"OSINT Report Summary: {data['title']}",
        f"Description: {data['description']}",
        f"Keywords: {data['keywords']}",
        f"Flagged Keywords: {flagged_text}",
    ]

    y = top
    for text in fields:
        # drawString() never wraps, so split each field into lines that fit.
        for line in simpleSplit(text, font_name, font_size, max_width) or [""]:
            if y < bottom:
                c.showPage()
                # The font setting does not carry over to a new page.
                c.setFont(font_name, font_size)
                y = top
            c.drawString(left, y, line)
            y -= line_height

    # Save the PDF
    c.save()

# Main function: Fetch metadata, analyze for flagged keywords, and generate the report
def main(url):
    """Fetch a page's metadata, flag suspicious keywords and write a PDF report.

    The report is saved as ``"Osint Report <host>.pdf"`` in the current
    working directory, where ``<host>`` is derived from the URL's network
    location. Prints a message and returns early if the page cannot be
    fetched.

    Args:
        url: Address of the page to analyze.
    """
    metadata = fetch_metadata(url)
    if metadata is None:
        print("Failed to fetch metadata. Exiting.")
        return

    flagged_keywords = analyze_data(metadata)

    # Network location of the URL, e.g. "www.example.com" or "user@host:8080"
    netloc = urlparse(url).netloc

    # Drop any "user:password@" prefix so credentials never reach the filename.
    domain = netloc.rpartition("@")[2]

    # Strip only a leading "www."; a blanket replace() would also mangle hosts
    # that merely contain that substring.
    if domain.lower().startswith("www."):
        domain = domain[len("www."):]

    # Characters such as ":" (port separator) are not valid in filenames on
    # every platform, so anything outside a conservative set becomes "_".
    domain = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in domain)
    if not domain:
        domain = "unknown"

    filename = f"Osint Report {domain}.pdf"
    generate_report(metadata, flagged_keywords, filename=filename)
    print(f"Report generated: {filename}")

# Example URL to test. The guard keeps the network request and PDF write from
# running when this file is imported; running it as a script or notebook cell
# behaves as before.
if __name__ == "__main__":
    url = "https://www.cybereason.com/"  # Replace with any URL you'd like to analyze
    main(url)


Report generated: Osint Report cybereason.com.pdf
