<a href="https://colab.research.google.com/github/PencilNeck666/PencilNeck666/blob/main/Pyscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests
!pip install bs4
!pip install geoip2




import requests
from bs4 import BeautifulSoup
import socket
from geoip2 import database
import os
import argparse

def get_public_ip(timeout=10):
    """Return this machine's public IP address as reported by api.ipify.org.

    Args:
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The value of the ``ip`` field of the service's JSON response, or
        ``None`` if the request fails or the response is not a JSON object.
    """
    try:
        response = requests.get('https://api.ipify.org?format=json', timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/HTTP/timeout failures.
        # ValueError: the body could not be decoded as JSON.
        print(f"Error getting public IP: {e}")
        return None

    if not isinstance(payload, dict):
        # Keep the lookup best-effort: an unexpected JSON shape is reported, not raised.
        print(f"Error getting public IP: unexpected response {payload!r}")
        return None
    return payload.get('ip')

def gather_information(urls, username=None, name=None,
                       geoip_db_path='GeoLite2-City.mmdb', timeout=10):
    """Scrape several web pages and attach GeoIP details of this machine's public IP.

    The public IP (from ``get_public_ip``) and its GeoIP record are the same
    for every URL, so they are resolved once up front. If either lookup fails
    no result can be built and an empty list is returned without fetching any
    page.

    Args:
        urls: A comma-separated string of URLs of the websites to scrape.
            Empty entries (e.g. from a trailing comma) are ignored.
        username: The user handle to filter by (optional).
        name: The person's name to filter by (optional).
        geoip_db_path: Path to the GeoLite2 City database file.
        timeout: Seconds to wait for each page request.

    Returns:
        A list of dictionaries, one per successfully fetched URL, with the keys
        ``url``, ``scraped_data``, ``public_ip``, ``city``, ``country``,
        ``data_count``, ``user_handles`` and ``person_names``.

        Filtering is page-level: the page's data items are kept when any user
        handle on the page contains ``username`` or any person name on the page
        contains ``name``. When neither filter is given, all items are kept.
    """
    url_list = [part.strip() for part in urls.split(',')]
    url_list = [url for url in url_list if url]
    if not url_list:
        return []

    # Resolve the IP information once; it does not vary per URL.
    public_ip = get_public_ip()
    if not public_ip:
        return []

    try:
        if not os.path.exists(geoip_db_path):
            raise FileNotFoundError(f"GeoIP database not found at path: {geoip_db_path}")
        # The context manager closes the database file once the lookup is done.
        with database.Reader(geoip_db_path) as reader:
            geo_info = reader.city(public_ip)
    except Exception as e:
        # Deliberately broad: a missing file, a corrupt database or an address
        # absent from the database all mean "no GeoIP data available".
        print(f"Error with GeoIP lookup: {e}")
        return []

    # Fall back to "Unknown" both when the record is missing and when the
    # record exists but carries no name.
    city = (geo_info.city.name if geo_info.city else None) or "Unknown"
    country = (geo_info.country.name if geo_info.country else None) or "Unknown"

    results = []
    for url in url_list:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Ensure we notice bad responses
        except requests.RequestException as e:
            print(f"Error fetching the URL {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Replace these with actual class names used on the website
        data = soup.find_all(class_="some_class")
        user_handles = soup.find_all(class_="user-handle")
        person_names = soup.find_all(class_="person-name")

        # Extract text from BeautifulSoup objects
        user_handles_text = [handle.get_text(strip=True) for handle in user_handles]
        person_names_text = [person.get_text(strip=True) for person in person_names]

        # The match does not depend on the individual data item, so evaluate it
        # once per page instead of once per item.
        no_filter = not username and not name
        page_matches = bool(
            (username and any(username in handle for handle in user_handles_text))
            or (name and any(name in pname for pname in person_names_text))
        )
        if no_filter or page_matches:
            filtered_data = [item.get_text(strip=True) for item in data]
        else:
            filtered_data = []

        results.append({
            "url": url,
            "scraped_data": filtered_data,
            "public_ip": public_ip,
            "city": city,
            "country": country,
            "data_count": len(filtered_data),
            "user_handles": user_handles_text,
            "person_names": person_names_text,
        })

    return results

def main(argv=None):
    """Parse command-line arguments, run the scraper and print each result.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Pass this when calling from a notebook, where
            ``sys.argv`` holds the kernel launcher's own flags (such as
            ``-f``) and argparse exits with "unrecognized arguments".
            Example: ``main(['https://example.com', '--username', 'bob'])``.

    Raises:
        SystemExit: Raised by argparse on ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Scrape data from websites and gather IP information.')
    parser.add_argument('urls', type=str, help='Comma-separated URLs of the websites to scrape.')
    parser.add_argument('--username', type=str, help='User handle to filter by.', default=None)
    parser.add_argument('--name', type=str, help="Person's name to filter by.", default=None)

    # argv=None makes argparse fall back to sys.argv[1:], preserving CLI behavior.
    args = parser.parse_args(argv)

    results = gather_information(args.urls, username=args.username, name=args.name)

    # Print the results
    for result in results:
        print(f"URL: {result['url']}")
        print(f"Scraped data: {result['scraped_data']}")
        print(f"Public IP: {result['public_ip']}")
        print(f"City: {result['city']}")
        print(f"Country: {result['country']}")
        print(f"Number of data points: {result['data_count']}")
        print(f"User handles: {result['user_handles']}")
        print(f"Person names: {result['person_names']}")
        print("------")

# Script entry point: arguments are taken from the command line.
if __name__ == "__main__":
    main()




usage: colab_kernel_launcher.py [-h] [--username USERNAME] [--name NAME] urls
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
