<a href="https://colab.research.google.com/github/PencilNeck666/PencilNeck666/blob/main/pyscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install requests
!pip install bs4
!pip install geoip2

import requests
from bs4 import BeautifulSoup
import socket
from geoip2 import database

def get_public_ip(timeout=10):
    """
    Look up this machine's public IP address via the ipify web service.

    Args:
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The value of the "ip" field in the service's JSON reply, or None if
        the request failed, the service answered with an HTTP error status,
        or the reply could not be decoded.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get('https://api.ipify.org?format=json', timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of parsing an error body.
        response.raise_for_status()
        return response.json().get('ip')
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller handle None.
        print(f"Error getting public IP: {e}")
        return None

def gather_information(url, username=None, name=None):
    """
    Scrape one web page and pair the scraped data with GeoIP details for
    this machine's public IP address.

    Args:
        url: The URL of the website to scrape.
        username: If given, keep only scraped elements whose text contains it.
        name: If given, keep only scraped elements whose text contains it.

    Returns:
        A dictionary with the keys "scraped_data", "public_ip", "city",
        "country", "data_count", "user_handles" and "person_names".
        An empty dictionary is returned when the page fetch, the public IP
        lookup, or the GeoIP lookup fails (the error is printed).
    """
    results = {}

    # Only the network call sits in the try block; it is the only step whose
    # failures (requests.RequestException) are handled here.
    try:
        # A timeout keeps an unresponsive server from hanging the call forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Ensure we notice bad responses
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return results

    soup = BeautifulSoup(response.content, "html.parser")

    # Replace these with actual class names used on the website
    data = soup.find_all(class_="some_class")
    user_handles = soup.find_all(class_="user-handle")
    person_names = soup.find_all(class_="person-name")

    # Extract text from BeautifulSoup objects
    user_handles_text = [handle.get_text(strip=True) for handle in user_handles]
    person_names_text = [person.get_text(strip=True) for person in person_names]

    # Filter on each element's text. `x in tag` on a BeautifulSoup Tag tests
    # membership in the tag's child list, not a substring match, so the text
    # has to be extracted explicitly.
    if username:
        data = [item for item in data if username in item.get_text()]
    if name:
        data = [item for item in data if name in item.get_text()]

    # Extract IP information
    public_ip = get_public_ip()
    if not public_ip:
        return results

    try:
        # The context manager closes the database file once the lookup is done.
        with database.Reader('GeoLite2-City.mmdb') as reader:
            geo_info = reader.city(public_ip)
    except Exception as e:
        print(f"Error with GeoIP lookup: {e}")
        return results

    # Prepare the results
    results = {
        "scraped_data": data,
        "public_ip": public_ip,
        "city": geo_info.city.name,
        "country": geo_info.country.name,
        "data_count": len(data),
        "user_handles": user_handles_text,
        "person_names": person_names_text
    }
    return results

# Demo run against a placeholder page
url = "https://example.com/some_page"
results = gather_information(url)

# An empty dict means one of the stages failed; otherwise report every field
if not results:
    print("Failed to gather information.")
else:
    print(
        f"Scraped data: {results['scraped_data']}",
        f"Public IP: {results['public_ip']}",
        f"City: {results['city']}",
        f"Country: {results['country']}",
        f"Number of data points: {results['data_count']}",
        f"User handles: {results['user_handles']}",
        f"Person names: {results['person_names']}",
        sep="\n",
    )


Error fetching the URL: 500 Server Error: Internal Server Error for url: https://example.com/some_page
Failed to gather information.


In [2]:
!pip install requests
!pip install bs4
!pip install geoip2

import requests
from bs4 import BeautifulSoup
import socket
from geoip2 import database
import os

def get_public_ip(timeout=10):
    """
    Look up this machine's public IP address via the ipify web service.

    Args:
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The value of the "ip" field in the service's JSON reply, or None if
        the request failed, the service answered with an HTTP error status,
        or the reply could not be decoded.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get('https://api.ipify.org?format=json', timeout=timeout)
        response.raise_for_status()
        return response.json().get('ip')
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller handle None.
        print(f"Error getting public IP: {e}")
        return None

def gather_information(urls, username=None, name=None):
    """
    Scrape each URL and pair its data with GeoIP details for this machine's
    public IP address.

    Args:
        urls: A comma-separated string of URLs of the websites to scrape, or
            an iterable of URL strings.
        username: The user handle to filter by (optional).
        name: The person's name to filter by (optional).

    Returns:
        A list of dictionaries, one per URL that was fetched and geolocated
        successfully. Each has the keys "url", "scraped_data", "public_ip",
        "city", "country", "data_count", "user_handles" and "person_names".
        URLs whose fetch, public IP lookup, or GeoIP lookup fails are skipped
        after printing the error.

    Filtering is page-level: a page's data is kept when no filter is given,
    when `username` occurs in any user handle found on the page, or when
    `name` occurs in any person name found on the page.
    """
    results = []
    url_list = urls.split(',') if isinstance(urls, str) else list(urls)
    geoip_db_path = 'GeoLite2-City.mmdb'
    # (public_ip, city, country). The lookup describes this machine, not the
    # scraped site, so it is resolved once and reused for every URL. A failed
    # lookup is not cached, so the next URL retries it.
    location = None

    for url in url_list:
        url = url.strip()  # Remove any leading/trailing whitespace
        if not url:
            # Tolerate empty entries such as a trailing comma.
            continue

        try:
            # A timeout keeps one unresponsive server from stalling the batch.
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Ensure we notice bad responses
        except requests.RequestException as e:
            print(f"Error fetching the URL {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Replace these with actual class names used on the website
        data = soup.find_all(class_="some_class")
        user_handles = soup.find_all(class_="user-handle")
        person_names = soup.find_all(class_="person-name")

        # Extract text from BeautifulSoup objects
        user_handles_text = [handle.get_text(strip=True) for handle in user_handles]
        person_names_text = [person.get_text(strip=True) for person in person_names]

        # The match does not depend on the individual item, so decide once per
        # page. With no filter supplied, every item is kept.
        page_matches = (
            (not username and not name)
            or (username and any(username in handle for handle in user_handles_text))
            or (name and any(name in pname for pname in person_names_text))
        )
        if page_matches:
            filtered_data = [item.get_text(strip=True) for item in data]
        else:
            filtered_data = []

        # Extract IP information (first successful lookup is reused)
        if location is None:
            public_ip = get_public_ip()
            if not public_ip:
                continue

            try:
                if not os.path.exists(geoip_db_path):
                    raise FileNotFoundError(f"GeoIP database not found at path: {geoip_db_path}")
                # The context manager closes the database file after the lookup.
                with database.Reader(geoip_db_path) as reader:
                    geo_info = reader.city(public_ip)
            except Exception as e:
                print(f"Error with GeoIP lookup: {e}")
                continue

            # The city/country records are always present; it is their `name`
            # that may be None, so the fallback has to test the name itself.
            location = (
                public_ip,
                geo_info.city.name or "Unknown",
                geo_info.country.name or "Unknown",
            )

        public_ip, city, country = location

        # Prepare the result for this URL
        results.append({
            "url": url,
            "scraped_data": filtered_data,
            "public_ip": public_ip,
            "city": city,
            "country": country,
            "data_count": len(filtered_data),
            "user_handles": user_handles_text,
            "person_names": person_names_text
        })

    return results

# Demo run: two placeholder pages, filtered by a sample handle and name
urls = "https://example.com/some_page, https://anotherexample.com/another_page"
results = gather_information(urls, username="example_handle", name="Example Name")

# Report each successfully processed URL, followed by a separator line
for result in results:
    print(
        f"URL: {result['url']}",
        f"Scraped data: {result['scraped_data']}",
        f"Public IP: {result['public_ip']}",
        f"City: {result['city']}",
        f"Country: {result['country']}",
        f"Number of data points: {result['data_count']}",
        f"User handles: {result['user_handles']}",
        f"Person names: {result['person_names']}",
        "------",
        sep="\n",
    )


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting geoip2
  Downloading geoip2-4.8.0-py2.py3-none-any.whl (27 kB)
Collecting maxminddb<3.0.0,>=2.5.1 (from geoip2)
  Downloading maxminddb-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.8/87.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: maxminddb, geoip2
Successfully installed geoip2-4.8.0 maxminddb-2.6.1
Error fetching the URL https://example.com/some_page: 500 Server Error: Internal Server Error for url: https://example.com/some_page
Error fetching the URL https://anotherexample.com/another_page: HTTPSConnectionPool(host='anotherexample.com', port=443): Max retries exceeded with url: /another_page (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x79fc69cd1840>: Failed to resolve 'anothere

In [3]:
!pip install requests
!pip install bs4
!pip install geoip2




import requests
from bs4 import BeautifulSoup
import socket
from geoip2 import database
import os
import argparse

def get_public_ip(timeout=10):
    """
    Look up this machine's public IP address via the ipify web service.

    Args:
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The value of the "ip" field in the service's JSON reply, or None if
        the request failed, the service answered with an HTTP error status,
        or the reply could not be decoded.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get('https://api.ipify.org?format=json', timeout=timeout)
        response.raise_for_status()
        return response.json().get('ip')
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller handle None.
        print(f"Error getting public IP: {e}")
        return None

def gather_information(urls, username=None, name=None):
    """
    Scrape each URL and pair its data with GeoIP details for this machine's
    public IP address.

    Args:
        urls: A comma-separated string of URLs of the websites to scrape, or
            an iterable of URL strings.
        username: The user handle to filter by (optional).
        name: The person's name to filter by (optional).

    Returns:
        A list of dictionaries, one per URL that was fetched and geolocated
        successfully. Each has the keys "url", "scraped_data", "public_ip",
        "city", "country", "data_count", "user_handles" and "person_names".
        URLs whose fetch, public IP lookup, or GeoIP lookup fails are skipped
        after printing the error.

    Filtering is page-level: a page's data is kept when no filter is given,
    when `username` occurs in any user handle found on the page, or when
    `name` occurs in any person name found on the page.
    """
    results = []
    url_list = urls.split(',') if isinstance(urls, str) else list(urls)
    geoip_db_path = 'GeoLite2-City.mmdb'
    # (public_ip, city, country). The lookup describes this machine, not the
    # scraped site, so it is resolved once and reused for every URL. A failed
    # lookup is not cached, so the next URL retries it.
    location = None

    for url in url_list:
        url = url.strip()  # Remove any leading/trailing whitespace
        if not url:
            # Tolerate empty entries such as a trailing comma.
            continue

        try:
            # A timeout keeps one unresponsive server from stalling the batch.
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Ensure we notice bad responses
        except requests.RequestException as e:
            print(f"Error fetching the URL {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Replace these with actual class names used on the website
        data = soup.find_all(class_="some_class")
        user_handles = soup.find_all(class_="user-handle")
        person_names = soup.find_all(class_="person-name")

        # Extract text from BeautifulSoup objects
        user_handles_text = [handle.get_text(strip=True) for handle in user_handles]
        person_names_text = [person.get_text(strip=True) for person in person_names]

        # The match does not depend on the individual item, so decide once per
        # page. With no filter supplied, every item is kept.
        page_matches = (
            (not username and not name)
            or (username and any(username in handle for handle in user_handles_text))
            or (name and any(name in pname for pname in person_names_text))
        )
        if page_matches:
            filtered_data = [item.get_text(strip=True) for item in data]
        else:
            filtered_data = []

        # Extract IP information (first successful lookup is reused)
        if location is None:
            public_ip = get_public_ip()
            if not public_ip:
                continue

            try:
                if not os.path.exists(geoip_db_path):
                    raise FileNotFoundError(f"GeoIP database not found at path: {geoip_db_path}")
                # The context manager closes the database file after the lookup.
                with database.Reader(geoip_db_path) as reader:
                    geo_info = reader.city(public_ip)
            except Exception as e:
                print(f"Error with GeoIP lookup: {e}")
                continue

            # The city/country records are always present; it is their `name`
            # that may be None, so the fallback has to test the name itself.
            location = (
                public_ip,
                geo_info.city.name or "Unknown",
                geo_info.country.name or "Unknown",
            )

        public_ip, city, country = location

        # Prepare the result for this URL
        results.append({
            "url": url,
            "scraped_data": filtered_data,
            "public_ip": public_ip,
            "city": city,
            "country": country,
            "data_count": len(filtered_data),
            "user_handles": user_handles_text,
            "person_names": person_names_text
        })

    return results

def main(argv=None):
    """
    Command-line entry point: parse arguments, scrape the URLs, and print
    every result.

    Args:
        argv: Argument strings to parse. None (the default) makes argparse
            read sys.argv[1:], which is the right behaviour for a script.
            Inside a notebook kernel sys.argv holds the kernel's own flags
            (e.g. -f), which this parser rejects, so pass an explicit list
            there, e.g. main(["https://example.com", "--username", "handle"]).
    """
    parser = argparse.ArgumentParser(description='Scrape data from websites and gather IP information.')
    parser.add_argument('urls', type=str, help='Comma-separated URLs of the websites to scrape.')
    parser.add_argument('--username', type=str, help='User handle to filter by.', default=None)
    parser.add_argument('--name', type=str, help="Person's name to filter by.", default=None)

    args = parser.parse_args(argv)

    results = gather_information(args.urls, username=args.username, name=args.name)

    # Print the results
    for result in results:
        print(f"URL: {result['url']}")
        print(f"Scraped data: {result['scraped_data']}")
        print(f"Public IP: {result['public_ip']}")
        print(f"City: {result['city']}")
        print(f"Country: {result['country']}")
        print(f"Number of data points: {result['data_count']}")
        print(f"User handles: {result['user_handles']}")
        print(f"Person names: {result['person_names']}")
        print("------")

if __name__ == "__main__":
    main()




usage: colab_kernel_launcher.py [-h] [--username USERNAME] [--name NAME] urls
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
