In [1]:
# Step 1: Clone the Sublist3r repository from GitHub
!git clone https://github.com/aboul3la/Sublist3r.git

# Step 2: Navigate to the Sublist3r directory and install required Python packages
%cd Sublist3r
!pip install -r requirements.txt


In [2]:

# Install additional dependencies for Selenium, NLP, and ChromeDriver
!pip install selenium
!pip install transformers
!pip install torch

# Update the system and install Chromium and ChromeDriver
!apt-get update
!apt-get install -y chromium-browser
!apt-get install -y chromium-chromedriver

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [3]:
# Check the path of the installed ChromeDriver
!which chromedriver

/usr/bin/chromedriver


In [7]:

# Import necessary libraries
import textwrap
import re
import csv
import os
import torch
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from transformers import pipeline, BertTokenizer, BertForSequenceClassification

# Define the Domain Analysis class, which extends Selenium's Chrome WebDriver
class Domain_Analyse(webdriver.Chrome):
    def __init__(self, driver_path, teardown=False):
        self.driver_path = driver_path  # Path to ChromeDriver
        self.teardown = teardown  # Whether to close the browser after use

        # Set up Chrome options for headless mode (no GUI)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        os.environ["PATH"] += os.pathsep + self.driver_path  # Add ChromeDriver to PATH

        # Load NLP models for summarization and sentiment analysis
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.sentiment_pipeline = pipeline("sentiment-analysis")

        # Load BERT model and tokenizer for intent classification
        self.model_name = "bert-base-uncased"
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, num_labels=2)

        # Initialize the Chrome WebDriver with the configured options
        super().__init__(options=chrome_options)

    # Method to enumerate subdomains using Sublist3r
    def enumerate_subdomains(self, domain):
        try:
            print(f"Enumerating subdomains for {domain}...")
            # Run Sublist3r to find subdomains and save the output to a file
            !python sublist3r.py -d {domain_name} -o output.txt
            subdomains = []
            # Read the subdomains from the output file
            with open('output.txt', 'r') as file:
                subdomains = [line.strip() for line in file.readlines() if line.strip()]

            print(f"Found {len(subdomains)} subdomains for {domain}.")
            return subdomains
        except Exception as e:
            print(f"An error occurred while enumerating subdomains for {domain}: {e}")
            return []

    # Method to find the URL of the terms and conditions page
    def find_terms_url(self, url):
        self.get(url)  # Navigate to the given URL
        self.implicitly_wait(10)  # Wait for the page to load
        # List of keywords to look for in the links
        keywords = [
            "terms", "our-privacy-policy", "conditions", "privacy-policy", "policies-procedures",
            "terms of service", "user agreement", "data protection",
            "personal data", "data collection", "data usage",
            "user consent", "data security",
            "third-party sharing", "cookies policy", "policies-procedures"
        ]
        # Find all links on the page
        links = self.find_elements(By.TAG_NAME, "a")
        # Check each link for the presence of keywords
        for link in links:
            href = link.get_attribute("href")
            if any(href.lower().endswith(f"{keyword.lower()}/") for keyword in keywords):
                return href  # Return the URL if a match is found
        return None  # Return None if no matching URL is found

    # Method to scrape the terms and conditions text from a URL
    def scrape_terms(self, url):
        try:
            # Find the terms and conditions URL
            terms_url = self.find_terms_url(url)
            if terms_url:
                self.get(terms_url)  # Navigate to the terms URL
                self.implicitly_wait(10)  # Wait for the page to load
                # Extract the text from the page body
                policy_text = self.find_element(By.TAG_NAME, "body").text
                return policy_text, terms_url
            else:
                return "Terms and Conditions not found.", None
        except Exception as e:
            print(f"Error scraping terms: {e}")
            return ""

    # Method to preprocess text by removing extra spaces and punctuation
    def preprocess_text(self, text):
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text.strip()

    # Method to classify the intent of the text using BERT
    def classify_intent(self, text):
        # Tokenize the text and truncate it to 512 tokens
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()  # Get the predicted class
        return predicted_class  # 0 -> User Protection, 1 -> Data Exploitation

    # Method to summarize the policy text using BART
    def summarize_policy(self, text):
        # Summarize the text, truncating to fit within the model's limit
        summary = self.summarizer(text, max_length=512, min_length=10, do_sample=False)
        return summary[0]['summary_text']

    # Method to analyze the sentiment of the text
    def analyze_sentiment(self, text):
        result = self.sentiment_pipeline(text)  # Analyze sentiment
        return result[0]  # Return the sentiment result

    # Method to split text into chunks of 512 tokens for processing
    def split_text_into_chunks(self, text, max_tokens=512):
        tokens = self.tokenizer.encode(text, truncation=True, max_length=max_tokens)  # Tokenize the text
        chunks = []
        # Split the tokens into chunks
        for i in range(0, len(tokens), max_tokens):
            chunk = tokens[i:i + max_tokens]
            chunks.append(self.tokenizer.decode(chunk))  # Decode tokens back to text
        return chunks

    # Method to analyze the policy text (sentiment, intent, and summary)
    def analyze_policy(self, text):
        preprocessed_text = self.preprocess_text(text)  # Preprocess the text

        # Analyze sentiment
        sentiment = self.analyze_sentiment(preprocessed_text)

        # Split text into chunks if it exceeds the token limit
        chunks = self.split_text_into_chunks(preprocessed_text)

        # Classify intent for each chunk
        intents = [self.classify_intent(chunk) for chunk in chunks]
        intent = "User Protection" if intents.count(0) > intents.count(1) else "Data Exploitation"

        # Summarize the policy
        summary = self.summarize_policy(preprocessed_text)

        return {
            "Sentiment": sentiment,
            "Intent": intent,
            "Summary": summary,
        }

    # Method to clean up resources when exiting the context manager
    def __exit__(self, exc_type, exc_value, traceback):
        if self.teardown:
            self.quit()  # Close the browser

# Helper function to print text in a wrapped format
def print_wrapped_text(text, width=80):
    print("\n".join(textwrap.wrap(text, width=width)))

# Main function to execute the script
if __name__ == "__main__":
    driver_path = r"/usr/bin/chromedriver"  # Path to ChromeDriver
    with open(r"/content/top-1m.csv", mode='r') as file:  # Open the CSV file containing domains
        with Domain_Analyse(driver_path=r"/usr/bin/chromedriver") as DA:  # Initialize the Domain_Analyse class
            csv_reader = csv.reader(file)  # Read the CSV file
            for index, row in enumerate(csv_reader):
                if index >= 1:  # Process only the first domain for demonstration
                    break
                domain_name = row[1]  # Get the domain name from the CSV
                subdomain_list = DA.enumerate_subdomains(domain_name)  # Enumerate subdomains

                # Print the list of subdomains
                for ele in subdomain_list:
                    print(ele)
                print("\n")

               # Analyze the terms and conditions for each subdomain
                for subdomain in subdomain_list:
                    # Assign the results of DA.scrape_terms to a single variable
                    result = DA.scrape_terms(f"https://{subdomain}")
                    # Check if the result contains two values before unpacking
                    if result and len(result) == 2:
                        policy_identified, terms_url = result
                        print(f"{terms_url} \n")
                        print_wrapped_text(policy_identified)  # Print the policy text

                        # Analyze the policy and print the results
                        result = DA.analyze_policy(policy_identified)
                        for key, value in result.items():
                            print(f"{key} -- {value}")
                    else:
                        # Handle the case where DA.scrape_terms doesn't return two values
                        print(f"Could not find terms and conditions for {subdomain}")

Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enumerating subdomains for termly.io...
[91m
                 ____        _     _ _     _   _____
                / ___| _   _| |__ | (_)___| |_|___ / _ __
                \___ \| | | | '_ \| | / __| __| |_ \| '__|
                 ___) | |_| | |_) | | \__ \ |_ ___) | |
                |____/ \__,_|_.__/|_|_|___/\__|____/|_|[0m[93m

                # Coded By Ahmed Aboul-Ela - @aboul3la
    
[94m[-] Enumerating subdomains now for termly.io[0m
[92m[-] Searching now in Baidu..[0m
[92m[-] Searching now in Yahoo..[0m
[92m[-] Searching now in Google..[0m
[92m[-] Searching now in Bing..[0m
[92m[-] Searching now in Ask..[0m
[92m[-] Searching now in Netcraft..[0m
[92m[-] Searching now in DNSdumpster..[0m
[92m[-] Searching now in Virustotal..[0m
[92m[-] Searching now in ThreatCrowd..[0m
[92m[-] Searching now in SSL Certificates..[0m
[92m[-] Searching now in PassiveDNS..[0m
Process DNSdumpster-8:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiproce

Your max_length is set to 512, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9978312849998474}
Intent -- User Protection
Summary -- CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.
Error scraping terms: 'NoneType' object has no attribute 'lower'
Could not find terms and conditions for us.consent.api.termly.io
https://termly.io/our-privacy-policy/ 

Open Navigation menu   Home  ›  Termly’s Privacy Notice Termly’s Privacy Notice
Company About Us Careers Updates and Press Our Privacy Center Our Privacy Policy
Our Terms of Use Our Disclaimer Our Cookie Policy Our Sub-Processors Limit the
Use of My Sensitive Personal Information Do Not Sell or Share My Information
Products Privacy Policy Generator Terms and Conditions Generator EULA Generator
Impressum Generator Refund & Return Policy Shipping Policy Generator Disclaimer
Generator Consent Management Platform Cookie Consent Cookie Ban

Your max_length is set to 512, but your input_length is only 211. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=105)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9917758703231812}
Intent -- User Protection
Summary -- Termlys is not a lawyer or a law firm and does not engage in the practice of law or provide legal advice or legal representation. All information software services and comments provided on the site are for informational and selfhelp purposes only and are not intended to be a substitute for professional legal advice. Use of this site is subject to our Terms of Use.
Error scraping terms: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=134.0.6998.88)
Stacktrace:
#0 0x57ecc625aa1a <unknown>
#1 0x57ecc5d12390 <unknown>
#2 0x57ecc5d09518 <unknown>
#3 0x57ecc5cf9a39 <unknown>
#4 0x57ecc5cfb73d <unknown>
#5 0x57ecc5cf9dce <unknown>
#6 0x57ecc5cf9775 <unknown>
#7 0x57ecc5cf9449 <unknown>
#8 0x57ecc5cf7159 <unknown>
#9 0x57ecc5cf7a2a <unknown>
#10 0x57ecc5d15829 <unknown>
#11 0x57ecc5db0d85 <unknown>
#12 0x57ecc5d89bd2 <unknown>
#13 0x57ecc5db007b <unknown>
#14 0x57ecc5

Your max_length is set to 512, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9978312849998474}
Intent -- User Protection
Summary -- CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.
None 

Terms and Conditions not found.


Your max_length is set to 512, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9978312849998474}
Intent -- User Protection
Summary -- CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.
https://termlyv3dev.wpengine.com/our-privacy-policy/ 

Open Navigation menu   Home  ›  Termly’s Privacy Notice Termly’s Privacy Notice
Company About Us Careers Updates and Press Our Privacy Center Our Privacy Policy
Our Terms of Use Our Disclaimer Our Cookie Policy Our Sub-Processors Limit the
Use of My Sensitive Personal Information Do Not Sell or Share My Information
Products Privacy Policy Generator Terms and Conditions Generator EULA Generator
Refund & Return Policy Shipping Policy Generator Disclaimer Generator Consent
Management Platform Cookie Consent Cookie Banner Cookie Policy Generator Cookie
Scanner Support Help and Support FAQs Contact Us Pricing Partner with Us
Resources Cookie Preference

Your max_length is set to 512, but your input_length is only 208. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=104)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9896798133850098}
Intent -- User Protection
Summary -- Termlys is not a lawyer or a law firm and does not engage in the practice of law or provide legal advice or legal representation. All information software services and comments provided on the site are for informational and selfhelp purposes only and are not intended to be a substitute for professional legal advice. Use of this site is subject to our Terms of Use.
https://staging.termly.io/our-privacy-policy/ 

We use essential cookies to make our site work. With your consent, we may also
use non-essential cookies to improve user experience, personalize content,
customize advertisements, and analyze website traffic. For these reasons, we may
share your site usage data with our social media, advertising, and analytics
partners. By clicking ”Accept,” you agree to our website's cookie use as
described in our Cookie Policy. You can change your cookie settings at any time
by clicking “Prefer

Your max_length is set to 512, but your input_length is only 293. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=146)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9959775805473328}
Intent -- Data Exploitation
Summary -- We use essential cookies to make our site work With your consent we may also use nonessential cookies to improve user experience personalize content customize advertisements and analyze website traffic. For these reasons we may share your site usage data with our social media advertising and analytics partners By clicking Accept you agree to our websites cookie use as described in our Cookie Policy.
None 

Terms and Conditions not found.


Your max_length is set to 512, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9978312849998474}
Intent -- User Protection
Summary -- CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.
None 

Terms and Conditions not found.


Your max_length is set to 512, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Sentiment -- {'label': 'NEGATIVE', 'score': 0.9978312849998474}
Intent -- User Protection
Summary -- CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.
