In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Sample data with labeled URLs
data = pd.DataFrame({
    'url': [
        'https://example.com/',
        'https://amazon.com/',
        'https://privacyexample.com/',
        'https://nontransparent.com/',
        'https://flipkart.com/'
    ],
    'transparency_label': [0, 1, 1, 0, 1]  # 0: not transparent, 1: transparent
})

# Feature extraction: 'contains privacy' and 'length of URL'
data['contains_privacy'] = data['url'].apply(lambda x: 'privacy' in x.lower())
data['url_length'] = data['url'].apply(len)

# Define a threshold for URL length
url_length_threshold = 25
data['long_url'] = data['url_length'] > url_length_threshold

# Features and labels
X = data[['contains_privacy', 'long_url']]
y = data['transparency_label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")


Accuracy: 0.0, Precision: 0.0, Recall: 0.0, F1-score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import requests
from bs4 import BeautifulSoup
import logging

logging.basicConfig(level=logging.INFO)

def fetch_webpage(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch the web page. Error: {e}")
        return None

def extract_and_highlight_terms(html_content, keywords=['terms', 'conditions']):
    if not html_content:
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    terms_elements = soup.select('p, a, h1, h2, h3, h4, h5, h6, span, div')

    filtered_text = ""
    for element in terms_elements:
        text_content = element.get_text()

        # Check if the element contains user-specified keywords
        if any(keyword in text_content.lower() for keyword in keywords):
            filtered_text += f"[TERMS AND CONDITIONS: {text_content}]\n"
        elif element.name == 'a' and element.get('href'):
            filtered_text += f"[{text_content} - {element['href']}]\n"
        else:
            filtered_text += text_content + '\n'

    # Remove extra spaces between lines
    filtered_text = '\n'.join(line.strip() for line in filtered_text.splitlines() if line.strip())
    return filtered_text

def generate_report(highlighted_terms):
    if not highlighted_terms:
        return "No terms and conditions found on the webpage."

    # Simple example: Count the occurrences of the word "privacy" in the highlighted terms
    privacy_count = highlighted_terms.lower().count('privacy')

    # You can customize this report based on your specific criteria
    report = f"Terms and Conditions Report:\n\n"
    report += f"Occurrences of 'privacy': {privacy_count}\n"

    return report

# Example usage:
url_to_fetch = 'https://amazon.in'
webpage_content = fetch_webpage(url_to_fetch)

if webpage_content:
    highlighted_terms = extract_and_highlight_terms(webpage_content)
    print(highlighted_terms)

    report = generate_report(highlighted_terms)
    print(report)


[TERMS AND CONDITIONS:
Skip to main content
.in
Delivering to Mumbai 400001
Update location
All
Select the department you want to search in
All Categories
Alexa Skills
Amazon Devices
Amazon Fashion
Amazon Fresh
Amazon Pharmacy
Appliances
Apps & Games
Audible Audiobooks
Baby
Beauty
Books
Car & Motorbike
Clothing & Accessories
Collectibles
Computers & Accessories
Electronics
Furniture
Garden & Outdoors
Gift Cards
Grocery & Gourmet Foods
Health & Personal Care
Home & Kitchen
Industrial & Scientific
Jewellery
Kindle Store
Luggage & Bags
Luxury Beauty
Movies & TV Shows
Music
Musical Instruments
Office Products
Pet Supplies
Prime Video
Shoes & Handbags
Software
Sports, Fitness & Outdoors
Subscribe & Save
Tools & Home Improvement
Toys & Games
Under ₹500
Video Games
Watches
Search Amazon.in
EN
Hello, sign in
Account & Lists
Returns
& Orders
Cart
All
Fresh
Amazon miniTV
Sell
Best Sellers
Today's Deals
Mobiles
Electronics
Prime
Gift Ideas
Customer Service
New Releases
Home & Kitchen
Fashion
Amaz

In [None]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

def fetch_webpage_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch the web page. Error: {e}")
        return None

def extract_text_from_html(html_content):
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    return None

def predict_terms_and_conditions(url):
    webpage_content = fetch_webpage_content(url)

    if webpage_content:
        text_content = extract_text_from_html(webpage_content)

        if text_content:
            model = make_pipeline(CountVectorizer(), MultinomialNB())
            model.fit(X_train, y_train)  # Assume you have a pre-trained model and data

            prediction = model.predict([text_content])

            if prediction[0] == 1:
                return "Terms and conditions found on the webpage."
            else:
                return "No terms and conditions found on the webpage."
        else:
            return "Failed to extract text content from the webpage."
    else:
        return "Failed to fetch the webpage content."

# Example usage:
url_to_check = 'https://www.amazon.in'
result = predict_terms_and_conditions(url_to_check)
print(result)


Failed to fetch the web page. Error: 503 Server Error: Service Unavailable for url: https://www.amazon.in/
Failed to fetch the webpage content.


In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Function to calculate transparency score for a given URL
def calculate_transparency_score(url):
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for bad responses (e.g., 404, 500)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Feature 1: Presence of a privacy policy
        privacy_policy = 1 if 'privacy' in soup.text.lower() and 'policy' in soup.text.lower() else 0

        # Feature 2: Easy-to-find contact information
        contact_info = 1 if 'contact' in soup.text.lower() else 0

        # Feature 3: Information about data collection and usage
        data_collection_info = 1 if 'data' in soup.text.lower() or 'usage' in soup.text.lower() else 0

        # Feature 4: Use of secure connections (HTTPS)
        uses_https = 1 if urlparse(url).scheme == 'https' else 0

        # Feature 5: Presence of "About Us" section
        about_us_section = 1 if soup.find('a', string='About Us') else 0

        # Feature 6: Presence of "Terms and Conditions" or "Terms of Service"
        terms_and_conditions = 1 if soup.find('a', string='Terms and Conditions') or soup.find('a', string='Terms of Service') else 0

        # Feature 7: Presence of "Cookies" information
        cookies_info = 1 if 'cookie' in soup.text.lower() else 0

        # Feature 8: Presence of "Security" or "Security Practices" section
        security_section = 1 if soup.find('a', string='Security') or soup.find('a', string='Security Practices') else 0

        # Feature 9: Presence of "Data Protection" or "Data Security" section
        data_protection_section = 1 if soup.find('a', string='Data Protection') or soup.find('a', string='Data Security') else 0

        # Feature 10: Presence of "FAQ" section
        faq_section = 1 if soup.find('a', string='FAQ') else 0

        # Feature 11: Presence of GDPR-related information
        gdpr_info = 1 if 'gdpr' in soup.text.lower() or 'general data protection regulation' in soup.text.lower() else 0

        # Feature 12: Presence of CCPA-related information
        ccpa_info = 1 if 'ccpa' in soup.text.lower() or 'california consumer privacy act' in soup.text.lower() else 0

        # Feature 13: Presence of software version in meta tags or headers
        software_version = 1 if soup.find('meta', {'name': 'generator', 'content': True}) or response.headers.get('Server') else 0

        # Feature 14: Presence of technology stack information in footer
        tech_stack_info = 1 if soup.find('footer', string='Powered by') else 0

        # Feature 15: Presence of data access policies
        data_access_policies = 1 if 'access' in soup.text.lower() and 'policy' in soup.text.lower() else 0

        # Feature 16: Presence of data sharing practices
        data_sharing_practices = 1 if 'data sharing' in soup.text.lower() else 0

        # Feature 17: Presence of data retention policies
        data_retention_policies = 1 if 'data retention' in soup.text.lower() else 0

        # Feature 18: Presence of data security measures
        data_security_measures = 1 if 'data security' in soup.text.lower() else 0

        # Feature 19: Presence of third-party services information
        third_party_services_info = 1 if soup.find('section', {'class': 'third-party-services'}) else 0

        # Feature 20: Presence of user tracking information
        user_tracking_info = 1 if 'user tracking' in soup.text.lower() else 0

        # Feature 21: Presence of user education resources
        user_education_resources = 1 if 'user education' in soup.text.lower() else 0

        # Feature 22: Presence of data breach response information
        data_breach_response_info = 1 if 'data breach response' in soup.text.lower() else 0

        # Assigning a transparency score (you can define a more sophisticated scoring logic)
        transparency_score = (
            privacy_policy + contact_info + data_collection_info + uses_https +
            about_us_section + terms_and_conditions + cookies_info +
            security_section + data_protection_section + faq_section +
            gdpr_info + ccpa_info +
            software_version + tech_stack_info +
            data_access_policies + data_sharing_practices +
            data_retention_policies + data_security_measures +
            third_party_services_info + user_tracking_info +
            user_education_resources + data_breach_response_info
        )

        return transparency_score

    except requests.exceptions.RequestException as e:
        print(f"Error calculating transparency score for {url}: {e}")
        return None

# Test URL
test_url = "https://www.amazon.in"

# Calculate transparency score for the test URL
transparency_score = calculate_transparency_score(test_url)

# Display the transparency score
if transparency_score is not None:
    print(f"Transparency Score for {test_url}: {transparency_score}")
else:
    print("Error calculating transparency score.")


Transparency Score for https://www.amazon.in: 3


In [15]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Function to retrieve website content
def get_website_content(url):
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for bad responses (e.g., 404, 500)
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving content for {url}: {e}")
        return None

# Function to extract text from HTML using BeautifulSoup
def extract_text_from_html(html_content):
    if html_content is not None:
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup.get_text()
    return None

# Function to check if certain keywords are present in the webpage
def check_keywords_presence(text, keywords):
    return any(keyword in text.lower() for keyword in keywords)

# Function to calculate transparency score for a given URL
def calculate_transparency_score(url):
    html_content = get_website_content(url)
    if html_content is not None:
        text_content = extract_text_from_html(html_content)
        soup = BeautifulSoup(html_content, 'html.parser')  # Create a new BeautifulSoup object

        # Feature 1: Presence of a privacy policy
        privacy_policy = 1 if check_keywords_presence(text_content, ['privacy', 'policy']) else 0

        # Feature 2: Easy-to-find contact information
        contact_info = 1 if check_keywords_presence(text_content, ['contact']) else 0

        # Feature 3: Information about data collection and usage
        data_collection_info = 1 if check_keywords_presence(text_content, ['data']) or check_keywords_presence(text_content, ['usage']) else 0

        # Feature 4: Use of secure connections (HTTPS)
        uses_https = 1 if urlparse(url).scheme == 'https' else 0

        # Feature 5: Presence of "About Us" section
        about_us_section = 1 if check_keywords_presence(text_content, ['about us']) else 0
        # Feature 6: Presence of "Terms and Conditions" or "Terms of Service"
        terms_and_conditions = 1 if soup.find('a', string='Terms and Conditions') or soup.find('a', string='Terms of Service') else 0

        # Feature 7: Presence of "Cookies" information
        cookies_info = 1 if 'cookie' in soup.text.lower() else 0

        # Feature 8: Presence of "Security" or "Security Practices" section
        security_section = 1 if soup.find('a', string='Security') or soup.find('a', string='Security Practices') else 0

        # Feature 9: Presence of "Data Protection" or "Data Security" section
        data_protection_section = 1 if soup.find('a', string='Data Protection') or soup.find('a', string='Data Security') else 0

        # Feature 10: Presence of "FAQ" section
        faq_section = 1 if soup.find('a', string='FAQ') else 0

        # Feature 11: Presence of GDPR-related information
        gdpr_info = 1 if 'gdpr' in soup.text.lower() or 'general data protection regulation' in soup.text.lower() else 0

        # Feature 12: Presence of CCPA-related information
        ccpa_info = 1 if 'ccpa' in soup.text.lower() or 'california consumer privacy act' in soup.text.lower() else 0

        # Feature 13: Presence of software version in meta tags or headers
        software_version = 1 if soup.find('meta', {'name': 'generator', 'content': True}) or \
                                 soup.find('meta', {'http-equiv': 'Server', 'content': True}) else 0

        # Feature 14: Presence of technology stack information in footer
        tech_stack_info = 1 if soup.find('footer', string='Powered by') else 0

        # Feature 15: Presence of data access policies
        data_access_policies = 1 if 'access' in soup.text.lower() and 'policy' in soup.text.lower() else 0

        # Feature 16: Presence of data sharing practices
        data_sharing_practices = 1 if 'data sharing' in soup.text.lower() else 0

        # Feature 17: Presence of data retention policies
        data_retention_policies = 1 if 'data retention' in soup.text.lower() else 0

        # Feature 18: Presence of data security measures
        data_security_measures = 1 if 'data security' in soup.text.lower() else 0

        # Feature 19: Presence of third-party services information
        third_party_services_info = 1 if soup.find('section', {'class': 'third-party-services'}) else 0

        # Feature 20: Presence of user tracking information
        user_tracking_info = 1 if 'user tracking' in soup.text.lower() else 0

        # Feature 21: Presence of user education resources
        user_education_resources = 1 if 'user education' in soup.text.lower() else 0

        # Feature 22: Presence of data breach response information
        data_breach_response_info = 1 if 'data breach response' in soup.text.lower() else 0
        # ... (other features)

        # Adjusting the transparency score calculation
        transparency_score = privacy_policy + contact_info + data_collection_info + uses_https + about_us_section + other_features

        return transparency_score

    else:
        print(f"Error calculating transparency score for {url}: Unable to retrieve website content.")
        return None

# Test URL
test_url = "https://www.amazon.in"

# Calculate transparency score for the test URL
transparency_score = calculate_transparency_score(test_url)

# Display the transparency score
if transparency_score is not None:
    print(f"Transparency Score for {test_url}: {transparency_score}")
else:
    print("Error calculating transparency score.")


Error retrieving content for https://www.amazon.in: 503 Server Error: Service Unavailable for url: https://www.amazon.in/
Error calculating transparency score for https://www.amazon.in: Unable to retrieve website content.
Error calculating transparency score.
