<a href="https://colab.research.google.com/github/Tanzaniav0825/CS667/blob/main/Project_1_deliverables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
pip install requests beautifulsoup4 sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [7]:
import requests
import re
from sentence_transformers import SentenceTransformer, util
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# 1. Trustworthiness: Domain Authority Scoring
def is_trustworthy_url(url, trusted_tlds=(".edu", ".gov", ".org", ".com")):
    """
    Checks basic trustworthiness of a URL using heuristics.

    A URL is considered trustworthy when it is served over HTTPS and its host
    name ends with one of ``trusted_tlds``.

    Args:
        url: The URL to inspect.
        trusted_tlds: Host-name suffixes (including the leading dot) that are
            accepted as reputable. Defaults to the suffixes this notebook has
            always used.

    Returns:
        A ``(is_trusted, explanation)`` tuple where ``is_trusted`` is a bool
        and ``explanation`` is a human-readable sentence.
    """
    untrusted = (False, "The domain is not from a trusted source, or it doesn't use HTTPS.")

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unterminated IPv6
        # literal); a URL that cannot be parsed cannot be trusted.
        return untrusted

    # Use `hostname` rather than `netloc`: it is lower-cased and carries no
    # credentials or port, so "example.gov:443" is still recognised as ".gov".
    # A trailing dot (fully-qualified form) is dropped before the suffix test.
    host = (parsed.hostname or "").rstrip(".")
    suffixes = tuple(tld.lower() for tld in trusted_tlds)

    # urlparse lower-cases the scheme, so "HTTPS://" is accepted as well.
    if parsed.scheme == "https" and host.endswith(suffixes):
        return True, "The domain is reputable (e.g., '.edu', '.gov', or similar) and uses HTTPS."
    return untrusted

# 2. Relevance: Semantic Similarity
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse an already-loaded model instead of constructing a new one each time.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


def get_content_similarity(query, url):
    """
    Computes semantic similarity between a query and webpage content.

    The page at ``url`` is downloaded, the text of its ``<p>`` elements is
    joined, and the cosine similarity between the query embedding and the
    page-text embedding is returned.

    Args:
        query: The user's question or search text.
        url: The page whose paragraph text is compared against the query.

    Returns:
        A ``(score, explanation)`` tuple. ``score`` is the cosine similarity,
        or ``0.0`` when the page could not be fetched, the server answered
        with an HTTP error status, or no paragraph text was found.
    """
    # Loaded outside the try block (as before) so a model-loading failure is
    # still raised to the caller rather than reported as a page error.
    model = _get_embedding_model()
    try:
        response = requests.get(url, timeout=10)
        # Do not score the body of a 4xx/5xx error page as if it were content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        webpage_text = ' '.join(p.text for p in soup.find_all('p')).strip()

        if not webpage_text:
            # Nothing to embed; a similarity against empty text is meaningless.
            return 0.0, "No paragraph text could be extracted from the webpage, so relevance could not be assessed."

        # Embeddings
        query_embedding = model.encode(query, convert_to_tensor=True)
        content_embedding = model.encode(webpage_text, convert_to_tensor=True)

        # Cosine similarity, converted once to a plain Python number
        similarity = util.cos_sim(query_embedding, content_embedding).item()
    except Exception as e:
        return 0.0, f"Error fetching or processing URL content: {e}"

    if similarity > 0.5:
        return similarity, "The webpage content matches the semantic meaning of the query well."
    return similarity, "The webpage content only partially matches the query."

# 3. Credibility: Keyword Match
def contains_credible_statements(url, query_keywords):
    """
    Checks for the presence of key query terms in the content of the URL.

    The page at ``url`` is downloaded and the lower-cased text of its ``<p>``
    elements is searched for each keyword. Keywords are matched as whole
    words (case-insensitively), so a short keyword such as "I" or "on" does
    not count as found merely because it occurs inside a longer word.

    Args:
        url: The page to scan.
        query_keywords: Iterable of keyword strings taken from the query.

    Returns:
        A ``(score, explanation)`` tuple where ``score`` is the fraction of
        keywords found in the page (``0.0`` when there are no keywords, the
        page cannot be fetched, or the server answers with an HTTP error).
    """
    keywords = [str(keyword).lower() for keyword in query_keywords]
    if not keywords:
        # Avoid a division by zero being reported as a misleading fetch error.
        return 0.0, "No keywords were provided, so credibility could not be assessed."

    try:
        response = requests.get(url, timeout=10)
        # Do not scan the body of a 4xx/5xx error page as if it were content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        webpage_text = ' '.join(p.text for p in soup.find_all('p')).lower()
    except Exception as e:
        return 0.0, f"Error fetching or processing URL content: {e}"

    def appears_as_word(keyword):
        # The look-arounds require a non-word character (or the text edge) on
        # both sides, which also works for multi-word keywords.
        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
        return re.search(pattern, webpage_text) is not None

    matches = [keyword for keyword in keywords if appears_as_word(keyword)]
    credibility_score = len(matches) / len(keywords)
    explanation = f"{len(matches)} out of {len(keywords)} keywords from the query were found in the content."
    return credibility_score, explanation

# 4. Combined Scoring with Explanations
def evaluate_url_with_explanation(url, query, weights=(0.3, 0.4, 0.3)):
    """
    Combines trustworthiness, relevance, and credibility into a final score with explanations.

    Args:
        url: The page to evaluate.
        query: The user's question; its word tokens are used as keywords for
            the credibility check.
        weights: ``(trustworthiness, relevance, credibility)`` weights applied
            to the three component scores. The default reproduces the
            30% / 40% / 30% split. The same values drive both the final score
            and the percentages quoted in the explanation, so they cannot
            drift apart.

    Returns:
        A dict with each component's score and explanation, plus
        ``"Final Score"`` and ``"Final Explanation"``.

    Raises:
        ValueError: If ``weights`` does not contain exactly three values.
    """
    if len(weights) != 3:
        raise ValueError("weights must contain exactly three values: (trustworthiness, relevance, credibility)")
    trust_weight, relevance_weight, credibility_weight = weights

    # Parameters
    query_keywords = re.findall(r'\w+', query)  # Extract keywords from the query

    # Component scores, each returned together with its explanation
    trustworthiness_score, trustworthiness_explanation = is_trustworthy_url(url)
    relevance_score, relevance_explanation = get_content_similarity(query, url)
    credibility_score, credibility_explanation = contains_credible_statements(url, query_keywords)

    # Weighted contributions; trustworthiness is a bool and counts as 0 or 1.
    trust_part = trustworthiness_score * trust_weight
    relevance_part = relevance_score * relevance_weight
    credibility_part = credibility_score * credibility_weight
    final_score = trust_part + relevance_part + credibility_part

    # Return results with explanations
    return {
        "Trustworthiness Score": trustworthiness_score,
        "Trustworthiness Explanation": trustworthiness_explanation,
        "Relevance Score": relevance_score,
        "Relevance Explanation": relevance_explanation,
        "Credibility Score": credibility_score,
        "Credibility Explanation": credibility_explanation,
        "Final Score": final_score,
        "Final Explanation": f"The final score is a weighted combination of trustworthiness ({trust_weight:.0%}), "
                             f"relevance ({relevance_weight:.0%}), "
                             f"and credibility ({credibility_weight:.0%}). Trustworthiness contributed {trust_part}, "
                             f"relevance contributed {relevance_part}, and credibility contributed {credibility_part}."
    }

# Example Usage
if __name__ == "__main__":
    # Demo: score one article against a sample traveller question and print
    # every field of the resulting report as "<name>: <value>".
    query = "I have just been on an international flight, can I come back home to hold my 1 month old newborn?"
    url = "https://www.bhtp.com/blog/when-safe-to-travel-with-newborn/"
    results = evaluate_url_with_explanation(url, query)
    for label, detail in results.items():
        print(label, detail, sep=": ")


Trustworthiness Score: True
Trustworthiness Explanation: The domain is reputable (e.g., '.edu', '.gov', or similar) and uses HTTPS.
Relevance Score: 0.5616772174835205
Relevance Explanation: The webpage content matches the semantic meaning of the query well.
Credibility Score: 0.95
Credibility Explanation: 19 out of 20 keywords from the query were found in the content.
Final Score: 0.8096708869934082
Final Explanation: The final score is a weighted combination of trustworthiness (30%), relevance (40%), and credibility (30%). Trustworthiness contributed 0.3, relevance contributed 0.2246708869934082, and credibility contributed 0.285.
