<a href="https://colab.research.google.com/github/Tanzaniav0825/CS667/blob/main/Projet_1_deliverable_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install tldextract
import tldextract
from urllib.parse import urlparse
import re
import random

# Simulated ML Scorer (placeholder)
def simulate_ml_score():
    """Return a stand-in for a model-produced credibility score.

    No model is consulted: the value is drawn uniformly at random from the
    range 0.6 to 0.95 and rounded to two decimal places.
    """
    raw_score = random.uniform(0.6, 0.95)
    return round(raw_score, 2)

# Convert numerical score to emoji star rating
def star_rating(score):
    """Map a numeric score to a string of one to five star emoji.

    Each threshold is an inclusive lower bound: 0.9 and above gives five
    stars, 0.75 four, 0.6 three, 0.4 two; anything lower gives one star.
    """
    tiers = ((0.9, 5), (0.75, 4), (0.6, 3), (0.4, 2))
    for lower_bound, star_count in tiers:
        if score >= lower_bound:
            return "⭐" * star_count
    return "⭐"

# Rule-based credibility checker
def evaluate_credibility(url: str) -> dict:
    """Score the credibility of *url* with a few rule-based heuristics.

    Four signals are added together:

    1. Domain suffix: ``edu``/``gov`` add 0.3, ``org`` adds 0.2, anything
       else adds 0.1.
    2. Trusted repository: 0.4 if the URL's registered domain is one of a
       small built-in list of scholarly publishers.
    3. Year in path: 0.1 if the URL contains a ``/20xx/`` path segment.
    4. Simulated ML score: 20% of the value from ``simulate_ml_score()``.

    Args:
        url: The URL to evaluate.

    Returns:
        A dict with two keys: ``"score"``, the star-emoji string produced by
        ``star_rating()`` for the total (rounded to two decimals and capped
        at 1.0), and ``"explanation"``, the per-rule explanation sentences
        joined by single spaces.
    """
    explanation_parts = []
    score = 0

    # Extract the registered domain (e.g. "www.nature.com" -> "nature.com").
    # Lower-case it so rule matching does not depend on how the host was
    # typed, and join only the non-empty parts so a host without a public
    # suffix does not end up with a trailing dot.
    ext = tldextract.extract(url)
    suffix = ext.suffix.lower()
    domain = ".".join(part for part in (ext.domain.lower(), suffix) if part)

    # Rule 1: Domain suffix trust
    if suffix in ("edu", "gov"):
        score += 0.3
        explanation_parts.append(f"The domain suffix '.{suffix}' indicates a government or academic institution, generally considered highly credible.")
    elif suffix == "org":
        score += 0.2
        explanation_parts.append(f"'.org' domains often belong to non-profits or institutions, which adds moderate credibility.")
    else:
        score += 0.1
        explanation_parts.append(f"The domain uses '.{suffix}', a common commercial or unknown-level domain. Limited credibility added.")

    # Rule 2: Trusted source database
    # Compare against the registered domain rather than searching the whole
    # URL: a substring test would also "trust" URLs that merely mention a
    # trusted name in their path or query string, or embed it in another
    # host name (e.g. "https://example.com/?ref=nature.com").
    trusted_sources = {"nature.com", "sciencedirect.com", "springer.com", "jstor.org", "arxiv.org"}
    if domain in trusted_sources:
        score += 0.4
        explanation_parts.append(f"The URL belongs to a recognized academic or research repository ({domain}), which strongly supports credibility.")
    else:
        explanation_parts.append(f"{domain} is not recognized in the list of top scholarly repositories, slightly lowering confidence.")

    # Rule 3: Publication date recency
    # Only a four-digit 20xx value delimited by slashes on both sides counts.
    if re.search(r"/20\d{2}/", url):
        score += 0.1
        explanation_parts.append("The URL contains a recent year (post-2000), which suggests the source may be up-to-date.")
    else:
        explanation_parts.append("No recent year detected in the URL, which may indicate outdated content.")

    # Rule 4: Simulated ML score (contributes 20% of its value)
    ml_score = simulate_ml_score()
    score += ml_score * 0.2
    explanation_parts.append(f"A simulated ML model rated this source with {ml_score} credibility, boosting overall trust slightly.")

    # Normalize score: round to two decimals and cap at 1.0
    final_score = min(round(score, 2), 1.0)
    stars = star_rating(final_score)
    explanation = " ".join(explanation_parts)

    return {"score": stars, "explanation": explanation}


# Run interactively
if __name__ == "__main__":
    # Interactive entry point: prompt for one URL, evaluate it, and print
    # the star rating together with the rule-by-rule explanation.
    print("🔍 Credibility Checker")
    url_input = input("Enter a URL to evaluate: ").strip()

    # Require a real http(s) scheme and a host. A bare startswith("http")
    # check would also accept inputs such as "httpfoo" or "http:".
    try:
        parsed_input = urlparse(url_input)
        is_web_url = parsed_input.scheme in ("http", "https") and bool(parsed_input.netloc)
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unclosed "[").
        is_web_url = False

    if not is_web_url:
        print("⚠️ Please include 'http://' or 'https://' in the URL.")
    else:
        result = evaluate_credibility(url_input)
        print("\n✅ Evaluation Result")
        print(f"⭐ Score: {result['score']}")
        print(f"📘 Explanation: {result['explanation']}")

🔍 Credibility Checker
Enter a URL to evaluate:  https://www.nature.com/articles/s41586-020-03119-7

✅ Evaluation Result
⭐ Score: ⭐⭐⭐
📘 Explanation: The domain uses '.com', a common commercial or unknown-level domain. Limited credibility added. The URL belongs to a recognized academic or research repository (nature.com), which strongly supports credibility. No recent year detected in the URL, which may indicate outdated content. A simulated ML model rated this source with 0.67 credibility, boosting overall trust slightly.
