In [1]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def fetch_seo_data(url):
    """
    Extracts SEO-related data from a given website URL.
    
    Parameters:
        url (str): The website URL to analyze.
    
    Returns:
        dict: A dictionary containing the title, meta description, H1 tags, canonical URL, and robots meta tag.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            return {"error": f"Failed to fetch page, status code: {response.status_code}"}

        soup = BeautifulSoup(response.text, 'html.parser')

        seo_data = {
            "Title": soup.title.string if soup.title else "No Title",
            "Meta Description": (
                soup.find("meta", attrs={"name": "description"})["content"]
                if soup.find("meta", attrs={"name": "description"})
                else "No Meta Description"
            ),
            "H1 Tags": [h1.text.strip() for h1 in soup.find_all("h1")],
            "Canonical URL": (
                soup.find("link", attrs={"rel": "canonical"})["href"]
                if soup.find("link", attrs={"rel": "canonical"})
                else "No Canonical URL"
            ),
            "Robots Meta": (
                soup.find("meta", attrs={"name": "robots"})["content"]
                if soup.find("meta", attrs={"name": "robots"})
                else "No Robots Tag"
            ),
        }

        return seo_data

    except Exception as e:
        return {"error": str(e)}

In [3]:
def fix_url(url):
    if not url.startswith(("http://", "https://")):
        url = "https://" + url  # Default to 'https://' for security
    return url

In [4]:
def detect_website_type(url, seo_data):
    """
    Determines whether a website is a personal portfolio, company/business site, or uncategorized.

    Parameters:
        url (str): The website URL.
        seo_data (dict): Extracted SEO data containing Title and Meta Description.

    Returns:
        str: The detected website type - 'Personal Portfolio/Project', 'Company/Business Website', or 'Uncategorized Website'.
    """

    personal_domains = ["github.io", "vercel.app", "netlify.app", ".me", ".dev"]
    company_keywords = ["products", "services", "pricing", "customers", "company",
                        "enterprise", "platform", "official", "cloud", "ai", "business"]
    portfolio_keywords = ["portfolio", "projects", "about me", "skills", "resume"]
    known_companies = ["kaggle.com", "flipkart.com", "amazon.com", "google.com"]

    # Convert SEO text to lowercase for case-insensitive comparison
    meta_desc = seo_data.get("Meta Description", "").lower()
    title = seo_data.get("Title", "").lower()

    # Check if it's a personal portfolio or project
    is_personal = any(domain in url for domain in personal_domains) or \
                  any(keyword in meta_desc for keyword in portfolio_keywords)

    # Check if it's a company/business website
    is_company = any(keyword in meta_desc for keyword in company_keywords) or \
                 any(keyword in title for keyword in company_keywords) or \
                 any(comp in url for comp in known_companies)

    if is_personal:
        return "Personal Portfolio/Project"
    elif is_company:
        return "Company/Business Website"
    else:
        return "Uncategorized Website"


In [5]:
def analyze_portfolio(seo_data):
    """
    Analyzes a portfolio website based on SEO data and assigns a portfolio relevance score.

    Parameters:
        seo_data (dict): Extracted SEO data containing Title, Meta Description, and H1 Tags.

    Returns:
        float: Portfolio score as a percentage (0-100).
    """
    portfolio_score = 0
    total_checks = 4  

    # Extract necessary data
    meta_desc = seo_data.get("Meta Description", "").lower()
    h1_tags = " ".join(seo_data.get("H1 Tags", [])).lower()  

    # Check for relevant keywords in Meta Description and H1 Tags
    keywords = ["skills", "projects", "contact"]
    
    if seo_data.get("H1 Tags"):
        portfolio_score += 1  # At least one H1 tag exists

    for keyword in keywords:
        if keyword in meta_desc or keyword in h1_tags:
            portfolio_score += 1

    # Convert score to percentage
    portfolio_score = (portfolio_score / total_checks) * 100  
    return round(portfolio_score, 2)


In [6]:
def calculate_seo_score(seo_data):
    """
    Calculates an SEO score for company websites based on key SEO elements.

    Parameters:
        seo_data (dict): Extracted SEO data containing Title, Meta Description, H1 Tags, Canonical URL, and Robots Meta.

    Returns:
        float: SEO score as a percentage (0-100).
    """
    score = 0
    total_checks = 5  

    # Extract necessary data
    title = seo_data.get("Title", "No Title")
    meta_desc = seo_data.get("Meta Description", "No Meta Description")
    h1_tags = seo_data.get("H1 Tags", [])
    canonical = seo_data.get("Canonical URL", "No Canonical URL")
    robots_meta = seo_data.get("Robots Meta", "")

    # Scoring criteria
    if title and title != "No Title":
        score += 1
    
    if meta_desc and meta_desc != "No Meta Description":
        score += 1
    
    if h1_tags:
        score += 1
    
    if canonical and canonical != "No Canonical URL":
        score += 1
    
    if robots_meta and "index" in robots_meta.lower():
        score += 1

    # Convert score to percentage
    seo_score = (score / total_checks) * 100  
    return round(seo_score, 2)


In [7]:
def analyze_website(url):
    """
    Detects the type of website (Personal Portfolio, Company/Business, or Uncategorized)
    and performs the appropriate analysis.

    Parameters:
        url (str): The website URL to be analyzed.

    Returns:
        dict: A dictionary containing the website type, portfolio score (if applicable),
              or SEO score (if applicable).
    """
    url = fix_url(url)  # Ensure URL format is correct
    # print(f"🔍 Checking URL: {url}")

    seo_data = fetch_seo_data(url)

    if "error" in seo_data:
        print("❌ Error fetching data:", seo_data["error"])
        return {"error": seo_data["error"]}

    website_type = detect_website_type(url, seo_data)
    print(f"🌍 Website Type Detected: {website_type}")

    result = {"Website Type": website_type}

    if website_type == "Personal Portfolio/Project":
        portfolio_score = analyze_portfolio(seo_data)
        result["Portfolio Score"] = portfolio_score
        # print(f"📝 Portfolio Score: {portfolio_score}%")

    elif website_type == "Company/Business Website":
        seo_score = calculate_seo_score(seo_data)
        result["SEO Score"] = seo_score
        # print(f"📊 SEO Score: {seo_score}%")

    else:
        print("⚠️ This website is Uncategorized. Cannot analyze.")

    return result


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [9]:
# ✅ ChromeDriver ka path
chrome_driver_path = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)

# ✅ WebDriver Options (Bot Detection Bypass)
options = webdriver.ChromeOptions()
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
)
options.add_argument("--disable-blink-features=AutomationControlled")

def analyze_ui(url):
    """
    Analyzes the UI of a given website by extracting the number of images, buttons, and links.

    Parameters:
        url (str): The website URL to be analyzed.

    Returns:
        dict: A dictionary containing page title, number of UI elements, and UI score.
    """
    url = fix_url(url)  # Ensure URL is formatted correctly
    # print(f"\n🚀 Testing UI: {url}")

    # ✅ Start WebDriver
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)

    # ✅ Wait for JavaScript elements to load
    time.sleep(5)

    # ✅ Extract UI Elements
    images = driver.find_elements(By.TAG_NAME, "img")
    buttons = driver.find_elements(By.TAG_NAME, "button")
    links = driver.find_elements(By.TAG_NAME, "a")

    # ✅ UI Score Calculation (Normalized)
    total_elements = len(images) + len(buttons) + len(links)
    max_elements = 100  # Normalizing factor
    ui_score = min(round((total_elements / max_elements) * 100, 2), 100)  # Capped at 100%

    # ✅ Prepare Output
    result = {
        "Page Title": driver.title,
        # "Images": len(images),
        # "Buttons": len(buttons),
        # "Links": len(links),
        "UI Score": ui_score,
    }

    # ✅ Print Results
    # print(f"🔍 Page Title: {result['Page Title']}")
    # print(f"🖼️ Images: {result['Images']}, 🔘 Buttons: {result['Buttons']}, 🔗 Links: {result['Links']}")
    # print(f"🎨 UI Score: {result['UI Score']}%")

    # ✅ Close WebDriver
    driver.quit()
    # print(f"✅ Finished Testing: {url}")

    return result

In [10]:
# ✅ ChromeDriver ka exact path specify karo
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"

def analyze_ux(url):
    """
    Analyzes the UX of a given website by extracting key interaction elements.

    Parameters:
        url (str): The website URL to be analyzed.

    Returns:
        dict: A dictionary containing the page title, number of UI elements, and UX score.
    """
    url = fix_url(url)  # Ensure correct URL format
    # print(f"\n🚀 Testing UX: {url}")

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in the background
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")

    # ✅ Start WebDriver with proper path
    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # ✅ Open the website
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # ✅ Extract Page Title
        page_title = driver.title
        # print(f"🔍 Page Title: {page_title}")

        # ✅ Count UX-related elements
        total_images = len(driver.find_elements(By.TAG_NAME, "img"))
        total_buttons = len(driver.find_elements(By.TAG_NAME, "button"))
        total_links = len(driver.find_elements(By.TAG_NAME, "a"))

        # print(f"🖼️ Images: {total_images}, 🔘 Buttons: {total_buttons}, 🔗 Links: {total_links}")

        # ✅ UX Score Calculation (Normalized)
        max_elements = 100  # Normalization factor
        ux_score = min(round(((total_images + total_buttons + total_links) / max_elements) * 100, 2), 100)

        # ✅ Prepare Output
        result = {
            # "Page Title": page_title,
            # "Images": total_images,
            # "Buttons": total_buttons,
            # "Links": total_links,
            "UX Score": ux_score,
        }

        # print(f"🎨 UX Score: {result['UX Score']}%")

    except Exception as e:
        print(f"❌ Error: {e}")
        result = {"error": str(e)}

    finally:
        driver.quit()
        # print(f"✅ Finished Testing UX: {url}\n")

    return result

In [11]:
!pip install textstat

Defaulting to user installation because normal site-packages is not writeable


In [12]:
import textstat
import re
from collections import Counter
from urllib.parse import urlparse
from collections import Counter

In [13]:
# ✅ Set up Selenium WebDriver
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"

def get_page_source(url):
    """
    Uses Selenium to get the full page source (including JavaScript-rendered content).

    Parameters:
        url (str): The website URL.

    Returns:
        str: The full HTML source code of the page.
    """
    url = fix_url(url)  # Ensure correct format

    options = Options()
    options.add_argument("--headless")  # Run without opening the browser
    options.add_argument("--disable-gpu")
    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    
    try:
        driver.get(url)
        time.sleep(3)  # Wait for dynamic content to load
        return driver.page_source  # Get full HTML
    except Exception as e:
        print(f"❌ Error fetching page source: {e}")
        return ""
    finally:
        driver.quit()

def extract_text(url):
    """
    Extracts visible text from a webpage using BeautifulSoup.

    Parameters:
        url (str): The website URL.

    Returns:
        str: The extracted text content.
    """
    html = get_page_source(url)
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Remove scripts, styles, and non-visible elements
    for tag in soup(["script", "style", "meta", "noscript"]):
        tag.decompose()

    return soup.get_text(separator=" ", strip=True)

def analyze_content_quality(url):
    """
    Analyzes a webpage's content based on readability, keyword density, text ratio, and heading structure.

    Parameters:
        url (str): The website URL.

    Returns:
        dict: A dictionary containing various content analysis metrics.
    """
    # print(f"\n🚀 Analyzing Content Quality: {url}")
    text = extract_text(url)

    if not text:
        print("❌ No text found on the page!")
        return {"error": "No text content found"}

    # 📖 Readability Score
    readability = textstat.flesch_reading_ease(text)

    # 📝 Word Count
    words = re.findall(r'\b\w+\b', text.lower())  # Extract words
    word_count = len(words)

    # 🔑 Keyword Density (Top 5)
    keyword_freq = Counter(words).most_common(5)

    # 📊 Text-to-HTML Ratio
    html_content = get_page_source(url)
    html_length = len(html_content)
    text_length = len(text)
    text_to_html_ratio = (text_length / html_length) * 100 if html_length > 0 else 0

    # 🔠 Heading Structure
    soup = BeautifulSoup(html_content, "html.parser")
    headings = {
        "h1": len(soup.find_all("h1")),
        "h2": len(soup.find_all("h2")),
        "h3": len(soup.find_all("h3")),
    }

    # ✅ Prepare Output
    result = {
        "Readability Score": round(readability, 2),
        "Word Count": word_count,
        "Keyword Density (Top 5)": keyword_freq,
        "Text-to-HTML Ratio (%)": round(text_to_html_ratio, 2),
        "Heading Structure": headings
    }

    # ✅ Print Results
    for key, value in result.items():
        print(f"{key}: {value}")

    return result

In [14]:
import cv2
import numpy as np
from colorthief import ColorThief
import os
from PIL import Image
from io import BytesIO
from collections import Counter
import webcolors
from urllib.parse import urljoin
import colorsys

In [15]:
# ✅ Path to ChromeDriver (Modify if needed)
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"

def extract_hex_colors(url, top_n=10):
    """
    Extracts top N most used hex color values from inline and external CSS using Selenium.
    
    Parameters:
        url (str): Website URL to analyze.
        top_n (int): Number of top colors to return.

    Returns:
        list: Top extracted hex colors.
    """
    try:
        # ✅ Setup Selenium WebDriver
        options = Options()
        options.add_argument("--headless")  # Run in background
        options.add_argument("--disable-gpu")  
        options.add_argument("--no-sandbox")  
        service = Service(CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)
        
        # ✅ Open the webpage
        driver.get(url)
        time.sleep(3)  # Wait for full render
        
        # ✅ Extract HTML source with rendered CSS
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        driver.quit()  # Close the browser

        css_content = ""

        # ✅ Extract inline CSS
        css_content += " ".join(style.get_text() for style in soup.find_all("style"))

        # ✅ Extract external CSS
        for link in soup.find_all("link", rel="stylesheet"):
            href = link.get("href")
            if href and href.startswith(("http", "/", "./")):
                full_url = urljoin(url, href)
                try:
                    css_response = requests.get(full_url, timeout=5)
                    if css_response.status_code == 200:
                        css_content += css_response.text
                except requests.RequestException:
                    pass  # Ignore inaccessible stylesheets

        # ✅ Regex to find hex colors
        color_pattern = r'#[0-9a-fA-F]{6}|#[0-9a-fA-F]{3}'
        colors = re.findall(color_pattern, css_content)

        # ✅ Normalize short hex codes (e.g., #abc → #aabbcc)
        def normalize_hex(hex_color):
            return (
                f"#{hex_color[1]*2}{hex_color[2]*2}{hex_color[3]*2}"
                if len(hex_color) == 4
                else hex_color
            )

        colors = [normalize_hex(color) for color in colors]

        # ✅ Count occurrences & return top N colors
        color_counts = Counter(colors)
        top_colors = [color for color, _ in color_counts.most_common(top_n)]

        print(f"\n🔹 Extracted Hex Colors (Top {top_n} Used):", top_colors)
        return top_colors if top_colors else ["No Colors Found"]

    except Exception as e:
        print(f"❌ Error extracting colors: {e}")
        return []


# ✅ Function to calculate RGB distance
def rgb_distance(color1, color2):
    """
    Calculates the Euclidean distance between two RGB colors.

    Parameters:
        color1 (str): First hex color (e.g., "#FF5733").
        color2 (str): Second hex color (e.g., "#A2B3C4").

    Returns:
        float: Distance between colors (lower = more similar).
    """
    try:
        r1, g1, b1 = [int(color1[i:i+2], 16) for i in (1, 3, 5)]
        r2, g2, b2 = [int(color2[i:i+2], 16) for i in (1, 3, 5)]
        return ((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2) ** 0.5
    except ValueError:
        return float("inf")  # Return a large distance for invalid hex values

# ✅ Function to rate UI/UX match based on color similarity
def rate_color_match(extracted_colors, reference_colors):
    """
    Rates how well extracted colors match a reference color palette.

    Parameters:
        extracted_colors (list): List of extracted hex colors.
        reference_colors (list): List of predefined reference colors.

    Returns:
        float: UI/UX match rating (0-10).
    """
    if not extracted_colors or extracted_colors == ["No Colors Found"]:
        print("❌ No colors found, cannot rate UI/UX match.")
        return 0.0  # No colors found = 0 match

    scores = []
    for color in extracted_colors:
        min_distance = min(rgb_distance(color, ref_color) for ref_color in reference_colors)
        score = max(0, 10 - min_distance / 50)  # Lower distance = higher score
        scores.append(score)

    rating = round(sum(scores) / len(scores), 2) if scores else 0
    return rating

# ✅ Define a reference color palette (Adjust based on industry standards)
reference_colors = ["#FF5733", "#A2B3C4", "#3E8E41", "#F1C40F", "#0A53BE"]

In [16]:
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"

def check_mobile_friendliness(url):
    """Checks mobile friendliness of a website using Selenium."""
    try:
        # ✅ Setup WebDriver with Headless Mode
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")  
        options.add_argument("--no-sandbox")  
        service = Service(CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)

        # ✅ Open the URL
        driver.get(url)
        time.sleep(3)  # Wait for page to load

        # print("\n📱 Mobile-Friendliness Debug Info:")

        # ✅ 1. Check for Viewport Meta Tag
        viewport_tag = driver.find_elements(By.NAME, "viewport")
        has_viewport = bool(viewport_tag)
        # print(f"✔️ Viewport Meta Tag: {has_viewport}")

        # ✅ 2. Check for CSS Media Queries
        has_media_queries = driver.execute_script("return window.matchMedia !== undefined")
        # print(f"✔️ CSS Media Queries Found: {has_media_queries}")

        # ✅ 3. Check if Body Width is Adaptable
        body_width = driver.execute_script("return document.body.clientWidth")
        screen_width = driver.execute_script("return window.innerWidth")
        is_responsive = body_width <= screen_width
        # print(f"✔️ Responsive Layout Detected: {is_responsive} (Body: {body_width}px, Screen: {screen_width}px)")

        # ✅ 4. Check for Horizontal Scrolling
        horizontal_scroll = driver.execute_script("return document.body.scrollWidth > document.body.clientWidth")
        # print(f"✔️ Horizontal Scroll Exists: {horizontal_scroll}")

        # ✅ 5. Check for Touch-Friendly Buttons (Minimum 48px)
        touch_elements = driver.execute_script("""
            return [...document.querySelectorAll('button, a, input, select, textarea')]
                   .filter(el => el.getBoundingClientRect().height >= 48);
        """)
        has_touch_friendly_buttons = len(touch_elements) > 0
        # print(f"✔️ Touch-Friendly Elements Found: {has_touch_friendly_buttons}")

        # ✅ Calculate Mobile-Friendliness Score (Out of 10)
        mobile_friendly_score = 0
        if has_viewport: mobile_friendly_score += 3
        if has_media_queries: mobile_friendly_score += 2
        if is_responsive: mobile_friendly_score += 2
        if not horizontal_scroll: mobile_friendly_score += 2
        if has_touch_friendly_buttons: mobile_friendly_score += 1

        # ✅ Ensure score is between 0 and 10
        mobile_friendly_score = min(max(mobile_friendly_score, 0), 10)

        # ✅ Close WebDriver
        driver.quit()

        # print(f"\n📱 Final Mobile Friendliness Score: {mobile_friendly_score}/10")
        return mobile_friendly_score

    except Exception as e:
        print(f"❌ Error checking mobile friendliness: {e}")
        return None

    # print(f"📱 Mobile Friendliness Score: {result}/10")

In [17]:
def check_page_load_speed(url):
    """Check the page load speed using Selenium's Performance Timing API."""
    try:
        # ✅ Setup WebDriver with Optimized Settings
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")  
        options.add_argument("--no-sandbox")  
        options.add_argument("--disable-dev-shm-usage")  # Helps in limited resource environments

        service = Service(CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)

        # ✅ Start Timer & Open Website
        start_time = time.time()
        driver.get(url)
        time.sleep(3)  # Allow time for page to load

        # ✅ Extract Performance Timing Data
        timing = driver.execute_script("return window.performance.timing")
        navigation_start = timing.get("navigationStart", 0)
        load_event_end = timing.get("loadEventEnd", 0)

        # ✅ Close Browser Session
        driver.quit()

        # ✅ Calculate Load Time (in Seconds)
        if navigation_start == 0 or load_event_end == 0:
            raise ValueError("Performance Timing API data is incomplete.")
        
        page_load_time = (load_event_end - navigation_start) / 1000  # Convert to seconds

        # ✅ Score Calculation Based on Load Time
        if page_load_time < 2:
            score = 10
        elif page_load_time < 5:
            score = 7 + (2 - (page_load_time / 2))  # Smooth scaling
        elif page_load_time < 10:
            score = 4 + (5 - (page_load_time / 5))
        else:
            score = max(2, 10 - (page_load_time / 3))  # Ensure minimum score

        # ✅ Debugging Output
        # print("\n🚀 Page Load Speed Analysis:")

        return {
         "page_load_score": round(score, 2)  # ✅ Removed incorrect syntax
        }

    except Exception as e:
        print(f"❌ Error: {e}")
        return {"error": str(e)}

In [18]:
# ✅ Path to ChromeDriver (Update as per your system)
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe"

# ✅ User-Agent to Mimic Real Browsing
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def check_broken_links(url):
    """Checks broken links on a website using Selenium & Requests."""
    try:
        # ✅ Setup WebDriver in Headless Mode
        options = Options()
        options.add_argument("--headless")
        options.add_argument(f"user-agent={HEADERS['User-Agent']}")
        service = Service(CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=options)

        # ✅ Open Website
        driver.get(url)
        
        # ✅ Handle Access Denial
        if "403" in driver.page_source or "Access Denied" in driver.page_source:
            return {"error": "This website blocks automated access. Skipping..."}

        # ✅ Extract All Links
        links = set(a.get_attribute("href") for a in driver.find_elements(By.TAG_NAME, "a") if a.get_attribute("href"))

        driver.quit()  # Close browser

        if not links:
            return {"error": "No links found on the webpage."}

        # ✅ Check Broken Links
        broken_links = 0
        total_links = len(links)

        for link in links:
            try:
                response = requests.head(link, headers=HEADERS, allow_redirects=True, timeout=5)
                if response.status_code >= 400:
                    broken_links += 1
            except requests.RequestException:
                broken_links += 1  # Count timeouts/errors as broken links

        # ✅ Calculate Broken Link Score
        broken_percentage = (broken_links / total_links) * 100 if total_links else 0
        score = max(2, 10 - (broken_percentage / 5))  # Ensure a minimum score of 2
        
        return {
            "broken_link_score": round(score, 2)
        }

    except Exception as e:
        print(f"❌ Error: {e}")
        return {"error": str(e)}

In [19]:
import ssl
import socket
from urllib.parse import urljoin

In [20]:
import pandas as pd
import concurrent.futures
import time
from concurrent.futures import TimeoutError

class WebsiteAnalyzer:
    def __init__(self, urls=None, file_path=None):
        if urls:
            if isinstance(urls, str):
                self.urls = [urls]  # Convert single URL to list
            else:
                self.urls = urls  # List of URLs
        elif file_path:
            # Load URLs from the CSV file
            self.urls = pd.read_csv(file_path)['Website URL'].tolist()
        else:
            self.urls = []

    def analyze_single_url(self, url):
        """Analyze a single URL and return data as a dictionary."""
        start_time = time.time()  # Track time for each URL
        try:
            seo_data = fetch_seo_data(url)
            if "error" in seo_data:
                return None  # Skip URLs with errors
            
            website_type = detect_website_type(url, seo_data)
            seo_score = calculate_seo_score(seo_data) if website_type == "Company/Business Website" else None
            portfolio_score = analyze_portfolio(seo_data) if website_type == "Personal Portfolio/Project" else None
            ui_analysis = analyze_ui(url)
            ux_analysis = analyze_ux(url)
            hex_colors = extract_hex_colors(url, top_n=10)
            mobile_friendliness = check_mobile_friendliness(url)
            page_speed = check_page_load_speed(url)
            broken_links = check_broken_links(url)

            # Store relevant data for DataFrame
            data = {
                "Website URL": url,
                "Website Type": website_type,
                "SEO Score (%)": seo_score if seo_score is not None else 0,
                "Portfolio Score (%)": portfolio_score if portfolio_score is not None else 0,
                "UI Score (10)": ui_analysis.get("UI Score", 5),
                "UX Score (10)": ux_analysis.get("UX Score", 5),
                "Extracted Colors": ', '.join(hex_colors) if isinstance(hex_colors, list) else "N/A",
                "Mobile Friendliness Score (10)": mobile_friendliness if isinstance(mobile_friendliness, int) else mobile_friendliness.get("Mobile Friendliness Score", 5),
                "Page Load Speed Score (10)": page_speed if isinstance(page_speed, int) else page_speed.get("page_load_score", 5),
                "Broken Links Score (10)": broken_links if isinstance(broken_links, int) else broken_links.get("broken_link_score", 5)
            }

            # ✅ Calculate Overall Website Score
            score_columns = ["UI Score (10)", "UX Score (10)", "Mobile Friendliness Score (10)", "Page Load Speed Score (10)", "Broken Links Score (10)"]
            valid_scores = [data[col] for col in score_columns if data[col] is not None]
            data["Overall Score (100%)"] = round((sum(valid_scores) / len(valid_scores)) * 10, 2) if valid_scores else None

            return data

        except TimeoutError:
            print(f"❌ Timeout for {url}. Skipping this website.")
            return None
        except Exception as e:
            print(f"❌ Error for {url}: {str(e)}")
            return None
        finally:
            # Check if the time taken exceeds 4 minutes (240 seconds)
            elapsed_time = time.time() - start_time
            if elapsed_time > 240:  # 4 minutes timeout
                print(f"❌ {url} took too long to process ({elapsed_time:.2f} seconds). Skipping this website.")
                return None

    def analyze_all(self):
        """Analyze all URLs concurrently and return a DataFrame."""
        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Use map to execute the analysis concurrently with a timeout of 240 seconds
            results = list(executor.map(self.analyze_single_url, self.urls))
        
        # Filter out None values (websites that failed or were skipped)
        results = [result for result in results if result]

        # Convert to DataFrame
        df = pd.DataFrame(results)

        # ✅ Save results to CSV
        df.to_csv("website_analysis_results.csv", index=False)
        return df




In [21]:
# 🎯 Usage Example:
urls = ["https://anupkumarrsing.github.io/MyPortfolio_Anup/"]
analyzer = WebsiteAnalyzer(urls)
df = analyzer.analyze_all()
df.head()


🔹 Extracted Hex Colors (Top 10 Used): ['#ffffff', '#000000', '#0d6efd', '#6c757d', '#212529', '#f8f9fa', '#dc3545', '#198754', '#ffc107', '#0dcaf0']


Unnamed: 0,Website URL,Website Type,SEO Score (%),Portfolio Score (%),UI Score (10),UX Score (10),Extracted Colors,Mobile Friendliness Score (10),Page Load Speed Score (10),Broken Links Score (10),Overall Score (100%)
0,https://anupkumarrsing.github.io/MyPortfolio_A...,Personal Portfolio/Project,0,25.0,40.0,40.0,"#ffffff, #000000, #0d6efd, #6c757d, #212529, #...",10,10,5.71,211.42


In [None]:
# Example of how to use it:
df = pd.read_csv("website_classification.csv")  # Update with your file path
df = df.head(500)  # Get the first 500 rows
urls = df['website_url'].tolist()  # Extract URLs

analyzer = WebsiteAnalyzer(urls=urls)  # Provide the list of URLs
df_result = analyzer.analyze_all()  # Analyze all URLs from the CSV file
df_result.head()  # Display the results


❌ Error for https://www.momondo.in/?ispredir=true: Message: Service C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe unexpectedly exited. Status code was: 1

❌ Error for https://www.kayak.co.in/?ispredir=true: Message: invalid session id
Stacktrace:
	GetHandleVerifier [0x00007FF6E259C6A5+28789]
	(No symbol) [0x00007FF6E2505B20]
	(No symbol) [0x00007FF6E2398DCC]
	(No symbol) [0x00007FF6E23DF1CF]
	(No symbol) [0x00007FF6E24171F2]
	(No symbol) [0x00007FF6E2411B89]
	(No symbol) [0x00007FF6E2410C39]
	(No symbol) [0x00007FF6E2365595]
	GetHandleVerifier [0x00007FF6E28E1AED+3458237]
	GetHandleVerifier [0x00007FF6E28F829C+3550316]
	GetHandleVerifier [0x00007FF6E28EDB9D+3507565]
	GetHandleVerifier [0x00007FF6E2662C6A+841274]
	(No symbol) [0x00007FF6E25109EF]
	(No symbol) [0x00007FF6E23641AE]
	GetHandleVerifier [0x00007FF6E29666A8+4001912]
	BaseThreadInitThunk [0x00007FFF7FF0259D+29]
	RtlUserThreadStart [0x00007FFF80FEAF38+40]

❌ Error for https://www.hoteltonight.com/: Message: inva

Service process refused to terminate gracefully with SIGTERM, escalating to SIGKILL.
Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Roaming\Python\Python313\site-packages\selenium\webdriver\common\service.py", line 183, in _terminate_process
    self.process.wait(60)
    ~~~~~~~~~~~~~~~~~^^^^
  File "C:\Program Files\Python313\Lib\subprocess.py", line 1274, in wait
    return self._wait(timeout=timeout)
           ~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python313\Lib\subprocess.py", line 1603, in _wait
    raise TimeoutExpired(self.args, timeout)
subprocess.TimeoutExpired: Command '['C:\\Users\\Lenovo\\Downloads\\chromedriver-win64\\chromedriver.exe', '--port=51146']' timed out after 60 seconds


❌ Error: HTTPConnectionPool(host='localhost', port=52470): Read timed out. (read timeout=120)
❌ Error: HTTPConnectionPool(host='localhost', port=52653): Read timed out. (read timeout=120)
❌ Error extracting colors: HTTPConnectionPool(host='localhost', port=52639): Read timed out. (read timeout=120)
❌ Error for https://livestream.com: HTTPConnectionPool(host='localhost', port=52650): Read timed out. (read timeout=120)
❌ Error: HTTPConnectionPool(host='localhost', port=52644): Read timed out. (read timeout=120)
❌ Error for https://www.vplayed.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


In [None]:
!pip install numpy pandas scikit-learn tensorflow beautifulsoup4 requests

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load and preprocess data
df = pd.read_csv("website_analysis_results.csv")
df = df.drop(columns=['Extracted Colors'])
df.fillna(0, inplace=True)

# Define features and target
features = ['SEO Score (%)', 'Portfolio Score (%)', 'UI Score (10)', 'UX Score (10)', 'Mobile Friendliness Score (10)', 'Page Load Speed Score (10)', 'Broken Links Score (10)']
target = 'Overall Score (100%)'

X = df[features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
import joblib

# Assuming 'model' is your trained model
joblib.dump(model, '../overall_score_model.pkl')

In [None]:
model = joblib.load('../overall_score_model.pkl')

In [None]:
import joblib
import pandas as pd

# Example new data (replace with actual data)
new_data = {
    'SEO Score (%)': 85,
    'Portfolio Score (%)': 75,
    'UI Score (10)': 8,
    'UX Score (10)': 7,
    'Mobile Friendliness Score (10)': 9,
    'Page Load Speed Score (10)': 8,
    'Broken Links Score (10)': 6
}

# Convert to DataFrame
new_df = pd.DataFrame([new_data])

# Load the trained model
model = joblib.load('../overall_score_model.pkl')

# Make prediction
predicted_score = model.predict(new_df)[0]
print(f"Predicted Overall Score: {predicted_score:.2f}")
