In [None]:
from google.colab import drive, files

#A popular library for making HTTP requests. Here, it is used to fetch web pages and CSS files.
import requests

#Part of the bs4 package; it parses HTML content, making it easier to search and extract data from web pages.
from bs4 import BeautifulSoup

#Used for data manipulation. In this script, it reads a CSV file containing website URLs.
import pandas as pd

#Provides a progress bar (designed for notebook environments) to visually track the progress of loops.
from tqdm.notebook import tqdm
from urllib.parse import urljoin
import os

# Attach Google Drive to the Colab runtime; the website list is read from there.
drive.mount("/content/drive")

# Location of the ranked website list on the mounted Drive.
csv_file_path = "/content/drive/My Drive/tranco.csv"

# Function to find the CSS files linked from a website
def find_css_files(url):
    """Return the absolute URLs of the stylesheets linked from a web page.

    The page is fetched with a 10 second timeout and every
    ``<link rel="stylesheet">`` element with a non-empty ``href`` is collected.

    Args:
        url: Address of the page to inspect.

    Returns:
        A list of absolute stylesheet URLs in document order, without
        duplicates. The list is empty when the request fails, the status code
        is not 200, or the page links no stylesheet.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        # Resolve relative hrefs against the URL that was actually served:
        # after a redirect (http -> https, bare domain -> www, ...) it can
        # differ from the URL that was requested.
        base_url = response.url or url

        css_urls = []
        seen = set()
        for link in soup.find_all("link", rel="stylesheet"):
            href = (link.get("href") or "").strip()
            if not href:
                # An empty href would resolve to the page itself, not to a stylesheet.
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url not in seen:
                seen.add(absolute_url)
                css_urls.append(absolute_url)
        return css_urls
    except Exception as e:
        # Best effort: one unreachable or malformed site must not stop a whole batch.
        print(f"Error fetching {url}: {e}")
    return []

# Main function to scrape and download CSS files individually
def _css_filename(css_url, website):
    """Build a filesystem-safe file name for a stylesheet, unique per website."""
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    # Only the URL path names the file; query strings and fragments are dropped
    # so "main.css?v=3" is stored as "main.css" instead of "main.css?v=3.css".
    name = os.path.basename(urlsplit(css_url).path) or "style"
    if not name.endswith(".css"):
        name += ".css"

    # Prefix with the website so same-named files from different sites
    # (e.g. "style.css") do not overwrite each other.
    combined = f"{website}_{name}"
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in combined)


def scrape_and_download_css_files(file_path, output_dir, start_index=0, batch_size=1000, css_threshold=50 * 1024):
    """Scrape one batch of websites and keep the first large stylesheet of each.

    For every website in the batch, the stylesheets reported by
    ``find_css_files`` are fetched in order. The first one that returns
    status 200 with a body larger than ``css_threshold`` is written to
    ``output_dir`` and handed to ``files.download``. Websites that yield no
    such stylesheet are listed once each in
    ``failed_websites_<start>_<end>.csv`` inside ``output_dir``.

    Args:
        file_path: Header-less CSV whose two columns are read as rank and website.
        output_dir: Directory that receives the CSS files and the failure
            report; created if missing.
        start_index: Position in the CSV of the first website to process.
        batch_size: Maximum number of websites to process.
        css_threshold: Size in bytes a stylesheet must exceed to be kept.

    Returns:
        None. Results are written to disk and a summary is printed.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load websites from the CSV file
    websites_df = pd.read_csv(file_path, header=None, names=["rank", "website"])
    websites = websites_df["website"].tolist()

    # Get the batch of websites
    batch_websites = websites[start_index:start_index + batch_size]
    failed_websites = []
    success_count = 0

    for website in tqdm(batch_websites, desc="Scraping websites"):
        site = str(website)
        # Add the HTTP scheme only when the entry does not already carry one
        url = site if "://" in site else f"http://{site}"

        saved = False
        for css_url in find_css_files(url):
            try:
                # Fetch CSS file content
                css_response = requests.get(css_url, timeout=10)
                if css_response.status_code != 200 or len(css_response.content) <= css_threshold:
                    continue

                # Save CSS file
                css_path = os.path.join(output_dir, _css_filename(css_url, site))
                with open(css_path, "wb") as f:
                    f.write(css_response.content)

                # Download the file to the local device immediately
                files.download(css_path)
                print(f"Downloaded CSS file from {css_url} to local device (size: {len(css_response.content)} bytes).")
                success_count += 1
                saved = True
                break  # Exit after the first successful CSS file
            except Exception as e:
                # Best effort: try the next stylesheet of the same website.
                print(f"Error downloading CSS from {css_url}: {e}")

        # Record the website exactly once, and only if nothing was saved for it
        # (no stylesheet found, all too small, or every download failed).
        if not saved:
            failed_websites.append(website)

    print(f"Completed scraping websites from column {start_index} to column {start_index + len(batch_websites)}.")
    print(f"Successful scrapes: {success_count}")
    print(f"Failed websites: {len(failed_websites)}")

    # Save failed websites for future review
    failed_file = os.path.join(output_dir, f"failed_websites_{start_index}_{start_index + len(batch_websites)}.csv")
    pd.DataFrame(failed_websites, columns=["website"]).to_csv(failed_file, index=False)
    print(f"Failed websites saved to {failed_file}")

# Batch configuration
output_directory = "scraped_css_files"  # folder that receives the CSS files and the failure report
batch_size = 1000  # how many websites this run processes
start_index = 3000  # position of the first website in this batch; raise it for the next batch

# Kick off the scrape for the batch configured above
scrape_and_download_css_files(
    file_path=csv_file_path,
    output_dir=output_directory,
    start_index=start_index,
    batch_size=batch_size,
)


Mounted at /content/drive


Scraping websites:   0%|          | 0/1000 [00:00<?, ?it/s]

Error fetching http://lgsmartad.com: HTTPConnectionPool(host='lgsmartad.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7e5edf1c5010>, 'Connection to lgsmartad.com timed out. (connect timeout=10)'))


KeyboardInterrupt: 

In [None]:
from google.colab import files
import shutil
import os

# Directory where CSS files are saved
output_directory = "scraped_css_files"  # Ensure this matches your scraper's output directory

# Look for actual stylesheets rather than "any file": a directory holding only
# other files (for example a failed_websites_*.csv report written next to the
# CSS files) must not be mistaken for a successful scrape.
if os.path.isdir(output_directory):
    css_file_names = [name for name in os.listdir(output_directory) if name.endswith(".css")]
else:
    css_file_names = []

if not css_file_names:
    print(f"No CSS files found in '{output_directory}'. Ensure scraping was successful.")
else:
    # Compress all files in the directory into a single ZIP file
    zip_file = "scraped_css_files.zip"
    # make_archive appends ".zip" itself, so pass the name without the suffix;
    # deriving it from zip_file keeps the archive and the download in sync.
    shutil.make_archive(os.path.splitext(zip_file)[0], "zip", output_directory)

    # Download the ZIP file
    print(f"Downloading all files as '{zip_file}'...")
    files.download(zip_file)