In [None]:
!pip install arxiv requests psutil memory_profiler
!apt-get install -y file

In [None]:
%%writefile arxiv_crawler.py
import arxiv
import os
import re
import json
import time
import tarfile
import shutil
import subprocess
import gzip

SAVE_DIR = "./23127238"

def detect_and_fix_filetype(tar_path):
    """
    Classify a downloaded source file by inspecting it with the `file` utility.

    Nothing on disk is modified; the input path is returned unchanged together
    with the detected type.

    Args:
        tar_path: Path of the downloaded file to inspect

    Returns:
        tuple: (tar_path, filetype, original_name) where filetype is one of
            "pdf", "gz", "tar.gz" or "unknown"; original_name is the base name
            recorded in the gzip header for "gz" files and None otherwise
    """
    try:
        result = subprocess.run(["file", tar_path], capture_output=True, text=True, errors='ignore')
        output = result.stdout.strip()
    except FileNotFoundError:
        print("X 'file' command not found. Install 'file' utility.")
        return tar_path, "unknown", None
    except Exception as e:
        print(f"X Error running 'file': {e}")
        return tar_path, "unknown", None

    if "PDF document" in output:
        print(f"  -> Detected PDF: {os.path.basename(tar_path)}")
        return tar_path, "pdf", None

    if "gzip compressed data" in output:
        match = re.search(r', was "([^"]+)"', output)
        if match:
            orig_name = os.path.basename(match.group(1))
            # A gzip stream whose recorded name is "*.tar" wraps a tarball, not
            # a single source file; treating it as "gz" would only produce a
            # .tar file that the later .tex/.bib cleanup throws away.
            if not orig_name.lower().endswith('.tar'):
                return tar_path, "gz", orig_name
        return tar_path, "tar.gz", None

    if "tar archive" in output:
        return tar_path, "tar.gz", None

    print(f"  Unknown format: {output}")
    return tar_path, "unknown", None


def _safe_tar_members(tar, extract_path):
    """Return the members of *tar* that are plain files/directories located inside *extract_path*."""
    root = os.path.realpath(extract_path)
    safe = []
    for member in tar.getmembers():
        # Links and device nodes are not needed to harvest .tex/.bib files and
        # can be used to write outside the extraction directory.
        if not (member.isfile() or member.isdir()):
            continue
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            print(f"  Skipping unsafe archive member: {member.name}")
            continue
        safe.append(member)
    return safe


def extract_and_clean(tar_path, dest_folder, base_name):
    """
    Extract a downloaded source file and keep only its .tex and .bib files.

    The archive comes from the network, so tar members are validated before
    extraction: anything that is not a regular file or directory, or that
    would land outside the target folder, is skipped.

    Args:
        tar_path: Path of the downloaded file
        dest_folder: Folder in which the per-version folder is created
        base_name: Name of the per-version folder (also used to name the
            output of a gzip stream that carries no original file name)

    Returns:
        tuple: (file_name, success, deleted_count, status) where status is
            "pdf", "unknown", "extract_fail" or "ok"
    """
    fixed_path, filetype, orig_name = detect_and_fix_filetype(tar_path)
    extract_path = os.path.join(dest_folder, base_name)
    os.makedirs(extract_path, exist_ok=True)
    deleted = 0

    if filetype == "pdf":
        return (os.path.basename(tar_path), True, 0, "pdf")
    if filetype == "unknown":
        return (os.path.basename(tar_path), False, 0, "unknown")

    try:
        if filetype == "tar.gz":
            with tarfile.open(fixed_path, 'r:*') as tar:
                # Use the stdlib "data" extraction filter as a second line of
                # defence on interpreters that provide it.
                extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
                tar.extractall(
                    path=extract_path,
                    members=_safe_tar_members(tar, extract_path),
                    **extra,
                )
        elif filetype == "gz":
            out_name = orig_name or f"{base_name}.file"
            out_path = os.path.join(extract_path, out_name)
            with gzip.open(fixed_path, 'rb') as fin, open(out_path, 'wb') as fout:
                shutil.copyfileobj(fin, fout)
    except Exception as e:
        print(f"X Extract error: {e}")
        shutil.rmtree(extract_path, ignore_errors=True)
        return (os.path.basename(tar_path), False, 0, "extract_fail")

    # Clean: keep only .tex and .bib
    for root, _, files in os.walk(extract_path):
        for f in files:
            if f.lower().endswith(('.tex', '.bib')):
                continue
            try:
                os.remove(os.path.join(root, f))
                deleted += 1
            except OSError as e:
                # Best effort: a file that cannot be removed must not fail the paper.
                print(f"  Warning: could not remove {f}: {e}")
    return (os.path.basename(tar_path), True, deleted, "ok")


def crawl_single_paper(arxiv_id, save_dir=SAVE_DIR):
    """
    Download and process a single arXiv paper with all its versions.

    Writes <save_dir>/<yymm>-<nnnnn>/metadata.json and extracts the sources of
    every version into <save_dir>/<yymm>-<nnnnn>/tex/<yymm>-<nnnnn>vN.

    Args:
        arxiv_id: arXiv ID in format yymm.nnnnn (e.g., "2305.04793")
        save_dir: Directory to save the paper data

    Returns:
        bool: True if at least one version was processed successfully,
            False otherwise
    """
    # Validate and split the ID before touching the network or the disk.
    # Exactly one dot is required; anything else cannot be split into
    # prefix/suffix.
    prefix, dot, suffix = arxiv_id.partition('.')
    if not dot or '.' in suffix:
        print(f"X Invalid arxiv_id: {arxiv_id}")
        return False

    client = arxiv.Client()
    versions_processed = 0

    paper_folder = os.path.join(save_dir, f"{prefix}-{suffix}")
    tex_folder = os.path.join(paper_folder, "tex")
    os.makedirs(tex_folder, exist_ok=True)

    # Look the paper up without a version suffix; the trailing "vN" of the
    # returned entry_id is taken as the number of versions.
    try:
        base_paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
        match = re.search(r'v(\d+)$', base_paper.entry_id)
        latest_version = int(match.group(1)) if match else 1
        print(f"[{arxiv_id}] Found {latest_version} version(s)")
    except StopIteration:
        print(f"X [{arxiv_id}] Paper not found")
        return False
    except Exception as e:
        print(f"X [{arxiv_id}] Error finding latest version: {e}")
        return False

    # Fetch v2..vN once: their "updated" dates go into the metadata and the
    # result objects are reused by the download loop below, so each version
    # is not queried twice.
    version_results = {}
    revised_dates = []
    for v in range(2, latest_version + 1):
        vid = f"{arxiv_id}v{v}"
        try:
            paper_v = next(client.results(arxiv.Search(id_list=[vid])))
            revised_dates.append(paper_v.updated.strftime("%Y-%m-%d") if paper_v.updated else None)
            version_results[vid] = paper_v
        except Exception:
            # Also covers StopIteration (version missing); keep the slot so
            # revised_dates stays aligned with version numbers.
            revised_dates.append(None)

    # Save metadata.json (fields come from the unversioned lookup above)
    metadata = {
        "arxiv_id": arxiv_id.replace('.', '-'),
        "paper_title": base_paper.title,
        "authors": [a.name for a in base_paper.authors],
        "submission_date": base_paper.published.strftime("%Y-%m-%d") if base_paper.published else None,
        "revised_dates": revised_dates,
        "publication_venue": base_paper.journal_ref if base_paper.journal_ref else None,
        "latest_version": latest_version,
        "categories": base_paper.categories,
        "abstract": base_paper.summary.replace("\n", " ").strip(),
        "pdf_url": base_paper.pdf_url,
    }

    metadata_path = os.path.join(paper_folder, "metadata.json")
    try:
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4, ensure_ascii=False)
        print(f"  [{arxiv_id}] Saved metadata.json")
    except Exception as e:
        print(f"X [{arxiv_id}] Failed to save metadata: {e}")
        return False

    # Download all versions into tex folder
    for v in range(1, latest_version + 1):
        version_id = f"{arxiv_id}v{v}"
        version_folder_name = f"{prefix}-{suffix}v{v}"
        temp_tar = os.path.join(paper_folder, f"{version_id}.tar.gz")

        try:
            paper_v = version_results.get(version_id)
            if paper_v is None:
                paper_v = next(client.results(arxiv.Search(id_list=[version_id])))

            print(f"  [{arxiv_id}] Downloading {version_id}...")
            paper_v.download_source(dirpath=paper_folder, filename=f"{version_id}.tar.gz")

            # Extract & Clean into tex folder
            _, extracted, deleted_count, _ = extract_and_clean(temp_tar, tex_folder, version_folder_name)

            if extracted:
                versions_processed += 1
                print(f"  [{arxiv_id}] Extracted & cleaned: {version_id} ({deleted_count} files removed)")
            else:
                print(f"X [{arxiv_id}] Failed to extract {version_id}")

            time.sleep(0.5)  # Be nice to arXiv

        except StopIteration:
            print(f"X [{arxiv_id}] Version {version_id} not found")
        except Exception as e:
            print(f"X [{arxiv_id}] Download error {version_id}: {e}")
        finally:
            # Always drop the downloaded archive, including after a failed
            # download or extraction; it may not exist at all.
            try:
                os.remove(temp_tar)
            except OSError:
                pass

    # Final check
    success = (versions_processed > 0)
    if success:
        print(f"✓ [{arxiv_id}] COMPLETED ({versions_processed}/{latest_version} versions)")
    else:
        print(f"X [{arxiv_id}] FAILED - no versions downloaded")

    return success

In [None]:
%%writefile reference_extractor.py
import requests
import json
import os
import time
import re


def format_arxiv_id_for_key(arxiv_id):
    """
    Turn an arXiv ID into the folder/key form used on disk (yymm-nnnnn).

    A trailing version marker ("v" followed by digits) is dropped and every
    dot becomes a dash.

    Examples:
        "2305.04793" -> "2305-04793"
        "2305.04793v1" -> "2305-04793"
    """
    unversioned = re.sub(r'v\d+$', '', arxiv_id)
    return '-'.join(unversioned.split('.'))


def get_paper_references(arxiv_id, delay=3, max_retries=None):
    """
    Fetch references for a paper from Semantic Scholar API.

    By default retries indefinitely until success or 404. The API key is read
    from the SEMANTIC_SCHOLAR_API_KEY environment variable; when it is unset
    the request is sent without a key.

    Args:
        arxiv_id: arXiv ID (format: YYMM.NNNNN or YYMM.NNNNNvN)
        delay: delay between retries in seconds
        max_retries: maximum number of retries after the first attempt, or
            None (default) to keep retrying without limit

    Returns:
        tuple: (list of references, total_found_count), or (None, 0) on a 404
            or when max_retries is exhausted
    """
    # Clean arxiv_id (remove version suffix if present)
    clean_id = re.sub(r'v\d+$', '', arxiv_id)
    url = f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{clean_id}"
    params = {
        "fields": "references,references.title,references.authors,references.year,references.venue,references.externalIds,references.publicationDate"
    }

    # The secret must come from the environment; os.getenv takes the variable
    # NAME, so the key itself must never appear in source.
    api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
    headers = {"x-api-key": api_key} if api_key else {}

    attempts = 0
    while True:
        attempts += 1
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                # "references" may be present but null; normalise to a list so
                # callers do not mistake it for the 404 sentinel (None).
                references = data.get("references") or []
                return references, len(references)
            elif response.status_code == 429:
                print(f"  [{arxiv_id}] Rate limit hit. Waiting {delay}s...")
            elif response.status_code == 404:
                print(f"  [{arxiv_id}] Paper not found in Semantic Scholar (404)")
                return None, 0  # Return None to indicate 404 error
            else:
                print(f"  [{arxiv_id}] API returned status {response.status_code}, retrying in {delay}s...")
        except requests.exceptions.RequestException as e:
            print(f"  [{arxiv_id}] Request error: {e}, retrying in {delay}s...")

        if max_retries is not None and attempts > max_retries:
            print(f"  [{arxiv_id}] Giving up after {attempts} attempt(s)")
            return None, 0
        time.sleep(delay)


def convert_to_references_dict(references):
    """
    Convert Semantic Scholar references to the required format:
    a dictionary keyed by arXiv ID in "yymm-nnnnn" format. References that
    have no arXiv ID are dropped.

    Fields that the API returns as null (externalIds, authors, individual
    author entries, title, publicationDate) are treated as missing.

    Args:
        references: List of references from Semantic Scholar API (None is
            treated as an empty list)

    Returns:
        dict: Dictionary with paper IDs as keys and metadata as values
    """
    result = {}

    for ref in references or []:
        # Skip if reference is None or empty
        if not ref:
            continue

        # Only keep references that have an arXiv ID
        external_ids = ref.get("externalIds") or {}
        arxiv_id = external_ids.get("ArXiv", "")
        if not arxiv_id:
            continue

        key = format_arxiv_id_for_key(arxiv_id)

        # "authors" and its entries can be null in API responses
        authors = [
            author["name"]
            for author in (ref.get("authors") or [])
            if author and author.get("name")
        ]

        # Prefer the full publication date; fall back to Jan 1st of the year
        publication_date = ref.get("publicationDate") or ""
        year = ref.get("year")
        if not publication_date and year:
            publication_date = f"{year}-01-01"

        result[key] = {
            "paper_title": ref.get("title") or "",
            "authors": authors,
            "submission_date": publication_date,
            "semantic_scholar_id": ref.get("paperId"),
            "year": year,
        }

    return result


def extract_references_for_paper(arxiv_id, save_dir="./23127238"):
    """
    Fetch a paper's references and write them to references.json in its folder.

    An empty JSON object is written when the lookup returns nothing (404 or
    an empty reference list); in that case the function reports failure.

    Args:
        arxiv_id: arXiv ID in format yymm.nnnnn (e.g., "2305.04793")
        save_dir: Base directory containing paper folders

    Returns:
        bool: True if references were found and saved, False otherwise
    """
    folder = os.path.join(save_dir, format_arxiv_id_for_key(arxiv_id))

    # Nothing to do when the paper folder does not exist
    if not os.path.exists(folder):
        print(f"X [{arxiv_id}] Paper folder not found: {folder}")
        return False

    print(f"[{arxiv_id}] Fetching references...")

    def write_json(payload, target):
        # Single place that serialises the output file
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=4, ensure_ascii=False)

    try:
        target = os.path.join(folder, "references.json")
        fetched, found_count = get_paper_references(arxiv_id)

        # None signals a 404 from the API
        if fetched is None:
            print(f"X [{arxiv_id}] Failed to fetch references from Semantic Scholar (404)")
            write_json({}, target)
            return False

        # Paper is known but lists no references
        if not fetched or found_count == 0:
            print(f"X [{arxiv_id}] No references found (total_found: 0)")
            write_json({}, target)
            return False

        mapping = convert_to_references_dict(fetched)
        saved_count = len(mapping)

        # Only the mapping goes to disk; the counts are console-only
        write_json(mapping, target)
        print(f"✓ [{arxiv_id}] Found {found_count} references, saved {saved_count} (with arXiv IDs) to references.json")
        return True

    except Exception as e:
        print(f"X [{arxiv_id}] Error extracting references: {e}")
        return False

In [None]:
%%writefile main.py
import time
import os
import shutil
import psutil
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from arxiv_crawler import crawl_single_paper
from reference_extractor import extract_references_for_paper


# Run-wide counters; guarded by stats_lock.
stats_lock = Lock()
stats = dict.fromkeys(
    (
        "total_processed",          # papers that went through process_paper
        "both_success",             # crawl and reference extraction both succeeded
        "only_crawler_success",     # crawl succeeded, reference extraction did not
        "only_references_success",  # references succeeded without a crawl
        "crawler_failed",           # crawl failed
        "references_failed",        # reference extraction failed after a successful crawl
        "both_failed",              # neither step succeeded
    ),
    0,
)

# State shared with the background resource monitor.
monitor_running = True      # the monitor loop runs while this is true
ram_samples_bytes = []      # RAM readings relative to the baseline
peak_disk_usage_bytes = 0   # largest disk reading relative to the baseline

def _monitor_resources(baseline_ram, baseline_disk, sleep_interval=2):
    """
    Background loop that records RAM samples and the peak disk usage.

    Each pass appends (psutil.virtual_memory().used - baseline_ram) to
    ram_samples_bytes and raises peak_disk_usage_bytes to the usage of '/'
    above baseline_disk when that is larger. The loop ends once the
    module-level monitor_running flag becomes false.

    Args:
        baseline_ram: RAM reading (bytes) subtracted from every sample
        baseline_disk: Disk reading (bytes) subtracted from every sample
        sleep_interval: Seconds to sleep between samples
    """
    global ram_samples_bytes, peak_disk_usage_bytes, monitor_running

    # Start from a clean slate on every run
    ram_samples_bytes = []
    peak_disk_usage_bytes = 0

    print(f"[Monitor] Start (Baseline RAM: {baseline_ram / (1024**3):.2f} GB, Baseline Disk: {baseline_disk / (1024**3):.2f} GB)")

    while monitor_running:
        try:
            # 1. Measure RAM
            ram_samples_bytes.append(psutil.virtual_memory().used - baseline_ram)

            # 2. Measure disk and keep the maximum seen so far
            disk_delta = shutil.disk_usage('/').used - baseline_disk
            peak_disk_usage_bytes = max(peak_disk_usage_bytes, disk_delta)
        except Exception as e:
            print(f"[Monitor Error] {e}")

        time.sleep(sleep_interval)

    print("[Monitor] Stop.")

def _print_custom_resource_report(disk_start, disk_end):
    """
    Print the resource figures collected by the monitor, relative to baseline.

    Reports the mean of ram_samples_bytes (0 when no sample was taken),
    peak_disk_usage_bytes, and the final disk delta (disk_end - disk_start),
    all converted to MB.

    Args:
        disk_start: Disk usage in bytes measured before the run
        disk_end: Disk usage in bytes measured after the run
    """
    samples = ram_samples_bytes

    # 1. Average RAM
    mean_ram = sum(samples) / len(samples) if samples else 0

    # 2. Final disk delta
    disk_delta = disk_end - disk_start

    rule = "="*80
    print("\n" + rule)
    print("ADDITIONAL RESOURCE REPORT (MINUS BASELINE)")

    print(f"  Average RAM : {mean_ram / (1024**2):.2f} MB")
    print(f"  Peak Disk   : {peak_disk_usage_bytes / (1024**2):.2f} MB")
    print(f"  Final Disk  : {disk_delta / (1024**2):.2f} MB")

    print(rule)

def process_paper(arxiv_id, save_dir="./23127238"):
    """
    Run the two-step pipeline for one paper and record the outcome in stats.

    The paper is crawled first; references are extracted only when the crawl
    succeeded.

    Args:
        arxiv_id: arXiv ID in format yymm.nnnnn
        save_dir: Directory to save data

    Returns:
        tuple: (arxiv_id, crawler_success, references_success)
    """
    rule = '='*80
    print(f"\n{rule}")
    print(f"Processing paper: {arxiv_id}")
    print(f"{rule}")

    # Step 1: crawl
    crawler_success = crawl_single_paper(arxiv_id, save_dir)

    # Step 2: references, gated on step 1
    if crawler_success:
        references_success = extract_references_for_paper(arxiv_id, save_dir)
    else:
        references_success = False
        print(f"X [{arxiv_id}] Skipping reference extraction (crawler failed)")

    # Record the outcome under the lock shared by all workers
    with stats_lock:
        stats["total_processed"] += 1

        if not crawler_success:
            stats["crawler_failed"] += 1
            outcome = "only_references_success" if references_success else "both_failed"
            stats[outcome] += 1
        elif references_success:
            stats["both_success"] += 1
        else:
            stats["only_crawler_success"] += 1
            stats["references_failed"] += 1

    return arxiv_id, crawler_success, references_success


def check_paper_exists(arxiv_id, save_dir="./23127238"):
    """
    Check if a paper exists by attempting to crawl it.

    When the crawl fails, the paper folder it may have created is removed.

    Args:
        arxiv_id: arXiv ID in format yymm.nnnnn
        save_dir: Directory to save data

    Returns:
        bool: True if paper exists, False otherwise
    """
    success = crawl_single_paper(arxiv_id, save_dir)
    if success:
        return success

    # The crawler only creates a folder for IDs of the form "<prefix>.<suffix>".
    # For any other shape there is nothing to clean up, and unpacking the
    # split would raise ValueError.
    parts = arxiv_id.split('.')
    if len(parts) != 2:
        return success

    prefix, suffix = parts
    paper_folder = os.path.join(save_dir, f"{prefix}-{suffix}")
    if os.path.exists(paper_folder):
        try:
            shutil.rmtree(paper_folder)
            print(f"  Cleaned up folder for non-existent paper: {arxiv_id}")
        except Exception as e:
            print(f"  Warning: Could not clean up folder: {e}")

    return success


def find_last_valid_id(prefix, start_id, save_dir="./23127238"):
    """
    Probe consecutive IDs of one month until three in a row fail.

    Every candidate is checked with check_paper_exists; a hit resets the
    failure streak and becomes the new "last valid" ID.

    Args:
        prefix: Month prefix (e.g., "2305")
        start_id: Starting ID to check from
        save_dir: Directory to save data

    Returns:
        int: Last valid ID found, or start_id - 1 if none was found
    """
    miss_limit = 3
    misses = 0
    candidate = start_id
    last_hit = start_id - 1
    rule = '='*80

    print(f"\n{rule}")
    print(f"Finding last valid ID for {prefix}.xxxxx starting from {start_id}")
    print(f"{rule}")

    while misses < miss_limit:
        arxiv_id = f"{prefix}.{candidate:05d}"
        print(f"\nProbing: {arxiv_id}")

        if check_paper_exists(arxiv_id, save_dir):
            misses, last_hit = 0, candidate
            print(f"✓ Found valid paper: {arxiv_id}")
        else:
            misses += 1
            print(f"X Paper not found: {arxiv_id} (failure {misses}/{miss_limit})")

        candidate += 1
        time.sleep(0.5)  # Be nice to arXiv

    print(f"\n{rule}")
    print(f"Last valid ID found: {prefix}.{last_hit:05d}")
    print(f"{rule}\n")

    return last_hit


def generate_paper_ids(start_month, start_id, end_month, end_id, save_dir="./23127238"):
    """
    Generate list of arXiv IDs based on date range.

    Within a single month the IDs run from start_id to end_id. Across months,
    the start month runs from start_id to its last valid ID, every month in
    between runs from 1 to its last valid ID, and the end month runs from 1
    to end_id. The last valid ID of a month is found with find_last_valid_id.

    Args:
        start_month: Start month in format "YYYY-MM"
        start_id: Starting ID number
        end_month: End month in format "YYYY-MM"
        end_id: Ending ID number
        save_dir: Directory to save data

    Returns:
        list: List of arXiv IDs in format "yymm.nnnnn"

    Raises:
        ValueError: If a month is not "YYYY-MM", is outside 01-12, or
            end_month is earlier than start_month
    """
    start_year, start_mon = (int(part) for part in start_month.split('-'))
    end_year, end_mon = (int(part) for part in end_month.split('-'))

    if not (1 <= start_mon <= 12 and 1 <= end_mon <= 12):
        raise ValueError(f"Month must be between 01 and 12: {start_month}, {end_month}")
    if (end_year, end_mon) < (start_year, start_mon):
        raise ValueError(f"end_month {end_month} is earlier than start_month {start_month}")

    def month_prefix(year, month):
        # arXiv IDs use the two-digit year followed by the two-digit month
        return f"{year % 100:02d}{month:02d}"

    start_prefix = month_prefix(start_year, start_mon)
    end_prefix = month_prefix(end_year, end_mon)

    if (start_year, start_mon) == (end_year, end_mon):
        # Same month - simple range
        print(f"Single month mode: {start_prefix}.{start_id:05d} → {start_prefix}.{end_id:05d}")
        return [f"{start_prefix}.{i:05d}" for i in range(start_id, end_id + 1)]

    print(f"Multi-month mode: {start_prefix}.{start_id:05d} → {end_prefix}.{end_id:05d}")

    paper_ids = []

    # Every month before the end month is open-ended, so its last valid ID
    # has to be discovered by probing. Only the start month begins at start_id.
    year, month, first_id = start_year, start_mon, start_id
    while (year, month) < (end_year, end_mon):
        prefix = month_prefix(year, month)
        last_valid = find_last_valid_id(prefix, first_id, save_dir)
        paper_ids.extend(f"{prefix}.{i:05d}" for i in range(first_id, last_valid + 1))

        first_id = 1
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)

    # Add papers from end month (from 1 to end_id)
    print(f"\nAdding papers from end month: {end_prefix}.00001 → {end_prefix}.{end_id:05d}")
    paper_ids.extend(f"{end_prefix}.{i:05d}" for i in range(1, end_id + 1))

    return paper_ids


def print_progress_report():
    """Print a snapshot of the shared counters while holding stats_lock."""
    with stats_lock:
        report = [
            f"\n{'='*80}",
            "CURRENT PROGRESS:",
            f"  Total processed                       : {stats['total_processed']}",
            f"  Both success                          : {stats['both_success']}",
            f"  Only crawler success                  : {stats['only_crawler_success']}",
            f"  Only references success               : {stats['only_references_success']}",
            f"  Crawler failed                        : {stats['crawler_failed']}",
            f"  References failed (404, Not Found)    : {stats['references_failed']}",
            f"  Both failed                           : {stats['both_failed']}",
            f"{'='*80}\n",
        ]
        for line in report:
            print(line)


def print_final_report():
    """Print the final counters followed by two percentage rates.

    Both rates are computed against the total number of processed papers and
    are reported as 0 when nothing was processed.
    """
    total = stats['total_processed']

    def percent_of_total(count):
        # Avoid dividing by zero when no paper was processed
        return (count / total * 100) if total > 0 else 0

    both_success_rate = percent_of_total(stats['both_success'])
    phase2_fail_rate = percent_of_total(stats['references_failed'])

    report = [
        f"\n{'='*80}",
        "FINAL REPORT:",
        f"\n{'='*80}",
        "CURRENT PROGRESS:",
        f"  Total processed                       : {stats['total_processed']}",
        f"  Both success                          : {stats['both_success']}",
        f"  Only crawler success                  : {stats['only_crawler_success']}",
        f"  Only references success               : {stats['only_references_success']}",
        f"  Crawler failed                        : {stats['crawler_failed']}",
        f"  References failed (404, Not Found)    : {stats['references_failed']}",
        f"  Both failed                           : {stats['both_failed']}",
        f"{'='*80}\n",
        "SUCCESS RATES:",
        f"{'='*80}",
        f"  Both phases success rate : {both_success_rate:.2f}%",
        f"  Phase 2 (references) fail: {phase2_fail_rate:.2f}%",
        f"{'='*80}",
    ]
    for line in report:
        print(line)


def run_parallel_processing(start_month, start_id, end_month, end_id,
                            max_parallels=5, save_dir="./23127238"):
    """
    Build the list of paper IDs and process them on a thread pool.

    Counters are reset first; per-paper status lines are printed as futures
    complete, a progress report every 10 completions, and a final report at
    the end.

    Args:
        start_month: Start month in format "YYYY-MM"
        start_id: Starting ID number
        end_month: End month in format "YYYY-MM"
        end_id: Ending ID number
        max_parallels: Number of parallel threads (default: 5)
        save_dir: Directory to save data
    """
    # Zero every counter before the run
    with stats_lock:
        stats.update(dict.fromkeys(stats, 0))

    paper_ids = generate_paper_ids(start_month, start_id, end_month, end_id, save_dir)
    total_papers = len(paper_ids)
    rule = '='*80

    print(f"\n{rule}")
    print("STARTING PARALLEL PROCESSING")
    print(f"{rule}")
    print(f"Range: {start_month} ID {start_id} → {end_month} ID {end_id}")
    print(f"Total papers to process: {total_papers}")
    print(f"Parallel threads: {max_parallels}")
    print(f"Output directory: {save_dir}")
    print(f"{rule}\n")

    started_at = time.time()

    with ThreadPoolExecutor(max_workers=max_parallels) as executor:
        # Map each future back to the ID it was submitted for
        pending = {}
        for queued_id in paper_ids:
            pending[executor.submit(process_paper, queued_id, save_dir)] = queued_id

        for completed, future in enumerate(as_completed(pending), start=1):
            arxiv_id = pending[future]

            try:
                paper_id, crawler_ok, refs_ok = future.result()

                if not crawler_ok:
                    status = "XX"
                elif refs_ok:
                    status = "✓✓"
                else:
                    status = "✓X"
                print(f"\n[{completed}/{total_papers}] {status} {paper_id}")

                # Periodic snapshot every 10 papers
                if completed % 10 == 0:
                    print_progress_report()

            except Exception as e:
                print(f"\n[{completed}/{total_papers}] !! {arxiv_id} - Error: {e}")

    elapsed_time = time.time() - started_at

    print(f"\n{rule}")
    print("PROCESSING COMPLETE!")
    print(f"{rule}")
    print(f"Time elapsed: {elapsed_time:.2f} seconds")
    if total_papers > 0:
        print(f"Average time per paper: {elapsed_time/total_papers:.2f} seconds")
    else:
        print("")
    print_final_report()


def main():
    """
    Run the whole pipeline with resource monitoring.

    Measures baseline disk/RAM usage, starts the background monitor thread,
    runs the parallel processing with the configuration below, stops the
    monitor (also when processing raises) and prints the resource report.
    """
    global monitor_running

    # === CONFIGS ===
    START_MONTH = "2023-05"
    START_ID = 2001
    END_MONTH = "2023-05"
    END_ID = 4500
    MAX_PARALLELS = 3
    SAVE_DIR = "./23127238"

    # STEP 1: MEASURE INITIAL RESOURCES (BASELINE)
    print("="*50)
    print("Baseline resource measurement...")
    disk_usage_start = shutil.disk_usage('/').used
    ram_usage_start = psutil.virtual_memory().used
    print(f"  Disk ban đầu: {disk_usage_start / (1024**3):.2f} GB")
    print(f"  RAM ban đầu : {ram_usage_start / (1024**3):.2f} GB")

    # STEP 2: START MONITORING THREAD
    print("Start the resource monitor thread...")
    monitor_running = True

    monitor_thread = threading.Thread(
        target=_monitor_resources,
        args=(ram_usage_start, disk_usage_start, 2), # (baseline_ram, baseline_disk, 2s interval)
        daemon=True
    )
    monitor_thread.start()

    # STEP 3: RUN THE MAIN TASK
    try:
        run_parallel_processing(
            start_month=START_MONTH,
            start_id=START_ID,
            end_month=END_MONTH,
            end_id=END_ID,
            max_parallels=MAX_PARALLELS,
            save_dir=SAVE_DIR
        )

        print("\n" + "="*50)
        print("Task completed. Stopping monitoring stream...")
    finally:
        # STEP 4: STOP MONITORING
        # Done in `finally` so a failure above cannot leave the monitor
        # thread sampling forever in a long-lived interpreter (e.g. a
        # notebook kernel).
        monitor_running = False
        monitor_thread.join() # Wait for the monitor thread to shut down

    disk_usage_end = shutil.disk_usage('/').used
    print(f"  Last disk: {disk_usage_end / (1024**3):.2f} GB")

    # STEP 5: PRINT ADDITIONAL REPORT
    # (Prints average RAM, peak disk, final disk)
    _print_custom_resource_report(disk_usage_start, disk_usage_end)


In [None]:
%load_ext memory_profiler

# Import your main.py file as a module
import main
import importlib

importlib.reload(main)

print("--- START MEASURING PEAK RAM ---")
%memit main.main()
print("--- PEAK RAM MEASUREMENT END ---")

In [None]:
print("--- COLAB DRIVE OVERVIEW ---")
!df -h /

print("\n--- REQUIRED OUTPUT CAPACITY ---")
# Measure output folder size
!du -sh ./23127238