In [None]:
import pandas as pd
import requests
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# NOTE(security): an API key committed to source should be treated as leaked.
# Rotate it and provide the replacement through the PAGESPEED_API_KEY
# environment variable; the literal below is kept only as a backward-compatible
# fallback so existing runs keep working.
API_KEY = os.environ.get('PAGESPEED_API_KEY') or 'AIzaSyBFsbsTJzCB8HnIYked27vT0aKulRgO-rs'
INPUT_FILE = '../data/reduction_about_20k.csv'  # CSV of rows to measure; must contain a 'url' column
OUTPUT_DIR = '../data/batch_data/pagespeed_batches'  # directory receiving batch_<n>.csv files
LOG_FILE = os.path.join(OUTPUT_DIR, 'error_log.txt')  # one line per row that ended with an error
BATCH_SIZE = 3000  # number of result rows written per batch CSV
MAX_WORKERS = 10  # number of threads used for parallel processing

# Normalize the URL format.
def format_url(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    Surrounding whitespace is stripped first. The URL is returned unchanged
    (apart from stripping) when it already starts with ``http://`` or
    ``https://`` in any letter case; otherwise ``http://`` is prepended.

    Checking for the full ``scheme://`` prefix rather than the bare string
    ``"http"`` keeps scheme-less hosts such as ``httpbin.org`` from being
    mistaken for absolute URLs.

    Args:
        url: URL or bare host/path as a string.

    Returns:
        The URL string with a scheme.
    """
    url = url.strip()
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url

# Resolve the final URL after redirects.
def get_final_url(url):
    """Follow redirects from *url* and return the URL of the final response.

    Args:
        url: Absolute URL to request (5 second timeout).

    Returns:
        The final response URL as a string, or ``None`` if the request
        failed for any reason.
    """
    try:
        # Only the final URL is needed, so stream=True avoids downloading the
        # response body, and the context manager closes the response so the
        # connection is released.
        with requests.get(url, timeout=5, allow_redirects=True, stream=True) as response:
            return response.url
    except Exception:
        # Deliberately broad: this is a best-effort probe and any failure
        # (DNS, TLS, timeout, malformed URL, ...) simply means "unresolvable".
        return None

# Call the PageSpeed API.
_PAGESPEED_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

# Metric keys present in every result dict (the "error" key is added separately).
_PAGESPEED_FIELDS = (
    "performance_score", "FCP", "LCP", "CLS", "TTFB",
    "SpeedIndex", "TBT", "FCP_loadingExperience", "FCP_loadingExperience_category",
    "FID_loadingExperience", "FID_loadingExperience_category", "overall_category_loadingExperience",
    "FCP_origin", "FCP_origin_category", "FID_origin", "FID_origin_category",
    "overall_category_origin",
)


def _empty_pagespeed_result(error):
    """Return a result dict with every metric set to None and the given error."""
    return {**dict.fromkeys(_PAGESPEED_FIELDS), "error": error}


def _experience_metric(experience, metric, field):
    """Read ``experience["metrics"][metric][field]``, yielding None when absent."""
    return experience.get("metrics", {}).get(metric, {}).get(field)


def fetch_pagespeed_data(url):
    """Query the PageSpeed Insights v5 API (desktop strategy) for *url*.

    The target URL is passed through ``params`` so that it is percent-encoded;
    interpolating it directly into the query string would let any ``&``, ``#``
    or ``?`` inside it corrupt the API request.

    Args:
        url: The page URL to analyse.

    Returns:
        A dict containing every key in ``_PAGESPEED_FIELDS`` plus ``"error"``.
        On success ``"error"`` is ``None`` and metrics missing from the
        response are ``None``. On a non-200 response or any exception, all
        metrics are ``None`` and ``"error"`` holds ``"HTTP <status>"`` or the
        exception text respectively.
    """
    try:
        res = requests.get(
            _PAGESPEED_ENDPOINT,
            params={"url": url, "key": API_KEY, "strategy": "desktop"},
            timeout=10,
        )
        if res.status_code != 200:
            # Same key set as the other return paths so downstream rows
            # always have a consistent schema.
            return _empty_pagespeed_result(f"HTTP {res.status_code}")

        data = res.json()
        lighthouse = data.get("lighthouseResult", {})
        audits = lighthouse.get("audits", {})
        categories = lighthouse.get("categories", {})
        loading = data.get("loadingExperience", {})
        origin = data.get("originLoadingExperience", {})

        def audit_value(audit_id):
            return audits.get(audit_id, {}).get("numericValue")

        return {
            "performance_score": categories.get("performance", {}).get("score"),
            "FCP": audit_value("first-contentful-paint"),
            "LCP": audit_value("largest-contentful-paint"),
            "CLS": audit_value("cumulative-layout-shift"),
            "TTFB": audit_value("server-response-time"),
            "SpeedIndex": audit_value("speed-index"),
            "TBT": audit_value("total-blocking-time"),
            "FCP_loadingExperience": _experience_metric(loading, "FIRST_CONTENTFUL_PAINT_MS", "percentile"),
            "FCP_loadingExperience_category": _experience_metric(loading, "FIRST_CONTENTFUL_PAINT_MS", "category"),
            "FID_loadingExperience": _experience_metric(loading, "FIRST_INPUT_DELAY_MS", "percentile"),
            "FID_loadingExperience_category": _experience_metric(loading, "FIRST_INPUT_DELAY_MS", "category"),
            "overall_category_loadingExperience": loading.get("overall_category"),
            "FCP_origin": _experience_metric(origin, "FIRST_CONTENTFUL_PAINT_MS", "percentile"),
            "FCP_origin_category": _experience_metric(origin, "FIRST_CONTENTFUL_PAINT_MS", "category"),
            "FID_origin": _experience_metric(origin, "FIRST_INPUT_DELAY_MS", "percentile"),
            "FID_origin_category": _experience_metric(origin, "FIRST_INPUT_DELAY_MS", "category"),
            "overall_category_origin": origin.get("overall_category"),
            "error": None,
        }
    except Exception as e:
        # Best-effort: network failures, invalid JSON and unexpected response
        # shapes are all reported through the "error" field instead of raising.
        return _empty_pagespeed_result(str(e))

# Single unit of parallel work.
def process_row(i, row):
    """Resolve one input row's URL and attach PageSpeed results to it.

    Args:
        i: Index label of the row; passed through unchanged in the result.
        row: Row object exposing ``row['url']`` and ``to_dict()``.

    Returns:
        A tuple ``(i, original_url, final_url, record)`` where ``record`` is
        the row's own fields plus ``final_url``, ``tested_url``, the metric
        fields and ``error``. When the redirect lookup yields nothing, the
        metrics are all ``None`` and ``error`` is ``'Redirect failed'``.
    """
    source_url = format_url(row['url'])
    resolved_url = get_final_url(source_url)
    record = row.to_dict()

    if resolved_url:
        metrics = fetch_pagespeed_data(resolved_url)
        record['final_url'] = resolved_url
        record['tested_url'] = resolved_url
        record.update(metrics)
        return (i, source_url, resolved_url, record)

    # Redirect lookup failed: emit the same columns, all blank, with an error.
    blank_fields = [
        "performance_score", "FCP", "LCP", "CLS", "TTFB",
        "SpeedIndex", "TBT", "FCP_loadingExperience", "FCP_loadingExperience_category",
        "FID_loadingExperience", "FID_loadingExperience_category", "overall_category_loadingExperience",
        "FCP_origin", "FCP_origin_category", "FID_origin", "FID_origin_category",
        "overall_category_origin",
    ]
    record['final_url'] = None
    record['tested_url'] = None
    record.update(dict.fromkeys(blank_fields))
    record['error'] = 'Redirect failed'
    return (i, source_url, resolved_url, record)

# Main entry point.
def _save_batch(rows, batch_index, note=""):
    """Write *rows* to ``batch_<batch_index>.csv`` in OUTPUT_DIR and report it."""
    filename = f'batch_{batch_index}.csv'
    pd.DataFrame(rows).to_csv(os.path.join(OUTPUT_DIR, filename), index=False)
    print(f"💾 저장 완료: {filename}{note}")


def main():
    """Measure every URL in INPUT_FILE in parallel and save results in batches.

    Each row is handed to ``process_row`` on a thread pool of MAX_WORKERS
    threads. Results are collected in completion order and written to
    ``batch_<n>.csv`` every BATCH_SIZE rows, with a final partial batch at the
    end. Rows that ended with an error are listed in LOG_FILE.

    A worker that raises no longer aborts the whole run: the exception is
    recorded as an error row so previously collected results are not lost.
    """
    print("🚀 병렬 PageSpeed 측정 시작")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.read_csv(INPUT_FILE)
    total = len(df)

    batch = []
    batch_index = 1
    log_lines = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(process_row, i, row): row for i, row in df.iterrows()}
        # `done` counts completed rows; results arrive in completion order, so
        # the row index is not a meaningful progress indicator.
        for done, future in enumerate(as_completed(futures), start=1):
            # Drop our reference once handled so the finished future and its
            # row are not kept alive by this dict for the rest of the run.
            row = futures.pop(future)
            try:
                _, original_url, final_url, base_data = future.result()
            except Exception as exc:
                # e.g. a non-string 'url' cell; keep the row and record why it failed.
                original_url, final_url = row.get('url'), None
                base_data = {
                    **row.to_dict(),
                    'final_url': None,
                    'tested_url': None,
                    'error': f"Unhandled error: {exc!r}",
                }
            batch.append(base_data)

            failed = bool(base_data.get("error"))
            if failed:
                error_msg = f"[Error] {original_url} → {final_url} | {base_data['error']}"
                log_lines.append(error_msg)

            status = "❌" if failed else "✅"
            print(f"[{done}/{total}] {status} {original_url} → {final_url} | 점수: {base_data.get('performance_score')}")

            if len(batch) >= BATCH_SIZE:
                _save_batch(batch, batch_index)
                batch = []
                batch_index += 1

    if batch:
        _save_batch(batch, batch_index, " (잔여 데이터)")

    if log_lines:
        with open(LOG_FILE, 'w', encoding='utf-8') as f:
            f.write("\n".join(log_lines))
        print(f"📝 에러 로그 저장됨: {LOG_FILE}")

    print("✅ 전체 병렬 처리 완료")

# Run the measurement only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


🚀 병렬 PageSpeed 측정 시작
[13/189243] ✅ http://nesn.com/2014/07/veteran-leaders-return-to-field-for-day-1-of-patriots-training-camp-video/ → https://nesn.com/video/veteran-leaders-return-to-field-for-day-1-of-patriots-training-camp-video/ | 점수: None
[5/189243] ✅ http://celebrityphotoz.com/Jayne_Heitmeyer/ → http://celebrityphotoz.com/Jayne_Heitmeyer/ | 점수: None
[8/189243] ✅ http://artfact.com/artist/charchoune-serge-8q9mqrd5b3 → https://www.invaluable.com | 점수: None
[24/189243] ✅ http://espn.go.com/nhl/player/_/id/601/randy-mckay → http://espn.go.com/nhl/player/_/id/601/randy-mckay | 점수: None
[1/189243] ✅ http://metrotravelguide.com/hotels_nearby/kansas_city/mo/usa/0/kansas_city_international_airport_(mci)/ → https://www.hotelplanner.com | 점수: None
[20/189243] ✅ http://lrboi-nsn.gov/ → https://lrboi-nsn.gov/ | 점수: None
[16/189243] ✅ http://towleroad.com/alexandre_despatie/index.html → https://www.towleroad.com/alexandre_despatie/index | 점수: None
[9/189243] ✅ http://illinoisattorneygeneral.g