In [9]:
import pandas as pd
import requests
import time
import os

# PageSpeed Insights API key. The PAGESPEED_API_KEY environment variable takes
# precedence so the key can be supplied without editing this file.
# NOTE(review): the literal fallback is a credential committed to source; it is
# kept only so existing runs keep working — consider rotating it and removing
# the fallback.
API_KEY = os.environ.get('PAGESPEED_API_KEY') or 'AIzaSyBFsbsTJzCB8HnIYked27vT0aKulRgO-rs'
# CSV read by main(); it must provide a 'url' column.
INPUT_FILE = '../data/URL_metadata_cleaned_reduction.csv'
# CSV written by main() with one row per measured URL.
OUTPUT_FILE = '../data/pagespeed_results_desktop_full.csv'

# Normalize the URL format
def format_url(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    Surrounding whitespace is stripped first. The URL is returned unchanged
    only when it already starts with ``http://`` or ``https://`` (compared
    case-insensitively); a bare host that merely begins with the letters
    "http" (e.g. ``httpbin.org``) still gets the ``http://`` prefix.
    """
    url = url.strip()
    # Match the full scheme separator, not just the letters "http".
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url

# Resolve the final URL after following redirects
def get_final_url(url):
    """Request *url*, follow redirects, and return the final response's URL.

    HTTP error statuses are not treated as failures: whatever URL the last
    response has is returned. When the request raises (connection error,
    timeout, invalid URL, ...), a warning is printed and ``None`` is returned.
    """
    try:
        # stream=True: only the final URL is needed, so the response body is
        # not downloaded; the with-block releases the connection afterwards.
        with requests.get(
            url, timeout=5, allow_redirects=True, stream=True
        ) as response:
            return response.url
    except Exception as e:
        # Best-effort lookup: report and let the caller record the failure.
        print(f"⚠️ Redirect 실패: {url} ({e})")
        return None

# Call the PageSpeed API
_PAGESPEED_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

# Metric keys of a result dict, in output order ("error" is appended last).
_PAGESPEED_FIELDS = (
    "performance_score", "FCP", "LCP", "CLS", "TTFB", "SpeedIndex", "TBT",
    "FCP_loadingExperience", "FCP_loadingExperience_category",
    "FID_loadingExperience", "FID_loadingExperience_category",
    "overall_category_loadingExperience",
    "FCP_origin", "FCP_origin_category",
    "FID_origin", "FID_origin_category",
    "overall_category_origin",
)


def _empty_pagespeed_result(error):
    """Return a result dict with every metric set to None and ``error`` set."""
    result = dict.fromkeys(_PAGESPEED_FIELDS)
    result["error"] = error
    return result


def _experience_metric(experience, metric, key):
    """Read ``experience["metrics"][metric][key]``, or None if any level is missing."""
    return experience.get("metrics", {}).get(metric, {}).get(key)


def fetch_pagespeed_data(url, strategy="desktop", timeout=10):
    """Run the PageSpeed Insights v5 API for *url* and flatten the response.

    Args:
        url: Page to analyse. It is sent as a properly encoded query
            parameter, so URLs containing ``?`` or ``&`` are passed intact.
        strategy: Value of the API's ``strategy`` parameter.
        timeout: Timeout in seconds handed to ``requests.get``.
            NOTE(review): if results come back with timeout errors, try a
            larger value — confirm against observed API latency.

    Returns:
        A dict with the same keys in every case: the Lighthouse metrics
        (``performance_score``, ``FCP``, ``LCP``, ``CLS``, ``TTFB`` from the
        ``server-response-time`` audit, ``SpeedIndex``, ``TBT``), the
        ``loadingExperience`` / ``originLoadingExperience`` FCP and FID
        percentiles and categories, the two ``overall_category`` values, and
        ``error``. ``error`` is None on success, ``"HTTP <status>"`` for a
        non-200 response, or the exception text when the call or parsing
        raised; in both failure cases every metric is None.
    """
    # Let requests build the query string so the target URL is percent-encoded.
    params = {"url": url, "key": API_KEY, "strategy": strategy}
    try:
        res = requests.get(_PAGESPEED_ENDPOINT, params=params, timeout=timeout)
        if res.status_code != 200:
            return _empty_pagespeed_result(f"HTTP {res.status_code}")

        data = res.json()
        lighthouse = data.get("lighthouseResult", {})
        audits = lighthouse.get("audits", {})
        categories = lighthouse.get("categories", {})
        loading = data.get("loadingExperience", {})
        origin = data.get("originLoadingExperience", {})

        def audit_value(audit_id):
            return audits.get(audit_id, {}).get("numericValue")

        return {
            "performance_score": categories.get("performance", {}).get("score"),
            "FCP": audit_value("first-contentful-paint"),
            "LCP": audit_value("largest-contentful-paint"),
            "CLS": audit_value("cumulative-layout-shift"),
            "TTFB": audit_value("server-response-time"),
            "SpeedIndex": audit_value("speed-index"),
            "TBT": audit_value("total-blocking-time"),
            "FCP_loadingExperience": _experience_metric(loading, "FIRST_CONTENTFUL_PAINT_MS", "percentile"),
            "FCP_loadingExperience_category": _experience_metric(loading, "FIRST_CONTENTFUL_PAINT_MS", "category"),
            "FID_loadingExperience": _experience_metric(loading, "FIRST_INPUT_DELAY_MS", "percentile"),
            "FID_loadingExperience_category": _experience_metric(loading, "FIRST_INPUT_DELAY_MS", "category"),
            "overall_category_loadingExperience": loading.get("overall_category"),
            "FCP_origin": _experience_metric(origin, "FIRST_CONTENTFUL_PAINT_MS", "percentile"),
            "FCP_origin_category": _experience_metric(origin, "FIRST_CONTENTFUL_PAINT_MS", "category"),
            "FID_origin": _experience_metric(origin, "FIRST_INPUT_DELAY_MS", "percentile"),
            "FID_origin_category": _experience_metric(origin, "FIRST_INPUT_DELAY_MS", "category"),
            "overall_category_origin": origin.get("overall_category"),
            "error": None,
        }
    except Exception as e:
        # Best-effort per URL: record the failure instead of aborting the run.
        return _empty_pagespeed_result(str(e))

# Entry point
def main():
    """Measure a sample of URLs with PageSpeed (desktop) and save the results.

    Reads INPUT_FILE, draws a reproducible random sample of at most 100 rows,
    and for each row: normalizes the ``url`` column value, resolves its final
    URL after redirects, and queries PageSpeed for it. Each output row is the
    original input row extended with ``final_url``, ``tested_url``, the
    PageSpeed metrics, and ``error``. URLs whose redirect lookup fails are
    kept with empty metrics and ``error='Redirect failed'``. All rows are
    written to OUTPUT_FILE once the loop finishes.
    """
    print("🚀 PageSpeed (Desktop) 측정 시작")
    df = pd.read_csv(INPUT_FILE)

    # Sample 100 rows with a fixed seed; capped at the row count so inputs
    # with fewer than 100 rows do not make DataFrame.sample raise.
    sample_size = min(100, len(df))
    df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Metric columns left empty when a URL cannot be measured, in the same
    # order fetch_pagespeed_data() returns them.
    metric_fields = (
        'performance_score', 'FCP', 'LCP', 'CLS', 'TTFB', 'SpeedIndex', 'TBT',
        'FCP_loadingExperience', 'FCP_loadingExperience_category',
        'FID_loadingExperience', 'FID_loadingExperience_category',
        'overall_category_loadingExperience',
        'FCP_origin', 'FCP_origin_category',
        'FID_origin', 'FID_origin_category',
        'overall_category_origin',
    )

    results = []

    for i, row in df.iterrows():
        original_url = format_url(row['url'])
        final_url = get_final_url(original_url)

        # Start from the original row so the input columns are carried over.
        base_data = row.to_dict()

        if not final_url:
            # No reachable URL: keep the row, skip the API call and the delay.
            base_data.update({'final_url': None, 'tested_url': None})
            base_data.update(dict.fromkeys(metric_fields))
            base_data['error'] = 'Redirect failed'
            results.append(base_data)
            continue

        data = fetch_pagespeed_data(final_url)

        base_data.update({
            'final_url': final_url,
            'tested_url': final_url,
            **data
        })

        results.append(base_data)
        print(f"[{i+1}/{len(df)}] ✅ {original_url} → {final_url} | 점수: {data.get('performance_score')}")
        # Pause between API calls.
        time.sleep(1.2)

    # Save the results
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        # os.makedirs('') raises, so only create a directory when there is one.
        os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False)
    print(f"✅ 저장 완료: {OUTPUT_FILE}")

# Run the measurement only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


🚀 PageSpeed (Desktop) 측정 시작
[1/100] ✅ http://myspace.com/lowfreqshaman → https://myspace.com:443/lowfreqshaman | 점수: None
[2/100] ✅ http://kidsfestshellharbour.com.au/index.php?option=com_content&view=article&id=113:day3-tue-a&catid=49&Itemid=75 → http://kidsfestshellharbour.com.au/index.php?option=com_content&view=article&id=113:day3-tue-a&catid=49&Itemid=75 | 점수: None
[3/100] ✅ http://niobestudio.com/index.php/iphone-ipad → https://niobestudio.com/index.php/iphone-ipad | 점수: None
⚠️ Redirect 실패: http://boots4all.com/converse9.htm (('Connection aborted.', ConnectionResetError(10054, '현재 연결은 원격 호스트에 의해 강제로 끊겼습니다', None, 10054, None)))
[5/100] ✅ http://siliconera.com/2010/06/28/sonys-japan-studio-is-working-on-ape-escape-4/ → https://www.siliconera.com/sonys-japan-studio-is-working-on-ape-escape-4/ | 점수: None
[6/100] ✅ http://tools.ietf.org/html/rfc3192 → https://datatracker.ietf.org/doc/html/rfc3192 | 점수: None
[7/100] ✅ http://gonehikin.blogspot.com/2011/06/ricketts-glen-state-park-pa-