In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
from tqdm import tqdm
import time

headers = {"User-Agent": "Mozilla/5.0 (DataCollector)"}

# üìã Daftar halaman yang akan digabung
list_urls = [
    "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue",
    "https://en.wikipedia.org/wiki/List_of_unicorn_startup_companies",
    "https://en.wikipedia.org/wiki/List_of_technology_companies",
    "https://en.wikipedia.org/wiki/List_of_largest_manufacturing_companies_by_revenue"
]

companies = set()

# --- Ambil nama perusahaan dari beberapa halaman ---
print("üîç Mengambil daftar perusahaan dari beberapa halaman Wikipedia...")
for url in list_urls:
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    # Cari nama dari kolom tabel
    for row in soup.select("table.wikitable tbody tr"):
        cols = row.find_all("td")
        if len(cols) > 0:
            name = cols[0].text.strip()
            if not name:
                continue
            # Beberapa tabel menaruh nama di kolom ke-1 atau ke-2
            if len(name.split()) < 2 and len(cols) > 1:
                name = cols[1].text.strip()
            companies.add(name)

print(f"‚úÖ Total unik perusahaan ditemukan: {len(companies)}")

# --- Ambil data ringkasan dari Wikipedia REST API ---
results = []
for company in tqdm(list(companies)[:1000], desc="üì¶ Mengambil data dari Wikipedia"):
    try:
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{quote(company)}"
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            data = res.json()
            results.append({
                "name": data.get("title", ""),
                "description": data.get("description", ""),
                "summary": data.get("extract", "")
            })
        time.sleep(0.2)
    except Exception as e:
        print(f"‚ö†Ô∏è Gagal ambil data {company}: {e}")

# --- Simpan ke CSV ---
df = pd.DataFrame(results)
df = df.drop_duplicates(subset=["name"])
df.to_csv("wikipedia_companies.csv", index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Selesai! {len(df)} data disimpan ke 'wikipedia_companies.csv'")


üîç Mengambil daftar perusahaan dari beberapa halaman Wikipedia...
‚úÖ Total unik perusahaan ditemukan: 554


üì¶ Mengambil data dari Wikipedia: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 554/554 [07:43<00:00,  1.20it/s]


‚úÖ Selesai! 330 data disimpan ke 'wikipedia_companies.csv'



