Wikipedia's Terms of Use (published by the Wikimedia Foundation) state:

"You are free to:

Read and Print our articles and other media free of charge.

Share and Reuse our articles and other media under free and open licenses.

Contribute To and Edit our various websites or Projects."

In [127]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [128]:
# English-Wikipedia article URLs for the 16 Taiwanese indigenous tribes,
# built from the shared base URL plus each article's page title.
tribe_urls = [
    f"https://en.wikipedia.org/wiki/{page_title}"
    for page_title in (
        "Amis_people",
        "Atayal_people",
        "Bunun_people",
        "Kanakanavu_people",
        "Kavalan_people",
        "Paiwan_people",
        "Puyuma_people",
        "Rukai_people",
        "Saaroa_people",
        "Saisiyat_people",
        "Sakizaya_people",
        "Seediq_people",
        "Taroko_people",
        "Thao_people",
        "Tsou_people",
        "Tao_people",
    )
]

In [129]:
# Accumulator for the scraped rows (one dict per tribe page)
data = list()

In [130]:
# Scrape every tribe page into `data`, one dict per page with the keys
# "Tribe", "Full Text" and "Last Edited".
# `data` is (re)initialised here so that re-running this cell never appends
# duplicate rows to a list left over from a previous run.
data = []

# Wikimedia's User-Agent policy asks automated clients to identify themselves.
# NOTE(review): add a contact URL or e-mail address to this string before heavy use.
REQUEST_HEADERS = {"User-Agent": "taiwan-indigenous-tribes-notebook/1.0 (python-requests)"}
# Seconds; without a timeout requests.get() can wait on a stalled connection forever.
REQUEST_TIMEOUT = 30


def _stripped_text(tag):
    """Return the whitespace-stripped text of a BeautifulSoup tag, or None if the tag was not found."""
    return tag.text.strip() if tag is not None else None


for url in tribe_urls:
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page as an article
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Tribe name = page title; None if the heading element is missing
    tribe_name = _stripped_text(soup.find("h1", {"id": "firstHeading"}))

    # Join the text of every non-blank <p> element into a single string
    paragraphs = soup.find_all("p")
    full_text = " ".join(para.text for para in paragraphs if para.text.strip() != "")

    # Strip newline characters so the text is a single line
    full_text = full_text.replace("\n", "")

    # "This page was last edited on ..." footer line; None if the footer is missing
    last_edited_text = _stripped_text(soup.find("li", {"id": "footer-info-lastmod"}))

    # Append this page's row to the list
    data.append({"Tribe": tribe_name, "Full Text": full_text, "Last Edited": last_edited_text})

In [131]:
# Build a DataFrame from the scraped rows: one row per dict, one column per key
df = pd.DataFrame(data=data)

In [132]:
# Show the DataFrame as this cell's output to inspect the scraped rows
df

Unnamed: 0,Tribe,Full Text,Last Edited
0,Amis people,Nationally Recognized Locally recognized Unrec...,"This page was last edited on 5 December 2024, ..."
1,Atayal people,"The Atayal (Chinese: 泰雅; pinyin: Tàiyǎ), also ...","This page was last edited on 7 December 2024, ..."
2,Bunun people,"The Bunun (Bunun: Bunun), also historically kn...","This page was last edited on 26 November 2024,..."
3,Kanakanavu people,The Kanakanavu (Chinese: 卡那卡那富族; Wade–Giles: K...,"This page was last edited on 26 November 2024,..."
4,Kavalan people,"The Kavalan (endonym kbalan [kɨβaɾán]; ""people...","This page was last edited on 26 November 2024,..."
5,Paiwan people,The Paiwan (Paiwan: Kacalisian; Chinese: 排灣; P...,"This page was last edited on 28 November 2024,..."
6,Puyuma people,The Puyuma (Chinese: 卑南族; pinyin: Bēinánzú; Pe...,"This page was last edited on 26 November 2024,..."
7,Rukai people,The Rukai (Rukai: Drekay) are one of the indig...,"This page was last edited on 26 November 2024,..."
8,Saaroa people,The Saaroa or Hla'alua people (Chinese: 拉阿魯哇族)...,"This page was last edited on 26 November 2024,..."
9,Saisiyat people,The Saisiyat (Chinese: 賽夏; pinyin: Sàixià; Wad...,"This page was last edited on 26 November 2024,..."


In [133]:
# Write the table to disk as UTF-8 CSV, leaving out the DataFrame's index column
df.to_csv(
    path_or_buf="taiwan_indigenous_tribes.csv",
    encoding="utf-8",
    index=False,
)
