In [1]:
%reload_ext autotime
# Third-party and standard-library dependencies used throughout the notebook.
import wikipedia
import requests
import requests_cache
# Install a process-wide requests cache named "cache" so HTTP calls that go
# through `requests` can be answered from disk when the notebook is re-run.
# NOTE(review): pd.read_html may not fetch through `requests`; confirm whether
# the table download is actually covered by this cache.
requests_cache.install_cache("cache")
import pandas as pd
import transformers
import torch
import json
from tqdm.auto import tqdm
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()
# Allow up to 1000 characters per cell when frames are displayed, so long
# text columns are not cut off.
pd.set_option('display.max_colwidth', 1000)

In [2]:
BASE_URL = "https://en.wikipedia.org"
CABINET_URL = f"{BASE_URL}/wiki/Cabinet_of_New_Zealand"
# Position of the ministers table among the tables pd.read_html finds on the page.
CABINET_TABLE_INDEX = 2

# With extract_links="body" every body cell is parsed as a (text, href) tuple.
# ffill() copies the previous row's value into cells that are missing.
cabinet_raw = (
    pd.read_html(CABINET_URL, header=0, extract_links="body")[CABINET_TABLE_INDEX]
    .rename(columns={"Incumbent.1": "Name", "Unnamed: 6": "Type"})
    .ffill()
)

# Build the cleaned frame in one chain instead of mutating `df` in place:
# the cell is then idempotent and never assigns into a filtered slice.
df = (
    cabinet_raw
    # Split the Name tuple: the href becomes `page`, the text stays in `Name`.
    # `page` is evaluated first, while `Name` still holds the tuples.
    .assign(page=lambda d: d.Name.str[1], Name=lambda d: d.Name.str[0])
    # Rows with any missing value (e.g. a name without a link) are discarded.
    .dropna()
    # NOTE(review): row label 0 is presumably a heading/non-minister row - confirm.
    .drop(index=0)
    .loc[lambda d: d.Name != "Ministers outside Cabinet[n 2]"]
    .drop(columns=["Incumbent"])
    .assign(
        # Keep only the text part of each remaining (text, href) cell ...
        Type=lambda d: d.Type.str[0],
        Portfolios=lambda d: d.Portfolios.str[0],
        Electorate=lambda d: d.Electorate.str[0],
        # ... except for Image, where the link is kept and made absolute.
        Image=lambda d: BASE_URL + d.Image.str[1],
        # Strip the "/wiki/" path so `page` is a bare article title.
        page=lambda d: d.page.str.replace("/wiki/", "", regex=False),
        **{"Additional responsibilities": lambda d: d["Additional responsibilities"].str[0]},
    )
)
df

Unnamed: 0,Portfolios,Image,Name,Additional responsibilities,Electorate,Type,page
2,Prime Minister Minister for National Security and Intelligence,"https://en.wikipedia.org/wiki/File:LUXON,_Christopher_-_Botany_(cropped).png",Christopher Luxon,Minister Responsible for Ministerial Services,Botany,National Party Ministers,Christopher_Luxon
3,Minister of Finance Minister for the Public Service Minister for Social Investment,"https://en.wikipedia.org/wiki/File:WILLIS,_Nicola_-_Ohariu_(cropped).png",Nicola Willis,Associate Minister of Climate Change,List,National Party Ministers,Nicola_Willis
4,Minister of Housing Minister for Infrastructure Minister Responsible for RMA Reform Minister for Sport and Recreation,"https://en.wikipedia.org/wiki/File:BISHOP,_Chris_-_Hutt_South_(cropped).png",Chris Bishop,Leader of the House Associate Minister of Finance,Hutt South,National Party Ministers,Chris_Bishop
5,Minister of Health Minister for Pacific Peoples,"https://en.wikipedia.org/wiki/File:RETI,_Shane_-_Whangarei_(cropped).png",Shane Reti,,Whangārei,National Party Ministers,Shane_Reti
6,Minister for Energy Minister of Local Government Minister of Transport,"https://en.wikipedia.org/wiki/File:BROWN,_Simeon_-_Pakuranga_(cropped).png",Simeon Brown,Minister for Auckland Deputy Leader of the House,Pakuranga,National Party Ministers,Simeon_Brown
7,Minister of Education Minister for Immigration,"https://en.wikipedia.org/wiki/File:STANFORD,_Erica_-_East_Coast_Bays_(cropped).png",Erica Stanford,Lead Coordination Minister for the Government’s Response to the Royal Commission’s Report into Historical Abuse in State Care and in the Care of Faith-based Institutions,East Coast Bays,National Party Ministers,Erica_Stanford
8,"Minister for Arts, Culture and Heritage Minister of Justice Minister for Media and Communications Minister for State Owned Enterprises Minister for Treaty of Waitangi Negotiations","https://en.wikipedia.org/wiki/File:GOLDSMITH,_Paul_-_Epsom_(cropped).png",Paul Goldsmith,,List,National Party Ministers,Paul_Goldsmith_(politician)
9,Minister for the Community and Voluntary Sector Minister for Disability Issues Minister for Social Development and Employment,"https://en.wikipedia.org/wiki/File:UPSTON,_Louise_-_Taupo_(cropped).png",Louise Upston,Minister for Child Poverty Reduction,Taupō,National Party Ministers,Louise_Upston
10,"Attorney-General Minister of Defence Minister for Digitising Government Minister Responsible for the GCSB Minister Responsible for the NZSIS Minister for Science, Innovation and Technology Minister for Space","https://en.wikipedia.org/wiki/File:COLLINS,_Judith_-_Papakura_(cropped).png",Judith Collins,Lead Coordination Minister for the Government’s Response to the Royal Commission’s Report into the Terrorist Attack on the Christchurch Mosques,Papakura,National Party Ministers,Judith_Collins
11,Minister of Corrections Minister for Emergency Management and Recovery Minister of Police,"https://en.wikipedia.org/wiki/File:MITCHELL,_Mark_-_Whangaparaoa_(cropped).png",Mark Mitchell,,Whangaparāoa,National Party Ministers,Mark_Mitchell_(New_Zealand_politician)


In [3]:
# Hugging Face identifier of the model used for the extraction step.
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Text-generation pipeline with bfloat16 weights; device_map="auto" leaves
# device placement to the library.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

2024-12-18 17:09:24.006042: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-18 17:09:24.029531: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-18 17:09:24.052439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-18 17:09:24.058923: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-18 17:09:24.076974: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [4]:
# Columns the model is asked to produce; only these keys are copied from its reply.
EDUCATION_FIELDS = ("degree", "institution", "year", "faculty")
# How many times one page is re-submitted when the reply cannot be parsed.
MAX_RETRIES = 10
# Loop-invariant system prompt. The inner whitespace is kept exactly as it was
# when the prompt lived inside the loop, so the model sees the same text.
SYSTEM_PROMPT = """This is a wikipedia page about a person. Extract the following information about their education, in JSON format:
            degree: A list of degrees the person has, comma separated if there's more than one, or "" if they have none
            institution: The institutions where the person studied, comma separated if there's more than one, or "" if they have none
            year: The year they graduated, comma separated if there's more than one, or "" if they have none
            faculty: The faculties their degrees are under, comma separated if there's more than one. One of Arts, Science, Engineering, Medicine, Law, Business, Education, or "" if they have none.
        
            Do not include comments in your JSON response. Only respond with the JSON object. Make sure the JSON is valid.
        """


def _parse_education(reply):
    """Turn a model reply into a dict holding exactly EDUCATION_FIELDS.

    Args:
        reply: Raw text generated by the model.

    Returns:
        A dict mapping each name in EDUCATION_FIELDS to a string, or None when
        the reply contains no parseable JSON object. Missing or null fields
        become "", lists are joined with ", ", other values are stringified,
        so every column ends up with a single consistent type.
    """
    # Keep only the outermost {...} so markdown fences or surrounding prose
    # do not make an otherwise valid answer unparseable.
    start, end = reply.find("{"), reply.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        parsed = json.loads(reply[start:end + 1])
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None

    education = {}
    for field in EDUCATION_FIELDS:
        value = parsed.get(field)
        if value is None:
            value = ""
        elif isinstance(value, (list, tuple)):
            value = ", ".join(str(item) for item in value)
        education[field] = str(value)
    return education


results = []
for _, minister in tqdm(df.iterrows(), total=len(df)):
    page = wikipedia.page(minister.page, auto_suggest=False)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": page.content},
    ]
    # NOTE(review): retrying only helps if generation is sampled; presumably
    # the model's default generation settings sample - confirm.
    for _attempt in range(MAX_RETRIES):
        outputs = pipeline(
            messages,
            max_new_tokens=1024,
            pad_token_id=pipeline.tokenizer.eos_token_id,
        )
        # The last chat message of the generated conversation is the model's answer.
        reply = outputs[0]["generated_text"][-1]["content"]
        education = _parse_education(reply)
        if education is not None:
            break
        print(f"Unable to parse: {reply}")
    else:
        # Every attempt failed: keep the minister with empty education fields
        # instead of silently dropping the row from the results.
        print(f"Giving up on {minister.page} after {MAX_RETRIES} attempts")
        education = dict.fromkeys(EDUCATION_FIELDS, "")
    # Scraped columns first, then the education fields (which cannot clobber
    # scraped columns because only EDUCATION_FIELDS keys are present).
    results.append({**minister.to_dict(), **education})

df = pd.DataFrame(results)
df

  0%|          | 0/30 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Portfolios,Image,Name,Additional responsibilities,Electorate,Type,page,degree,institution,year,faculty
0,Prime Minister Minister for National Security and Intelligence,"https://en.wikipedia.org/wiki/File:LUXON,_Christopher_-_Botany_(cropped).png",Christopher Luxon,Minister Responsible for Ministerial Services,Botany,National Party Ministers,Christopher_Luxon,Master of Commerce (Business Administration),University of Canterbury,1992,Arts
1,Minister of Finance Minister for the Public Service Minister for Social Investment,"https://en.wikipedia.org/wiki/File:WILLIS,_Nicola_-_Ohariu_(cropped).png",Nicola Willis,Associate Minister of Climate Change,List,National Party Ministers,Nicola_Willis,"First-class honours degree in English literature, post-graduate diploma in journalism","Victoria University of Wellington, University of Canterbury","2003, 2017",Arts
2,Minister of Housing Minister for Infrastructure Minister Responsible for RMA Reform Minister for Sport and Recreation,"https://en.wikipedia.org/wiki/File:BISHOP,_Chris_-_Hutt_South_(cropped).png",Chris Bishop,Leader of the House Associate Minister of Finance,Hutt South,National Party Ministers,Chris_Bishop,"Bachelor of Arts in History and Politics, Bachelor of Laws (first-class honours)",Victoria University of Wellington,2000,"Arts, Law"
3,Minister of Health Minister for Pacific Peoples,"https://en.wikipedia.org/wiki/File:RETI,_Shane_-_Whangarei_(cropped).png",Shane Reti,,Whangārei,National Party Ministers,Shane_Reti,"Bachelor of Human Biology, MB ChB, Diploma in Obstetrics, Diploma in Dermatological Science","University of Waikato, University of Auckland, University of Wales, Cardiff","1985, 1987, 1989, 1991","Arts, Science, Medicine"
4,Minister for Energy Minister of Local Government Minister of Transport,"https://en.wikipedia.org/wiki/File:BROWN,_Simeon_-_Pakuranga_(cropped).png",Simeon Brown,Minister for Auckland Deputy Leader of the House,Pakuranga,National Party Ministers,Simeon_Brown,"Bachelor of Commerce, Bachelor of Laws",University of Auckland,2016,"Arts, Science"
5,Minister of Education Minister for Immigration,"https://en.wikipedia.org/wiki/File:STANFORD,_Erica_-_East_Coast_Bays_(cropped).png",Erica Stanford,Lead Coordination Minister for the Government’s Response to the Royal Commission’s Report into Historical Abuse in State Care and in the Care of Faith-based Institutions,East Coast Bays,National Party Ministers,Erica_Stanford,Bachelor of Arts,University of Auckland,1978,Arts
6,"Minister for Arts, Culture and Heritage Minister of Justice Minister for Media and Communications Minister for State Owned Enterprises Minister for Treaty of Waitangi Negotiations","https://en.wikipedia.org/wiki/File:GOLDSMITH,_Paul_-_Epsom_(cropped).png",Paul Goldsmith,,List,National Party Ministers,Paul_Goldsmith_(politician),MA,University of Auckland,1996,Arts
7,Minister for the Community and Voluntary Sector Minister for Disability Issues Minister for Social Development and Employment,"https://en.wikipedia.org/wiki/File:UPSTON,_Louise_-_Taupo_(cropped).png",Louise Upston,Minister for Child Poverty Reduction,Taupō,National Party Ministers,Louise_Upston,"Master of Business Administration, Bachelor of Laws","Waikato Management School, Rangitoto College","1988, unknown","Business, Law"
8,"Attorney-General Minister of Defence Minister for Digitising Government Minister Responsible for the GCSB Minister Responsible for the NZSIS Minister for Science, Innovation and Technology Minister for Space","https://en.wikipedia.org/wiki/File:COLLINS,_Judith_-_Papakura_(cropped).png",Judith Collins,Lead Coordination Minister for the Government’s Response to the Royal Commission’s Report into the Terrorist Attack on the Christchurch Mosques,Papakura,National Party Ministers,Judith_Collins,"LLB, LLM (Hons), MTaxS, Graduate Diploma in Occupational Health and Safety","University of Canterbury, University of Auckland, Massey University","1979, 2020","Arts, Science"
9,Minister of Corrections Minister for Emergency Management and Recovery Minister of Police,"https://en.wikipedia.org/wiki/File:MITCHELL,_Mark_-_Whangaparaoa_(cropped).png",Mark Mitchell,,Whangaparāoa,National Party Ministers,Mark_Mitchell_(New_Zealand_politician),,"Rosmini College, Wharton Business School",,


In [5]:
# Write the current `df` to cabinet.xlsx in the working directory, leaving out
# the index column.
df.to_excel("cabinet.xlsx", index=False)

In [9]:
# (ministers whose faculty mentions Science or Engineering, ministers with any faculty).
# Missing values are treated as "no faculty": without this, a NaN would turn the
# first count into NaN and would be counted as a faculty in the second.
faculty_filled = df.faculty.fillna("")
(
    int(faculty_filled.str.contains("Science|Engineering", na=False).sum()),
    int(faculty_filled.ne("").sum()),
)

(10, 23)