In [40]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from PIL import Image
from bs4 import BeautifulSoup
import random
import string
import os
import glob
import io
import imagehash
import re

In [41]:
article_db = pd.DataFrame(columns=['title', 'url', 'image_url', 'image_path', "image_alt", "org_name"])

In [42]:
removing_files = glob.glob('images/*.jpg')
for i in removing_files:
    os.remove(i)


# Load Party Data

In [43]:
MP_LIST = pd.read_csv('mp_data/mp_list.csv')
# MP_LIST = MP_LIST.drop(columns=['Unnamed: 4'])
MP_LIST.head()

Unnamed: 0,Party,Name,Headshot,Description
0,Labour,Kiritapu Allan,allan.jpeg,Kiri Allan is a staunch advocate for the East ...
1,Labour,Ginny Andersen,andersen.jpg,Ginny is a longstanding member of the Labour P...
2,ACT,Chris Ballie,baillie.jpg,"Before entering Parliament, Chris worked for 2..."
3,National,Andew Bayly,bayly.jpeg,National is focused on supporting Port Waikato...
4,Labour,Camilla Belich,belich.jpg,I stood for Parliament because we have more wo...


In [44]:
PARTY_LIST = pd.read_csv('mp_data/party_list.csv')
PARTY_LIST.head()

Unnamed: 0,Party,Party Leader,Right/Left,Ideology,Description
0,Labour Party,Chris Hipkins,-0.5,Social democracy,
1,National Party,Chrisopher Luxon,0.5,"Conservativsm, liberalism",
2,Green Party,James Shaw and Marama Davidson,-1.0,"Green politics, social democracy",
3,ACT,David Seymour,1.0,"Classical liberalism, conservativism",
4,Te Pati Maori,Rawiri Waititi and Debbie Ngarewa-Parker,-1.0,"Maori rights, tino rangatiratanga",


In [45]:
full_name_list = MP_LIST["Name"].tolist()
full_name_list = [x.lower() for x in full_name_list]
full_party_list = PARTY_LIST["Party"].tolist()
full_party_list = [x.lower() for x in full_party_list]

keyword_list = full_name_list + full_party_list
keyword_list

['kiritapu allan',
 'ginny andersen',
 'chris ballie',
 'andew bayly',
 'camilla belich',
 'david bennett',
 'glen bennett',
 'chris bishop',
 'rachel boyack',
 'rachel brooking',
 'chris hipkins',
 'labour party',
 'national party',
 'green party',
 'act',
 'te pati maori']

In [46]:
downloaded_thumbnail_list = []

def check_if_existing_thumbnail(img, image_name: str):
    global downloaded_thumbnail_list
    img_hash = imagehash.average_hash(img)
    x_size, y_size = img.size
    for i in downloaded_thumbnail_list:
        if i["hash"] == img_hash:
            if i["x_size"] < x_size and i["y_size"] < y_size:
                i["x_size"] = x_size
                i["y_size"] = y_size
                return i["image_name"]
            else:
                return False
    # Hash not found, add to thumbnail list
    downloaded_thumbnail_list.append({
        "hash": img_hash,
        "x_size": x_size,
        "y_size": y_size,
        "image_name": image_name
    })
    return image_name

def download_image(article_name: str, article_url: str, alt: str, image_url: str, org_name: str):
    global article_db
    # Check if text contains at least one MP or party
    if not any(word in alt.lower() for word in keyword_list):
        return
    print(alt, image_url)
    # Download image
    img_data = requests.get(image_url).content
    img = Image.open(io.BytesIO(img_data))

    randomID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    norm_image_name = alt.lower().replace(' ', '_').replace('/','')
    if len(norm_image_name) > 40:
        norm_image_name = norm_image_name[:40]
    image_name = f"{org_name}_{norm_image_name}_{randomID}.jpg"

    record_image = check_if_existing_thumbnail(img, image_name)
    if not record_image:
        print("Skipping image", image_url, alt)
        return
    
    # If it has been downloaded before, it will return the name of the existing image
    img.save("images/"+record_image, "JPEG")
    
    if record_image == image_name:
        # Has not been downloaded before
        new_row = pd.DataFrame({
            'title': [article_name],
            'url': [article_url],
            'image_url': [image_url],
            'image_path': [image_name],
            'image_alt': [alt],
            'org_name': [org_name],
        })
        article_db = pd.concat([article_db, new_row], ignore_index=True)

In [47]:
def get_rss_feed(url: str) -> list:
    rss_feed = requests.get(url).text
    tree = ET.fromstring(rss_feed)
    tree = tree[0]

    # Get all items, and their links and their article links
    article_list = []

    for article in tree.findall("./item"):
        article_link = article.findtext("link")
        if article_link is None:
            print("Missing article link?")
            continue
        title = article.findtext("title")
        article_html = requests.get(article_link).text
        article_list.append({"title": title, "article_link": article_link, "article_html": article_html})

    return article_list

# Get list of articles

In [48]:
# RNZ
RNZ_POLITICAL_FEED = "https://www.rnz.co.nz/rss/political.xml"
rnz_article_list = get_rss_feed(RNZ_POLITICAL_FEED)

In [49]:
# Get article and image data

for article in rnz_article_list:
    # Parse HTML
    soup = BeautifulSoup(article["article_html"])
    for i in soup.find_all("img"):
        if i["src"].startswith("https://rnz-ressh.cloudinary.com/image"):
            # Detect if full body image
            if not i["alt"] or i["alt"].startswith("No caption"):
                continue
            first_word  = i["alt"].split(" ")[0]
            # Non-full body images have a hash in front as the first word
            if len(first_word)==7 and any(char.isdigit() for char in first_word) and any(char.isalpha() for char in first_word):
                continue
            download_image(article["title"], article["article_link"],i["alt"], i["src"], "rnz")

Chris Hipkins and Labour hoardings https://rnz-ressh.cloudinary.com/image/upload/s--cHodeeec--/c_scale,f_auto,q_auto,w_1050/v1692320291/4L42YN3_FOCUS_ON_POLITICS_gst_jpg
Prime Minister Chris Hipkins announces Labour's tax policy ahead of the 2023 general election, in Lower Hutt on 13 August, 2023. https://rnz-ressh.cloudinary.com/image/upload/s--st3mAZoT--/ar_16:10,c_fill,f_auto,g_auto,q_auto,w_1050/v1691963292/4L4C4ML_RNZD4461_jpg
Green Party Co-Leader James Shaw https://rnz-ressh.cloudinary.com/image/upload/s--AwYyYxck--/ar_16:10,c_fill,f_auto,g_auto,q_auto,w_576/v1691708882/4L8V8IH_20230517_James_Shaw0001_jpg
Chris Hipkins and Winston Peters https://rnz-ressh.cloudinary.com/image/upload/s--Kxp0hvdO--/ar_16:10,c_fill,f_auto,g_auto,q_auto,w_1050/v1692312571/4L434LJ_week_jpg
Prime Minister Chris Hipkins and other Labour MPs announce four weeks paid parental leave for partners at Brooklyn Kindergarten https://rnz-ressh.cloudinary.com/image/upload/s--Mk1wcMVo--/ar_16:10,c_fill,f_auto,g_a

In [50]:
article_db.to_csv('article_db.csv', index=False)
article_db.head()

Unnamed: 0,title,url,image_url,image_path,image_alt,org_name
0,Focus on Politics: Labour's GST-free strategy ...,https://www.rnz.co.nz/national/programmes/focu...,https://rnz-ressh.cloudinary.com/image/upload/...,rnz_chris_hipkins_and_labour_hoardings_R2D4EVC...,Chris Hipkins and Labour hoardings,rnz
1,Focus on Politics: Labour's GST-free strategy ...,https://www.rnz.co.nz/national/programmes/focu...,https://rnz-ressh.cloudinary.com/image/upload/...,rnz_prime_minister_chris_hipkins_announces_l_5...,Prime Minister Chris Hipkins announces Labour'...,rnz
2,Government announces $370m plan to help farmer...,https://www.rnz.co.nz/news/national/496070/gov...,https://rnz-ressh.cloudinary.com/image/upload/...,rnz_green_party_co-leader_james_shaw_GSX1FMXBY...,Green Party Co-Leader James Shaw,rnz
3,Week in Politics: NZ First gains traction whil...,https://www.rnz.co.nz/news/political/496090/we...,https://rnz-ressh.cloudinary.com/image/upload/...,rnz_chris_hipkins_and_winston_peters_SMPBZ2YZD...,Chris Hipkins and Winston Peters,rnz
4,Week in Politics: NZ First gains traction whil...,https://www.rnz.co.nz/news/political/496090/we...,https://rnz-ressh.cloudinary.com/image/upload/...,rnz_prime_minister_chris_hipkins_and_other_l_K...,Prime Minister Chris Hipkins and other Labour ...,rnz


In [51]:
# RNZ
HERALD_POLITICAL_FEED = "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/nz/?outputType=xml&_website=nzh"
herald_article_list = get_rss_feed(HERALD_POLITICAL_FEED)

In [52]:
# Get article and image data

for article in herald_article_list:
    # Parse HTML
    soup = BeautifulSoup(article["article_html"])
    for i in soup.find_all("img"):
        print(i)
        if i["src"].startswith("https://www.nzherald.co.nz/resizer"):
            # Detect if full body image
            if not i["alt"] or i["alt"].startswith("No caption"):
                continue
            # The Resizer holds the link to the full-quality photo
            resizer = i["src"]
            match = re.match(r"(cloudfront-ap-southeast-2\.images\.arcpublishing\.com\/.*)", resizer)
            if match:
                # Get the full-quality image
                full_image = "https://" + match.group(1)
                # Download the image
                download_image(article["title"], article["article_link"],i["alt"], full_image, "herald")

<img alt="Voyager 2023 media awards" src="/pf/resources/images/brands/Voyager-2023-Website-OTY.svg?d=647"/>
<img alt="Thomas Coughlan" class="author__image" data-test-ui="author__image" src="https://s3.amazonaws.com/arc-authors/nzme/f87e20f7-9a5c-4fdd-a0ca-90ad59252111.png"/>
<img src="https://media2.nzherald.co.nz/image.jpg" style="display: none;"/>
<img alt="Watch: NZ First candidate Shane Jones sings to voters" class="story-card__image-link__picture responsively-lazy" data-srcset="https://www.nzherald.co.nz/resizer/Uwblk7ELGxA-2NPUaBLp5ayntgY=/300x169/smart/filters:quality(70)/cloudfront-ap-southeast-2.images.arcpublishing.com/nzme/UKCPGCLZSBE25OWBJVIC25BY5M.png 320w,https://www.nzherald.co.nz/resizer/Kl018sLBgaagafrip7mObe_PLWM=/576x324/smart/filters:quality(70)/cloudfront-ap-southeast-2.images.arcpublishing.com/nzme/UKCPGCLZSBE25OWBJVIC25BY5M.png 576w,https://www.nzherald.co.nz/resizer/OVmz3sI2EbIqAQ_erevHqgWSmGA=/620x349/smart/filters:quality(70)/cloudfront-ap-southeast-2.images.