In [105]:
import requests
import pandas as pd
from PIL import Image
from bs4 import BeautifulSoup
import random
import string
import os
import glob
import io
import imagehash
import re
from google_images_search import GoogleImagesSearch
import dotenv
dotenv.load_dotenv()

True

In [106]:
IMAGE_FOLDER = f'images_{"".join(random.choices(string.ascii_uppercase + string.digits, k=10))}'
os.mkdir(IMAGE_FOLDER)
IMAGE_FOLDER

'images_ML3G9KN5HD'

# Load Party Data

In [107]:
MP_LIST = pd.read_csv('mp_data/mp_list.csv')
# MP_LIST = MP_LIST.drop(columns=['Unnamed: 4'])
MP_LIST.head()

Unnamed: 0,Party,Name,Headshot,Description
0,Labour,Kiritapu Allan,allan.jpeg,Kiri Allan is a staunch advocate for the East ...
1,Labour,Ginny Andersen,andersen.jpg,Ginny is a longstanding member of the Labour P...
2,ACT,Chris Ballie,baillie.jpg,"Before entering Parliament, Chris worked for 2..."
3,National,Andew Bayly,bayly.jpeg,National is focused on supporting Port Waikato...
4,Labour,Camilla Belich,belich.jpg,I stood for Parliament because we have more wo...


In [108]:
PARTY_LIST = pd.read_csv('mp_data/party_list.csv')
PARTY_LIST.head()

Unnamed: 0,Party,Party Leader,Right/Left,Ideology,Description
0,Labour Party,Chris Hipkins,-0.5,Social democracy,
1,National Party,Chrisopher Luxon,0.5,"Conservativsm, liberalism",
2,Green Party,James Shaw and Marama Davidson,-1.0,"Green politics, social democracy",
3,ACT,David Seymour,1.0,"Classical liberalism, conservativism",
4,Te Pati Maori,Rawiri Waititi and Debbie Ngarewa-Parker,-1.0,"Maori rights, tino rangatiratanga",


In [109]:
full_name_list = MP_LIST["Name"].tolist()
full_name_list = [x.lower() for x in full_name_list]
full_party_list = PARTY_LIST["Party"].tolist()
full_party_list = [x.lower() for x in full_party_list]

keyword_list = full_name_list + full_party_list
keyword_list

['kiritapu allan',
 'ginny andersen',
 'chris ballie',
 'andew bayly',
 'camilla belich',
 'david bennett',
 'glen bennett',
 'chris bishop',
 'rachel boyack',
 'rachel brooking',
 'chris hipkins',
 'simeon brown',
 'gerry brownly',
 'mark cameron',
 'naisi chen',
 'karen chhour',
 'david clark',
 'tamati coffey',
 'judith collins',
 'simon court',
 'liz craig',
 'marama davidson',
 'kelvin davis',
 'jacqui dean',
 'matt doocey',
 'paul eagle',
 'barbara edmonds',
 'julie anne genter',
 'golriz ghahraman',
 'paul goldsmith',
 'nicola grigg',
 'shanan halbert',
 'peeni henare',
 'emily henderson',
 'harete hipango',
 'labour party',
 'national party',
 'green party',
 'act',
 'te pati maori']

In [110]:
NEWS_ORG_LIST = pd.read_csv('mp_data/news_org_list.csv')
NEWS_ORG_LIST.head()

Unnamed: 0,Name,URL
0,RNZ,https://www.rnz.co.nz
1,Newsroom,https://www.newsroom.co.nz
2,1News,https://www.1news.co.nz
3,Stuff NZ,https://www.stuff.co.nz
4,NZ Herald,https://www.nzherald.co.nz


In [111]:
GOOGLE_DEV_API_KEY = os.environ["GOOGLE_DEV_API_KEY"]
GOOGLE_PROJECT_CX = os.environ["GOOGLE_PROJECT_CX"]

gis = GoogleImagesSearch(GOOGLE_DEV_API_KEY, GOOGLE_PROJECT_CX)



In [112]:
MASTER_IMAGE_TABLE = pd.DataFrame(columns=['source_url', 'image_url', 'image_path', "mp_name", "org_name"])

In [130]:
downloaded_thumbnail_list = []

def check_if_existing_thumbnail(img, image_name: str):
    global downloaded_thumbnail_list
    img_hash = imagehash.crop_resistant_hash(img)
    x_size, y_size = img.size
    for i in downloaded_thumbnail_list:
        if i["hash"] == img_hash:
            if i["x_size"] < x_size and i["y_size"] < y_size:
                i["x_size"] = x_size
                i["y_size"] = y_size
                return i["image_name"]
            else:
                return False
    # Hash not found, add to thumbnail list
    downloaded_thumbnail_list.append({
        "hash": img_hash,
        "x_size": x_size,
        "y_size": y_size,
        "image_name": image_name
    })
    return image_name

def download_image(source_url: str, image_url: str, mp_name: str, org_name: str):
    global MASTER_IMAGE_TABLE
    # Check if text contains at least one MP or party
    # Download image
    img_data = requests.get(image_url).content
    img = Image.open(io.BytesIO(img_data))

    randomID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    alt = source_url.split('/')[-1]
    norm_image_name = alt.lower().replace(' ', '_').replace('/','').replace(".","")
    if len(norm_image_name) > 40:
        norm_image_name = norm_image_name[:40]
    image_name = f"{org_name}_{norm_image_name}_{randomID}.jpg"

    record_image = check_if_existing_thumbnail(img, image_name)
    if not record_image:
        print("Skipping image", image_url, alt)
        return
    
    # If it has been downloaded before, it will return the name of the existing image
    img.convert("RGB").save(f"{IMAGE_FOLDER}/{record_image}", "JPEG")
    
    if record_image == image_name:
        # Has not been downloaded before
        new_row = pd.DataFrame({
            'source_url': [source_url],
            'image_url': [image_url],
            'image_path': [image_name],
            'mp_name': [mp_name],
            'org_name': [org_name],
        })
        MASTER_IMAGE_TABLE = pd.concat([MASTER_IMAGE_TABLE, new_row], ignore_index=True)

In [131]:
def search_images(news_link, mp_name):
    print("\nGoogling for images of", mp_name['Name'], "on", news_link['Name'])
    _search_params = {
        'q': f"{mp_name['Name']} site:{news_link['URL']}",
        'num': 10,
        'fileType': 'jpg|png',
        'imgType': 'face',
        'imgColorType': 'color',
    }
    # this will only search for images:
    gis.search(search_params=_search_params)
    for image in gis.results():
        print(".", end="")
        download_image(image.referrer_url, image.url, mp_name['Name'], news_link['Name'])        


In [132]:
number_of_images = 10 * len(MP_LIST) * len(PARTY_LIST)
print("Images to download:",number_of_images)
estimated_time = (len(MP_LIST) * len(PARTY_LIST)*16)
print("Estimated time:",round(estimated_time/60),"minutes")


Images to download: 1750
Estimated time: 47 minutes


In [133]:
# search_images("https://www.stuff.co.nz/", "jacinda ardern")

import multiprocessing as mp

def download_worker(news_org):
    print(news_org)
    for mp in MP_LIST.iloc:
        search_images(news_org[1], mp)

with mp.Pool(len(NEWS_ORG_LIST)) as pool:
    pool.map(download_worker, NEWS_ORG_LIST.iterrows())


(1, Name                      Newsroom
URL     https://www.newsroom.co.nz
Name: 1, dtype: object)

Googling for images of Kiritapu Allan on Newsroom
..Skipping image https://d3pbdxdl8c65wb.cloudfront.net/cloudinary/2022/Apr/08/fh80tQxSFgFWUGQ0nOaR.jpg emma-espiner-the-death-of-the-mori-party
.....Skipping image https://d3pbdxdl8c65wb.cloudfront.net/cloudinary/2022/Apr/10/ek0RbMggzBcD8pRMwyIr.jpg whaitiri-to-be-absent-from-parliament
...Skipping image https://res.cloudinary.com/cognitives-s3/image/upload/c_fill,dpr_auto,f_auto,fl_lossy,h_800,q_auto,w_1200/v1/cog-aap/n/303/2022/May/11/KwgYIEaDs9M1aMqGXHTr.jpg mature-race-relations-debate-missing-from-parliament

Googling for images of Ginny Andersen on Newsroom
..Skipping image https://res.cloudinary.com/cognitives-s3/image/upload/c_fill,dpr_auto,f_auto,fl_lossy,g_faces:auto,h_440,q_auto,w_970/v1/cog-aap/n/303/2023/Jul/24/gVpVWCHZkchPc5AZsDGi.jpg the-lens-mps-see-mental-health-through-revealed
..Skipping image https://res.cloudinary.com/



.Skipping image https://res.cloudinary.com/cognitives/image/upload/c_limit,dpr_auto,f_auto,fl_lossy,q_auto,w_750/jmbbknzax0pdun14zlvt act-candidates-freely-choose
.Skipping image https://res.cloudinary.com/cognitives/image/upload/c_limit,dpr_auto,f_auto,fl_lossy,q_auto,w_1200/yuvaqfqq0bcksqc7d3c1 act-candidates-freely-choose
.Skipping image https://res.cloudinary.com/cognitives-s3/image/upload/c_fill,dpr_auto,f_auto,fl_lossy,g_faces:auto,h_440,q_auto,w_970/v1/cog-aap/n/303/2023/Jan/23/EuAMmhpGmzTKIu83nuBV.jpg its-not-politics-and-its-not-ok
.Skipping image https://res.cloudinary.com/cognitives-s3/image/upload/c_fill,dpr_auto,f_auto,fl_lossy,h_800,q_auto,w_1200/v1/cog-aap/n/303/2023/Jan/23/EuAMmhpGmzTKIu83nuBV.jpg its-not-politics-and-its-not-ok
.Skipping image https://res.cloudinary.com/cognitives/image/upload/c_limit,dpr_auto,f_auto,fl_lossy,q_auto,w_1200/kvkvq7gsgg3db9wzn9g9 act-candidates-freely-choose
.Skipping image https://d3pbdxdl8c65wb.cloudfront.net/cloudinary/2022/May/17/hZ8s

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f8669596a20>

In [None]:
MASTER_IMAGE_TABLE.to_csv('ImagesTable.csv', index=False)
MASTER_IMAGE_TABLE

Unnamed: 0,source_url,image_url,image_path,mp_name,org_name
0,https://www.rnz.co.nz/news/political/476976/te...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_terrorism-laws-to-be-expanded-targeting-_8...,Kiritapu Allan,RNZ
1,https://www.rnz.co.nz/news/political/482766/ja...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_jacinda-ardern-quits-live-updates-on-res_F...,Kiritapu Allan,RNZ
2,https://www.rnz.co.nz/news/political/494354/po...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_political-leaders-react-to-resignation-a_3...,Kiritapu Allan,RNZ
3,https://www.rnz.co.nz/news/political/494454/rn...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_rnz-board-member-jason-ake-makes-fresh-c_9...,Kiritapu Allan,RNZ
4,https://www.rnz.co.nz/news/ldr/433417/west-coa...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_west-coast-conservation-board-at-loggerh_5...,Kiritapu Allan,RNZ
...,...,...,...,...,...
63,https://www.newsroom.co.nz/three-waters-select...,https://res.cloudinary.com/cognitives-s3/image...,Newsroom_three-waters-select-committee-slates-...,Glen Bennett,Newsroom
64,https://www.newsroom.co.nz/campaign-diary-no-4...,https://res.cloudinary.com/cognitives/image/up...,Newsroom_campaign-diary-no-4-chris-bishop-mp_3...,Chris Bishop,Newsroom
65,https://www.newsroom.co.nz/election-2020-whats...,https://res.cloudinary.com/cognitives/image/up...,Newsroom_election-2020-whats-happening-what-it...,Rachel Boyack,Newsroom
66,https://www.newsroom.co.nz/one-wedding-two-hon...,https://res.cloudinary.com/cognitives-s3/image...,Newsroom_one-wedding-two-honeymoons-and-a-fune...,Chris Hipkins,Newsroom
