In [233]:
import requests
import pandas as pd
from PIL import Image
from bs4 import BeautifulSoup
import random
import string
import os
import glob
import io
import imagehash
import re
from google_images_search import GoogleImagesSearch
import dotenv
dotenv.load_dotenv()

True

In [234]:
IMAGE_FOLDER = f'images_{"".join(random.choices(string.ascii_uppercase + string.digits, k=10))}'
os.mkdir(IMAGE_FOLDER)
IMAGE_FOLDER

'images_LMQ7TC9UFW'

# Load Party Data

In [235]:
MP_LIST = pd.read_csv('mp_data/mp_list.csv')
# MP_LIST = MP_LIST.drop(columns=['Unnamed: 4'])
MP_LIST.head()

Unnamed: 0,Party,Name,Headshot,Description
0,Labour,Kiritapu Allan,allan.jpeg,Kiri Allan is a staunch advocate for the East ...
1,Labour,Ginny Andersen,andersen.jpg,Ginny is a longstanding member of the Labour P...
2,ACT,Chris Ballie,baillie.jpg,"Before entering Parliament, Chris worked for 2..."
3,National,Andew Bayly,bayly.jpeg,National is focused on supporting Port Waikato...
4,Labour,Camilla Belich,belich.jpg,I stood for Parliament because we have more wo...


In [236]:
PARTY_LIST = pd.read_csv('mp_data/party_list.csv')
PARTY_LIST.head()

Unnamed: 0,Party,Party Leader,Right/Left,Ideology,Description
0,Labour Party,Chris Hipkins,-0.5,Social democracy,
1,National Party,Chrisopher Luxon,0.5,"Conservativsm, liberalism",
2,Green Party,James Shaw and Marama Davidson,-1.0,"Green politics, social democracy",
3,ACT,David Seymour,1.0,"Classical liberalism, conservativism",
4,Te Pati Maori,Rawiri Waititi and Debbie Ngarewa-Parker,-1.0,"Maori rights, tino rangatiratanga",


In [237]:
full_name_list = MP_LIST["Name"].tolist()
full_name_list = [x.lower() for x in full_name_list]
full_party_list = PARTY_LIST["Party"].tolist()
full_party_list = [x.lower() for x in full_party_list]

keyword_list = full_name_list + full_party_list
keyword_list

['kiritapu allan',
 'ginny andersen',
 'chris ballie',
 'andew bayly',
 'camilla belich',
 'david bennett',
 'glen bennett',
 'chris bishop',
 'rachel boyack',
 'rachel brooking',
 'chris hipkins',
 'simeon brown',
 'gerry brownly',
 'mark cameron',
 'naisi chen',
 'karen chhour',
 'david clark',
 'tamati coffey',
 'judith collins',
 'simon court',
 'liz craig',
 'marama davidson',
 'kelvin davis',
 'jacqui dean',
 'matt doocey',
 'paul eagle',
 'barbara edmonds',
 'julie anne genter',
 'golriz ghahraman',
 'paul goldsmith',
 'nicola grigg',
 'shanan halbert',
 'peeni henare',
 'emily henderson',
 'harete hipango',
 'willie jackson',
 "anahila kanongata'a",
 'barbara kuriger',
 'ingrid leary',
 'anae neru leavasa',
 'melissa lee',
 'steph lewis',
 'andrew little',
 'jan logie',
 'anna lorck',
 'marja lubeck',
 'christopher luxon',
 'jo luxton',
 'nanaia mahuta',
 'kieran mcanulty',
 'todd mcclay',
 'james mcdowall',
 'nicole mckee',
 'ian mckelvie',
 'tracey mclellan',
 'ricardo menénd

In [238]:
NEWS_ORG_LIST = pd.read_csv('mp_data/news_org_list.csv')
NEWS_ORG_LIST.head()

Unnamed: 0,Name,URL
0,RNZ,https://www.rnz.co.nz
1,Newsroom,https://www.newsroom.co.nz
2,1News,https://www.1news.co.nz
3,Stuff NZ,https://www.stuff.co.nz
4,NZ Herald,https://www.nzherald.co.nz


In [239]:
GOOGLE_DEV_API_KEY = os.environ["GOOGLE_DEV_API_KEY"]
GOOGLE_PROJECT_CX = os.environ["GOOGLE_PROJECT_CX"]

gis = GoogleImagesSearch(GOOGLE_DEV_API_KEY, GOOGLE_PROJECT_CX)



In [240]:
downloaded_thumbnail_list = []

def check_if_existing_thumbnail(img, image_name: str):
    global downloaded_thumbnail_list
    img_hash = imagehash.crop_resistant_hash(img)
    x_size, y_size = img.size
    for i in downloaded_thumbnail_list:
        if i["hash"] == img_hash:
            if i["x_size"] < x_size and i["y_size"] < y_size:
                i["x_size"] = x_size
                i["y_size"] = y_size
                return i["image_name"]
            else:
                return False
    # Hash not found, add to thumbnail list
    downloaded_thumbnail_list.append({
        "hash": img_hash,
        "x_size": x_size,
        "y_size": y_size,
        "image_name": image_name
    })
    return image_name

def download_image(source_url: str, image_url: str, mp_name: str, org_name: str)->dict | None:
    # Check if text contains at least one MP or party
    # Download image
    img_data = requests.get(image_url).content
    img = Image.open(io.BytesIO(img_data))

    randomID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    alt = source_url.split('/')[-1]
    norm_image_name = alt.lower().replace(' ', '_').replace('/','').replace(".","").replace("?","").replace("(","").replace(")","").replace("&","").replace("'","").replace('"','')
    if len(norm_image_name) > 40:
        norm_image_name = norm_image_name[:40]
    image_name = f"{org_name}_{norm_image_name}_{randomID}.jpg"

    record_image = check_if_existing_thumbnail(img, image_name)
    if not record_image:
        print("Skipping image", image_url, alt)
        return None
    
    # If it has been downloaded before, it will return the name of the existing image
    img.convert("RGB").save(f"{IMAGE_FOLDER}/{record_image}", "JPEG")
    
    if record_image == image_name:
        print("Saving image")
        # Has not been downloaded before
        return {
            'source_url': source_url,
            'image_url': image_url,
            'image_path': image_name,
            'mp_name': mp_name,
            'org_name': org_name,
        }
    else:
        return None

In [241]:
def search_images(news_link, mp_name) -> list:
    print("\nGoogling for images of", mp_name['Name'], "on", news_link['Name'])
    _search_params = {
        'q': f"{mp_name['Name']} site:{news_link['URL']}",
        'num': 10,
        'fileType': 'jpg|png',
        'imgType': 'face',
        'imgColorType': 'color',
    }
    # this will only search for images:
    gis.search(search_params=_search_params)
    
    returned_images = []

    for image in gis.results():
        print(".", end="")
        try:
            new_image = download_image(image.referrer_url, image.url, mp_name['Name'], news_link['Name'])
            if new_image:
                returned_images.append(new_image) 
        except Exception as e:
            print("Error downloading image:", e)
    print("Found images:", len(returned_images))
    return returned_images


In [242]:
number_of_images = 10 * len(MP_LIST) * len(PARTY_LIST)
print("Images to download:",number_of_images)
estimated_time = (len(MP_LIST) * len(PARTY_LIST)*16)
print("Estimated time:",round(estimated_time/60),"minutes")


Images to download: 5850
Estimated time: 156 minutes


In [243]:
# search_images("https://www.stuff.co.nz/", "jacinda ardern")

import multiprocessing as mp

def download_worker(news_org):
    print(news_org)
    images = []
    for mp in MP_LIST.iloc:
        # for mp in [{"Name": "Karen"}]:
        new_set = search_images(news_org[1], mp)
        print("Newset", new_set)
        if new_set:
            images.extend(new_set)
    return images

MASTER_IMAGE_TABLE = pd.DataFrame(columns=['source_url', 'image_url', 'image_path', "mp_name", "org_name"])

with mp.Pool(len(NEWS_ORG_LIST)) as pool:
    worker_results = pool.map(download_worker, NEWS_ORG_LIST.iterrows())
    image_list = [item for row in worker_results for item in row]
    MASTER_IMAGE_TABLE = pd.DataFrame(image_list)


(2, Name                      1News
URL     https://www.1news.co.nz
Name: 2, dtype: object)(5, Name                      Newshub
URL     https://www.newshub.co.nz
Name: 5, dtype: object)(3, Name                   Stuff NZ
URL     https://www.stuff.co.nz
Name: 3, dtype: object)
(0, Name                       RNZ
URL     https://www.rnz.co.nz 
Name: 0, dtype: object)(1, Name                      Newsroom
URL     https://www.newsroom.co.nz
Name: 1, dtype: object)


(4, Name                     NZ Herald
URL     https://www.nzherald.co.nz
Name: 4, dtype: object)

Googling for images of
Googling for images of  

Googling for images ofKiritapu Allan  Kiritapu Allan
Googling for images of  ononKiritapu Allan  
Googling for images of 
Googling for images of  onRNZNewsroom Kiritapu AllanKiritapu AllanNZ Herald
 
 onon  Stuff NZKiritapu Allan
Newshub
 
on 1News


..Saving image
..Saving image
..Skipping image https://d3pbdxdl8c65wb.cloudfront.net/cloudinary/2022/Apr/08/fh80tQxSFgFWUGQ0nOaR.jpg emma-espiner-the-death-of-the-mori-party
.Saving image
.Saving image
.Saving image
..Skipping image https://resources.stuff.co.nz/content/dam/images/4/y/r/a/w/q/image.related.StuffLandscapeSixteenByNine.710x400.4yrtru.png/1617667795595.jpg?format=pjpg&optimize=medium conservation-minister-kiritapu-allan-diagnosed-with-stage-3-cervical-cancer
.Skipping image https://www.1news.co.nz/resizer/wv0VynNgVO_Pq2W5lnNiLkIg1CI=/arc-photo-tvnz/ap-se-2-prod/public/LNCWTGHHLVEJFHGVHP2NMMBWQM.png 
.Saving image
.Saving image
.Saving image.
Saving image
.Skipping image https://www.nzherald.co.nz/resizer/mVXOaPfn1e5FBaraSwmw59if15c=/576x324/smart/filters:quality(70)/cloudfront-ap-southeast-2.images.arcpublishing.com/nzme/K7KWYHTXBWCENEYVNUMMD2ZU7U.jpgSkipping image  
https://tvnz-1-news-prod.cdn.arcpublishing.com/resizer/sM5vRmkeHLXy6fg6A-hb6pHG6Pw=/1200x630/filters:forma



Skipping image https://www.nzherald.co.nz/resizer/bfgLdHxOQosW3qBz3Y1CyFkzlNc=/576x377/smart/filters:quality(70)/cloudfront-ap-southeast-2.images.arcpublishing.com/nzme/3PYWXJLSMJAHPERXXJRMNXA5QI.jpg 
.Skipping image https://resources.stuff.co.nz/content/dam/images/4/y/p/l/y/k/image.related.StuffLandscapeSixteenByNine.418x235.21ir44.png/1606185389226.jpg naisi-chen-feels-huge-responsibility-as-only-chinese-mp
.Skipping image https://tvnz-1-news-prod.cdn.arcpublishing.com/resizer/hJ-GzzOjEz5Uai_nRg0a1FVPe0A=/210x140/filters:format(png):quality(70)/cloudfront-ap-southeast-2.images.arcpublishing.com/tvnz/RAZHAXQGFRP5FE3QEVELBAYSLI.png 
.Skipping image https://www.newshub.co.nz/home/politics/2022/09/judith-collins-says-she-self-identify-as-27-year-old-slovakian-model-michael-woodhouse-calls-comment-flippant/_jcr_content/par/video/image.dynimg.360.q75.jpg/v1664237740782/newshub-woodhouse-collins-1120.jpg JudithCollins.html
Found images: 0
Newset []

Googling for images of Willie Jackson on 

In [244]:
MASTER_IMAGE_TABLE.to_csv('ImagesTable.csv', index=False)
MASTER_IMAGE_TABLE

Unnamed: 0,source_url,image_url,image_path,mp_name,org_name
0,https://www.rnz.co.nz/news/political/476976/te...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_terrorism-laws-to-be-expanded-targeting-_V...,Kiritapu Allan,RNZ
1,https://www.rnz.co.nz/news/political/482766/ja...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_jacinda-ardern-quits-live-updates-on-res_9...,Kiritapu Allan,RNZ
2,https://www.rnz.co.nz/news/political/494354/po...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_political-leaders-react-to-resignation-a_I...,Kiritapu Allan,RNZ
3,https://www.rnz.co.nz/news/political/494454/rn...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_rnz-board-member-jason-ake-makes-fresh-c_C...,Kiritapu Allan,RNZ
4,https://www.rnz.co.nz/news/ldr/433417/west-coa...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_west-coast-conservation-board-at-loggerh_W...,Kiritapu Allan,RNZ
...,...,...,...,...,...
540,https://www.newshub.co.nz/home/politics/2023/0...,https://www.newshub.co.nz/home/politics/2023/0...,Newshub_national-s-erica-stanford-claims-micha...,Jan Tinetti,Newshub
541,https://www.newshub.co.nz/home/politics/2022/0...,https://www.newshub.co.nz/home/politics/2022/0...,Newshub_nanaia-mahuta-under-fire-for-criticisi...,Rino Tirikatene,Newshub
542,https://www.newshub.co.nz/home/politics.html,https://www.newshub.co.nz/home/politics/2023/0...,Newshub_politicshtml_B4MOJHVNG3.jpg,Tim van de Molen,Newshub
543,https://www.newshub.co.nz/home/politics/2023/0...,https://www.newshub.co.nz/home/politics/2023/0...,Newshub_government-promises-to-stamp-out-unfai...,Ayesha Verrall,Newshub
