In [55]:
import requests
import pandas as pd
from PIL import Image
from bs4 import BeautifulSoup
import random
import string
import os
import glob
import io
import imagehash
import re
from google_images_search import GoogleImagesSearch
import dotenv
from tqdm.notebook import tqdm_notebook
dotenv.load_dotenv()

True

In [56]:
# Each run writes into its own folder: "images_" followed by ten random
# uppercase letters / digits, created relative to the working directory.
_SUFFIX_ALPHABET = string.ascii_uppercase + string.digits
IMAGE_FOLDER = "images_" + "".join(random.choices(_SUFFIX_ALPHABET, k=10))
os.mkdir(IMAGE_FOLDER)  # raises FileExistsError if the folder already exists
IMAGE_FOLDER

'images_8303TG1IHS'

# Load MP, Party and News Organisation Data

In [57]:
# Table of MPs, read from a path relative to the working directory. The preview
# below shows the columns Party, Name, Headshot and Description; the "Name"
# column is what download_worker matches against TARGET_MP_HIGHLIGHTS.
MP_LIST = pd.read_csv('mp_data/mp_list.csv')
MP_LIST.head()

Unnamed: 0,Party,Name,Headshot,Description
0,Labour,Kiritapu Allan,allan.jpeg,Kiri Allan is a staunch advocate for the East ...
1,Labour,Ginny Andersen,andersen.jpg,Ginny is a longstanding member of the Labour P...
2,ACT,Chris Ballie,baillie.jpg,"Before entering Parliament, Chris worked for 2..."
3,National,Andew Bayly,bayly.jpeg,National is focused on supporting Port Waikato...
4,Labour,Camilla Belich,belich.jpg,I stood for Parliament because we have more wo...


In [58]:
# Table of parties, read from a path relative to the working directory. The
# preview below shows the columns Party, Party Leader, Right/Left, Ideology and
# Description.
# NOTE(review): no other visible cell reads PARTY_LIST — confirm it is still needed.
PARTY_LIST = pd.read_csv('mp_data/party_list.csv')
PARTY_LIST.head()

Unnamed: 0,Party,Party Leader,Right/Left,Ideology,Description
0,Labour Party,Chris Hipkins,-0.5,Social democracy,
1,National Party,Chrisopher Luxon,0.5,"Conservativsm, liberalism",
2,Green Party,James Shaw and Marama Davidson,-1.0,"Green politics, social democracy",
3,ACT,David Seymour,1.0,"Classical liberalism, conservativism",
4,Te Pati Maori,Rawiri Waititi and Debbie Ngarewa-Parker,-1.0,"Maori rights, tino rangatiratanga",


In [59]:
# Table of news organisations (columns Name and URL in the preview below).
# search_images uses "URL" for the `site:` filter of the Google query and
# "Name" as the label stored with every downloaded image.
NEWS_ORG_LIST = pd.read_csv('mp_data/news_org_list.csv')
NEWS_ORG_LIST.head()

Unnamed: 0,Name,URL
0,RNZ,https://www.rnz.co.nz
1,Newsroom,https://www.newsroom.co.nz
2,1News,https://www.1news.co.nz
3,Stuff NZ,https://www.stuff.co.nz
4,NZ Herald,https://www.nzherald.co.nz


In [60]:
# Credentials come from the process environment (dotenv.load_dotenv() runs in
# the import cell). Indexing os.environ raises KeyError when a variable is
# missing, so a misconfigured environment fails here rather than mid-download.
GOOGLE_DEV_API_KEY = os.environ["GOOGLE_DEV_API_KEY"]
GOOGLE_PROJECT_CX = os.environ["GOOGLE_PROJECT_CX"]

# Single search client for the notebook; search_images calls gis.search() and
# then reads the hits back with gis.results().
gis = GoogleImagesSearch(GOOGLE_DEV_API_KEY, GOOGLE_PROJECT_CX)



In [61]:
# Registry of the images seen so far. check_if_existing_thumbnail appends one
# dict per distinct image with the keys "hash", "x_size", "y_size" and
# "image_name", and scans this list to detect duplicates.
downloaded_thumbnail_list = []

# Optional seeding step, currently disabled: it would pre-register every file
# found in ACCEPTED_PHOTOS_FOLDER with an oversized 999999 x 999999 entry so
# that a later download could never count as "larger" and replace it.
# # Identify photos already gotten from scraping
# ACCEPTED_PHOTOS_FOLDER = '../bias-detector/images'
# ACCEPTED_PHOTOS = [f for f in os.listdir(ACCEPTED_PHOTOS_FOLDER) if os.path.isfile(os.path.join(ACCEPTED_PHOTOS_FOLDER, f))]
# for photo in tqdm_notebook(ACCEPTED_PHOTOS):
#     img = Image.open(os.path.join(ACCEPTED_PHOTOS_FOLDER, photo))
#     img_hash = imagehash.crop_resistant_hash(img)
#     downloaded_thumbnail_list.append({
#         "hash": img_hash,
#         "x_size": 999999, # Never replace
#         "y_size": 999999, # Never replace
#         "image_name": photo
#     })
# downloaded_thumbnail_list

In [62]:
def check_if_existing_thumbnail(img, image_name: str):
    """Decide which file name, if any, `img` should be stored under.

    The crop-resistant hash of `img` is compared against the records in the
    module-level `downloaded_thumbnail_list`; only the first matching record
    is considered.

    Returns:
        * `image_name` when no record matches (a new record is appended);
        * the matching record's existing "image_name" when `img` is strictly
          larger than the recorded size in both width and height (the record's
          size is raised to the new size);
        * `False` when a record matches but `img` is not larger in both
          dimensions.
    """
    global downloaded_thumbnail_list
    candidate_hash = imagehash.crop_resistant_hash(img)
    width, height = img.size

    # First record whose hash compares equal to the candidate, if any.
    match = next(
        (rec for rec in downloaded_thumbnail_list if rec["hash"] == candidate_hash),
        None,
    )

    if match is None:
        # Unseen image: register it under the proposed name.
        downloaded_thumbnail_list.append({
            "hash": candidate_hash,
            "x_size": width,
            "y_size": height,
            "image_name": image_name
        })
        return image_name

    if match["x_size"] < width and match["y_size"] < height:
        # Same picture at a higher resolution: remember the new size and
        # hand back the name already in use.
        match["x_size"] = width
        match["y_size"] = height
        return match["image_name"]

    return False

# Spaces become underscores; the listed punctuation is dropped from file names.
_IMAGE_NAME_TRANSLATION = str.maketrans(" ", "_", "/.?()&'\":")
# Longest slug (derived from the article URL) kept inside a file name.
_IMAGE_NAME_MAX_LENGTH = 40


def download_image(source_url: str, image_url: str, mp_name: str, org_name: str, timeout: float = 30.0)->dict | None:
    """Fetch one image, de-duplicate it and store it as a JPEG in IMAGE_FOLDER.

    Args:
        source_url: URL of the page the image was found on; its last path
            segment becomes part of the file name.
        image_url: URL of the image itself.
        mp_name: Name of the MP searched for; copied into the returned record.
        org_name: Name of the news organisation; used as the file-name prefix
            and copied into the returned record.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        A record dict (keys 'source_url', 'image_url', 'image_path', 'mp_name',
        'org_name') when the image is new. None when the image duplicates a
        known one -- either it was skipped, or it was larger and was written
        over the existing file, which already has its own record.

    Raises:
        requests.RequestException: on timeouts, connection errors and HTTP
            error statuses.
        Exception: whatever PIL raises when the payload is not a readable
            image or cannot be saved. Callers are expected to handle these.
    """
    # A timeout keeps one stalled host from blocking the worker forever, and
    # raise_for_status() reports HTTP errors as such instead of letting PIL
    # fail on an HTML error page.
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    img = Image.open(io.BytesIO(response.content))

    randomID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    alt = source_url.split('/')[-1]
    norm_image_name = alt.lower().translate(_IMAGE_NAME_TRANSLATION)[:_IMAGE_NAME_MAX_LENGTH]
    image_name = f"{org_name}_{norm_image_name}_{randomID}.jpg"

    record_image = check_if_existing_thumbnail(img, image_name)
    if not record_image:
        print("Skipping image", image_url, alt)
        return None

    # record_image is either the new name or, for a larger copy of a known
    # image, the name of the file to overwrite.
    img.convert("RGB").save(os.path.join(IMAGE_FOLDER, record_image), "JPEG")

    if record_image != image_name:
        # An existing file was replaced; it is already in the table.
        return None

    print("Saving image")
    return {
        'source_url': source_url,
        'image_url': image_url,
        'image_path': image_name,
        'mp_name': mp_name,
        'org_name': org_name,
    }

In [63]:
def search_images(news_link, mp_name) -> list:
    """Search one news site for face images of one MP and download the hits.

    Args:
        news_link: mapping with 'Name' and 'URL' entries for the news
            organisation; 'URL' restricts the query through `site:`.
        mp_name: mapping with a 'Name' entry for the MP to search for.

    Returns:
        The non-empty records returned by download_image, in result order.
        A failure on a single image is printed and that image is skipped.
    """
    person = mp_name['Name']
    outlet = news_link['Name']
    print("\nGoogling for images of", person, "on", outlet)

    # Image-only query: up to 20 colour jpg/png face images from the given site.
    query = dict(
        q=f"{person} site:{news_link['URL']}",
        num=20,
        fileType='jpg|png',
        imgType='face',
        imgColorType='color',
    )
    gis.search(search_params=query)

    returned_images = []
    for hit in gis.results():
        print(".", end="")  # one dot of progress per search hit
        try:
            record = download_image(hit.referrer_url, hit.url, person, outlet)
        except Exception as err:
            print("Error downloading image:", err)
            continue
        if record:
            returned_images.append(record)

    print("Found images:", len(returned_images))
    return returned_images


In [64]:
# MPs to collect images for. download_worker only searches for MP_LIST rows
# whose "Name" is in this list, so each entry must match the CSV spelling
# exactly.
# NOTE(review): "Chrisopher Luxon" and "Debbie Ngarewa-Parker" look like
# misspellings of "Christopher Luxon" / "Debbie Ngarewa-Packer" -- check how
# mp_data/mp_list.csv spells them before changing anything here.
# NOTE(review): a name that does not occur in MP_LIST is skipped without any
# message -- TODO confirm every entry below is present in the CSV.
TARGET_MP_HIGHLIGHTS = [
    "Chris Hipkins",
    "Chrisopher Luxon",
    "James Shaw",
    "Marama Davidson",
    "David Seymour",
    "Rawiri Waititi",
    "Debbie Ngarewa-Parker",
    "Jacinda Ardern",
    "Winston Peters",
    "John Key"
]

In [65]:
# One search per (highlighted MP, news organisation) pair.
search_pair_count = len(TARGET_MP_HIGHLIGHTS) * len(NEWS_ORG_LIST)

# Planning figures only: 10 images and 16 seconds are budgeted per pair.
number_of_images = search_pair_count * 10
estimated_time = search_pair_count * 16

print("Images to download:", number_of_images)
print("Estimated time:", round(estimated_time / 60), "minutes")


Images to download: 600
Estimated time: 16 minutes


In [66]:
# search_images("https://www.stuff.co.nz/", "jacinda ardern")

import multiprocessing as mp

def download_worker(news_org):
    """Collect image records for every highlighted MP from one news organisation.

    Args:
        news_org: an ``(index, row)`` pair as yielded by
            ``NEWS_ORG_LIST.iterrows()``; only the row (element 1) is used.

    Returns:
        A flat list of the record dicts returned by search_images for each
        MP_LIST row whose "Name" is in TARGET_MP_HIGHLIGHTS. A failed search
        is reported and skipped so the remaining MPs are still processed.
    """
    images = []
    # Select the highlighted MPs once and walk them with iterrows(). The loop
    # variable is deliberately not called `mp`, which is the notebook's alias
    # for the multiprocessing module.
    highlighted_mps = MP_LIST[MP_LIST["Name"].isin(TARGET_MP_HIGHLIGHTS)]
    for _, mp_row in highlighted_mps.iterrows():
        try:
            images.extend(search_images(news_org[1], mp_row))
        except Exception as e:
            # Best effort: say which search failed and carry on.
            print(f"Search failed for {mp_row['Name']}: {e!r}")
    return images

# Column order of the final table; these are the keys of the record dicts
# returned by download_image.
IMAGE_TABLE_COLUMNS = ['source_url', 'image_url', 'image_path', "mp_name", "org_name"]

# One worker process per news organisation. max(1, ...) keeps Pool() from
# raising when the organisation list is empty.
# NOTE(review): each process works on its own copy of downloaded_thumbnail_list,
# so duplicates are presumably only detected within one organisation -- confirm
# that this is intended.
with mp.Pool(max(1, len(NEWS_ORG_LIST))) as pool:
    worker_results = pool.map(download_worker, NEWS_ORG_LIST.iterrows())

# Flatten the per-organisation lists into one list of records.
image_list = [item for row in worker_results for item in row]

# Passing columns= keeps the schema (and the CSV header written later) intact
# even when no image was downloaded at all.
MASTER_IMAGE_TABLE = pd.DataFrame(image_list, columns=IMAGE_TABLE_COLUMNS)



Googling for images of
Googling for images of
Googling for images of
Googling for images of
Googling for images of
Googling for images of      Chris HipkinsChris HipkinsChris HipkinsChris HipkinsChris HipkinsChris Hipkins      onononononon      NewshubNewsroomStuff NZNZ Herald1News
RNZ




...Saving image
.Saving image
.Saving image
.Saving image
.Saving image
.Saving imageSaving image

...Skipping image https://resources.stuff.co.nz/content/dam/images/4/z/3/1/x/z/image.related.StuffLandscapeSixteenByNine.710x400.261jsc.png/1682606802265.jpg?format=pjpg&optimize=medium chris-hipkins-vs-christopher-luxon-the-tale-of-two-chrises
.Saving imageSaving image

..Saving image
.Saving image
..Saving image
.Saving image.
.Saving image
.Saving image
.Saving image
.Skipping image https://res.cloudinary.com/cognitives-s3/image/upload/c_fill,dpr_auto,f_auto,fl_lossy,g_faces:auto,h_440,q_auto,w_970/v1/cog-aap/n/303/2023/Jul/12/2IjZdC8OehdOObzEsqeY.jpg chris-hipkins-gift-to-grant-robertson-from-lithu



Skipping image https://resources.stuff.co.nz/content/dam/images/1/r/b/k/c/j/image.related.StuffLandscapeSixteenByNine.710x400.1veii9.png/1560191995596.jpg?format=pjpg&optimize=medium air-new-zealand-to-allow-staff-to-show-tattoos
.Saving image
.Saving image
.Saving image
.Skipping image https://cloudfront-ap-southeast-2.images.arcpublishing.com/nzme/GBN2ZUMHDRCU7LQGLFGU2LJ3FQ.jpg 
.Skipping image https://resources.stuff.co.nz/content/dam/images/4/z/3/m/d/h/image.related.StuffLandscapeSixteenByNine.710x400.26cue2.png/1679888661488.jpg?format=pjpg&optimize=medium marama-davidson-should-have-made-clear-violence-against-women-is-in-every-community
.Saving image
...Skipping image https://resources.stuff.co.nz/content/dam/images/4/y/x/o/a/q/image.related.StuffLandscapeSixteenByNine.710x400.24dyfj.png/1649483812766.jpg?format=pjpg&optimize=medium nearly-halfway-through-the-term-parties-start-to-stake-out-territory
.Skipping image https://d3pbdxdl8c65wb.cloudfront.net/cloudinary/2022/Apr/16/gY

In [67]:
# Persist the image table next to the notebook (row index omitted), then
# display it. The 'image_path' values are file names inside IMAGE_FOLDER.
MASTER_IMAGE_TABLE.to_csv('ImagesTable.csv', index=False)
MASTER_IMAGE_TABLE

Unnamed: 0,source_url,image_url,image_path,mp_name,org_name
0,https://www.rnz.co.nz/news/political/483328/pr...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_prime-minister-chris-hipkins-still-a-ver_O...,Chris Hipkins,RNZ
1,https://www.rnz.co.nz/news/political/493110/pm...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_pm-chris-hipkins-to-put-staffing-claims-_D...,Chris Hipkins,RNZ
2,https://www.rnz.co.nz/news/political/486861/ch...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_chris-hipkins-says-polarising-debate-in-_W...,Chris Hipkins,RNZ
3,https://www.rnz.co.nz/news/political/482973/qu...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_quickfire-q-and-a-getting-to-know-chris-_V...,Chris Hipkins,RNZ
4,https://www.rnz.co.nz/news/covid-19/423529/cov...,https://rnz-ressh.cloudinary.com/image/upload/...,RNZ_covid-19-more-cases-linked-to-current-cl_X...,Chris Hipkins,RNZ
...,...,...,...,...,...
195,https://www.newshub.co.nz/home/politics/2023/0...,https://www.newshub.co.nz/home/politics/2023/0...,Newshub_amelia-wade-analysis-how-chris-hipkins...,Chris Hipkins,Newshub
196,https://www.newshub.co.nz/home/new-zealand/202...,https://www.newshub.co.nz/home/shows/2022/05/c...,Newshub_coronavirus-prime-minister-jacinda-ard...,Chris Hipkins,Newshub
197,https://www.newshub.co.nz/home/politics/2023/0...,https://www.newshub.co.nz/home/politics/2023/0...,Newshub_ginny-andersen-describes-calm-phone-ca...,Chris Hipkins,Newshub
198,https://www.newshub.co.nz/home/new-zealand/202...,https://www.newshub.co.nz/home/new-zealand/202...,Newshub_coronavirus-rogue-doctors-charging-for...,Chris Hipkins,Newshub
