In [2]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import urllib.parse
from urllib.parse import quote
import requests
import re
import time
import mimetypes

In [5]:
# Load the CEO data
ceo_data = "ceo_data.csv"
df = pd.read_csv(ceo_data)

In [15]:
base_url = "https://www.google.com/search?q={query}&tbm=isch&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"

In [8]:
df

Unnamed: 0,Company,Ticker,Year,CEO
0,Apple Inc.,AAPL,2010,Steve Jobs
1,Apple Inc.,AAPL,2011,Tim Cook
2,Apple Inc.,AAPL,2012,Tim Cook
3,Apple Inc.,AAPL,2013,Tim Cook
4,Apple Inc.,AAPL,2014,Tim Cook
...,...,...,...,...
675,"VMware, Inc.",VMW,2015,Pat Gelsinger
676,"VMware, Inc.",VMW,2016,Pat Gelsinger
677,"VMware, Inc.",VMW,2017,Pat Gelsinger
678,"VMware, Inc.",VMW,2018,Pat Gelsinger


In [18]:
# Generate URLs for each CEO
search_urls = []
for _, row in df.iterrows():
    name = row["CEO"]
    firm = row["Ticker"]
    start_year = row["Year"]
    current_year = 2019  # Set this to the present year or desired end year
    
    for year in range(start_year, current_year + 1):
        query = quote(f"{name} {firm}")  # Encode the search query
        url = base_url.format(query=query, start_date=f"1/1/2010", end_date=f"12/31/2019")
        search_urls.append((name, year, url))

In [19]:
# Save the URLs to a CSV file
output_filename = "ceo_image_search_urls.csv"
search_df = pd.DataFrame(search_urls, columns=["Name", "Year", "Search_URL"])
search_df
search_df.to_csv(output_filename, index=False)

print(f"Search URLs saved to {output_filename}")

Search URLs saved to ceo_image_search_urls.csv


In [20]:
## STEP 2: RUN GOOGLE IMAGE SEARCHES TO COLLECT PICTURE LINKS + ARTICLES THEY ARE IN

class GoogleImageScraper:
    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)
        
    def hover_batch(self, elements, batch_size=5, delay=0.5):
        actions = ActionChains(self.driver)
        for i in range(0, len(elements), batch_size):
            batch = elements[i:i + batch_size]
            for el in batch:
                actions.move_to_element(el)
            actions.perform()
            time.sleep(delay)
                            
    def scrape_images(self, name, year, search_url, max_links=30):
        save_dir = os.path.join('pictures', str(year))
        os.makedirs(save_dir, exist_ok=True)
        
        try:
            self.driver.get(search_url)
            time.sleep(2)
            print(search_url)
            print(name, year)
            
            # Wait for search div and image elements
            search_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "search"))
            )
            time.sleep(2)

            print('hover start')        
            image_divs = search_div.find_elements(By.CSS_SELECTOR, 'div[jsname="qQjpJ"]')
            image_divs = image_divs[:max_links]  
            self.hover_batch(image_divs, batch_size=5, delay=0.5)
            print('hover done')        
               
            elements = WebDriverWait(search_div, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[jsname="qQjpJ"] h3 a'))
            )
            print(f"Found {len(elements)} potential image elements")
            time.sleep(2)

            image_urls = []
            for i, element in enumerate(elements):
                href = element.get_attribute('href')
                if href:
                    match = re.search(r'imgurl=([^&]+)', href)
                    match_story =  re.search(r'imgrefurl=([^&]+)', href)
                    if match and match_story:
                        image_url = urllib.parse.unquote(match.group(1))
                        story_url = urllib.parse.unquote(match_story.group(1))
                        print(f'element {i} has url {image_url}')                                                
                        image_urls.append([image_url, story_url])
                        if len(image_urls) >= max_links:
                            break
            
            # save image urls to csv
            image_urls_df = pd.DataFrame(image_urls, columns=['image_url', 'story_url'])
            image_urls_df.to_csv(os.path.join(save_dir, 'image_urls.csv'), index=False)            

        except Exception as e:
            print(f"Error processing {name} for year {year}: {e}")
                
    def close(self):
        self.driver.quit()

# Collect image articles/picture file links via selenium:

search_df = pd.read_csv("ceo_image_search_urls.csv")

test_df = search_df.sample(10) # run the search on subset 
test_df

scraper = GoogleImageScraper()
for _, row in test_df.iterrows():
    scraper.scrape_images(
        name=row["Name"],
        year=row["Year"],
        search_url=row["Search_URL"]
    )
scraper.close()    

https://www.google.com/search?q=Glen%20F.%20Post%20III%20LUMN&tbm=isch&tbs=cdr:1,cd_min:1/1/2010,cd_max:12/31/2019
Glen F. Post III 2012
hover start
hover done
Found 100 potential image elements
element 0 has url https://news.lumen.com/download/Post_Glen_2016.jpg
element 1 has url https://news.lumen.com/download/Storey_Jeff_2017.jpg
element 2 has url https://www.glenresearch.com/media/wysiwyg/GR22-25Fig1.jpg
element 3 has url https://www.nationalww2museum.org/sites/default/files/2020-12/Cover%20Image%20ChuckYeagerwith%20P-51D%20Glamorous%20Glen%20III%20Courtesyof%20chuckyeager.com_.jpg
element 4 has url https://henryehooper.blog/wp-content/uploads/2015/01/mileswatson.jpg
element 5 has url https://texasborderbusiness.com/wp-content/uploads/2019/01/GLEN-RONEY-Anecdote-web.jpg
element 6 has url https://www.glenresearch.com/media/wysiwyg/GR26-14Fig1.gif
element 7 has url https://www.egaproducts.com/wp-content/uploads/2017/05/PP-5212-WM.jpg
element 8 has url https://media.licdn.com/dms/imag