In [1]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import urllib.parse
from urllib.parse import quote
import requests
import re
import time
import mimetypes

In [2]:
# Load the CEO data
ceo_data = "ceo_data.csv"
df = pd.read_csv(ceo_data)

In [7]:
df

Unnamed: 0,Company,Ticker,Year,CEO
0,Apple Inc.,AAPL,2010,Steve Jobs
1,Apple Inc.,AAPL,2011,Tim Cook
2,Apple Inc.,AAPL,2012,Tim Cook
3,Apple Inc.,AAPL,2013,Tim Cook
4,Apple Inc.,AAPL,2014,Tim Cook
...,...,...,...,...
675,"VMware, Inc.",VMW,2015,Pat Gelsinger
676,"VMware, Inc.",VMW,2016,Pat Gelsinger
677,"VMware, Inc.",VMW,2017,Pat Gelsinger
678,"VMware, Inc.",VMW,2018,Pat Gelsinger


In [3]:
base_url = "https://www.google.com/search?q={query}&tbm=isch&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"

In [8]:
# Generate URLs for each CEO
search_urls = []
for _, row in df.iterrows():
    name = row["CEO"]
    firm = row["Company"]
    start_year = row["Year"]
    current_year = 2019  # Set this to the present year or desired end year
    
    for year in range(start_year, current_year + 1):
        query = quote(f"{name} {firm}")  # Encode the search query
        url = base_url.format(query=query, start_date=f"1/1/2010", end_date=f"12/31/2019")
        search_urls.append((name, year, url))

In [9]:
# Save the URLs to a CSV file
output_filename = "ceo_image_search_urls.csv"
search_df = pd.DataFrame(search_urls, columns=["Name", "Year", "Search_URL"])
search_df
search_df.to_csv(output_filename, index=False)

print(f"Search URLs saved to {output_filename}")

Search URLs saved to ceo_image_search_urls.csv


In [10]:
## STEP 2: RUN GOOGLE IMAGE SEARCHES TO COLLECT PICTURE LINKS + ARTICLES THEY ARE IN

class GoogleImageScraper:
    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)
        
    def hover_batch(self, elements, batch_size=5, delay=0.5):
        actions = ActionChains(self.driver)
        for i in range(0, len(elements), batch_size):
            batch = elements[i:i + batch_size]
            for el in batch:
                actions.move_to_element(el)
            actions.perform()
            time.sleep(delay)
                            
    def scrape_images(self, name, year, search_url, max_links=30):
        save_dir = os.path.join('pictures', str(year))
        os.makedirs(save_dir, exist_ok=True)
        
        try:
            self.driver.get(search_url)
            time.sleep(2)
            print(search_url)
            print(name, year)
            
            # Wait for search div and image elements
            search_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "search"))
            )
            time.sleep(2)

            print('hover start')        
            image_divs = search_div.find_elements(By.CSS_SELECTOR, 'div[jsname="qQjpJ"]')
            image_divs = image_divs[:max_links]  
            self.hover_batch(image_divs, batch_size=5, delay=0.5)
            print('hover done')        
               
            elements = WebDriverWait(search_div, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[jsname="qQjpJ"] h3 a'))
            )
            print(f"Found {len(elements)} potential image elements")
            time.sleep(2)

            image_urls = []
            for i, element in enumerate(elements):
                href = element.get_attribute('href')
                if href:
                    match = re.search(r'imgurl=([^&]+)', href)
                    match_story =  re.search(r'imgrefurl=([^&]+)', href)
                    if match and match_story:
                        image_url = urllib.parse.unquote(match.group(1))
                        story_url = urllib.parse.unquote(match_story.group(1))
                        print(f'element {i} has url {image_url}')                                                
                        image_urls.append([image_url, story_url])
                        if len(image_urls) >= max_links:
                            break
            
            # save image urls to csv
            image_urls_df = pd.DataFrame(image_urls, columns=['image_url', 'story_url'])
            image_urls_df.to_csv(os.path.join(save_dir, 'image_urls.csv'), index=False)            

        except Exception as e:
            print(f"Error processing {name} for year {year}: {e}")
                
    def close(self):
        self.driver.quit()

# Collect image articles/picture file links via selenium:

search_df = pd.read_csv("ceo_image_search_urls.csv")

test_df = search_df.sample(10) # run the search on subset 
test_df

scraper = GoogleImageScraper()
for _, row in test_df.iterrows():
    scraper.scrape_images(
        name=row["Name"],
        year=row["Year"],
        search_url=row["Search_URL"]
    )
scraper.close()    

https://www.google.com/search?q=Tim%20Cook%20Apple%20Inc.&tbm=isch&tbs=cdr:1,cd_min:1/1/2010,cd_max:12/31/2019
Tim Cook 2016
hover start
hover done
Found 100 potential image elements
element 0 has url https://www.apple.com/leadership/images/bio/tim-cook_image.png.og.png?1736783744603
element 1 has url https://cdn.macstories.net/002/25788_Header.png
element 2 has url https://cdn.wccftech.com/wp-content/uploads/2023/11/Apple-CEO.jpg
element 3 has url https://api.time.com/wp-content/uploads/2014/03/rtx13ged.jpg
element 4 has url https://vulcanpost.com/wp-content/uploads/2016/08/apple-event-sept9-2015-tim-cook-2480.jpg
element 5 has url https://s3.amazonaws.com/thetech-production/images/web_photos/web/8004_cook_hero.png?1481183432
element 6 has url https://static.wikia.nocookie.net/ipod/images/8/84/Cook_hero20110204.png/revision/latest?cb=20210413165540
element 7 has url https://corporate-executives.com/wp-content/uploads/2013/07/tim-cook.jpg
element 8 has url https://s.wsj.net/public/reso

In [12]:
def download_images(image_urls, save_dir, limit=10):
    
    def get_file_extension(content_type):
        """Get file extension from content type"""
        extension = mimetypes.guess_extension(content_type)
        if extension:
            return extension
        # Fallback extensions based on common image types
        extension_map = {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/webp': '.webp'
        }
        return extension_map.get(content_type, '.jpg')  # Default to .jpg if unknown

    image_urls = image_urls[:limit]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124'
    }
    
    print(f"Downloading {len(image_urls)} images")
    for i, url in enumerate(image_urls):
        try:
            img_response = requests.get(url, headers=headers)
            if img_response.status_code == 200:
                content_type = img_response.headers.get('content-type', 'image/jpeg')
                extension = get_file_extension(content_type)
                
                filename = f'pic{i+1}{extension}'
                filepath = os.path.join(save_dir, filename)
                
                with open(filepath, 'wb') as f:
                    f.write(img_response.content)
                
                print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading image {i}: {e}")

for _, row in test_df.iterrows():
    print(row['Name']), str(row["Year"])
    save_dir = os.path.join('pictures', str(row["Year"]))
    image_urls_df = pd.read_csv(os.path.join(save_dir, 'image_urls.csv'))
    image_urls = image_urls_df['image_url'].tolist()
    download_images(image_urls, save_dir, limit=15)

Tim Cook
Downloading 15 images
Downloaded pic1.jpg
Downloaded pic2.jpg
Downloaded pic3.jpg
Downloaded pic4.jpg
Downloaded pic5.jpg
Downloaded pic6.jpg
Downloaded pic7.jpg
Downloaded pic8.jpg
Downloaded pic9.jpg
Downloaded pic10.jpg
Downloaded pic11.png
Downloaded pic12.bin
Downloaded pic13.jpg
Downloaded pic14.jpg
Gil Shwed
Downloading 15 images
Downloaded pic1.png
Downloaded pic2.jpg
Downloaded pic3.jpg
Downloaded pic4.jpg
Downloaded pic8.jpg
Downloaded pic9.jpg
Downloaded pic10.jpg
Downloaded pic11.jpg
Downloaded pic12.jpg
Downloaded pic13.jpg
Downloaded pic15.jpg
Stephen J. Luczo
Downloading 15 images
Downloaded pic1.jpg
Downloaded pic2.jpg
Downloaded pic3.jpg
Downloaded pic4.jpg
Downloaded pic5.jpg
Downloaded pic6.jpg
Downloaded pic7.jpg
Downloaded pic8.jpg
Downloaded pic9.jpg
Downloaded pic10.jpg
Downloaded pic11.png
Downloaded pic12.bin
Downloaded pic13.jpg
Downloaded pic14.jpg
Scott A. McGregor
Downloading 15 images
Downloaded pic1.jpg
Downloaded pic2.jpg
Downloaded pic3.jpg
Dow

KeyboardInterrupt: 