In [1]:
import requests
import pandas as pd

In [2]:
def find_beer_image_url(beer_name, company_name, api_key, cx, timeout=10):
    """Search Google Custom Search for a jpg/png image of a beer.

    The query is "<beer_name> <company_name> beer", sent as an image search
    that asks for up to five results. The first result whose link points at a
    ``.jpg``, ``.jpeg`` or ``.png`` file is returned.

    Args:
        beer_name: Name of the beer to search for.
        company_name: Name of the brewery, added to the query.
        api_key: API key sent as the ``key`` request parameter.
        cx: Search engine id sent as the ``cx`` request parameter.
        timeout: Seconds to wait for the search API before giving up.

    Returns:
        The link of the first matching result, or ``None`` when the response
        has no items or none of them has a matching extension.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import urlparse

    image_extensions = (".jpg", ".jpeg", ".png")
    params = {
        "q": f"{beer_name} {company_name} beer",
        "cx": cx,
        "key": api_key,
        "searchType": "image",
        "num": 5,  # get up to 5 results to check file types
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    results = response.json()

    for item in results.get("items") or []:
        link = item.get("link") or ""
        try:
            path = urlparse(link).path
        except ValueError:
            # Malformed URL: fall back to checking the raw string only.
            path = link
        # Check the URL path as well as the raw link, so image links carrying a
        # query string (e.g. ".../label.jpg?width=200") are not rejected.
        if link.lower().endswith(image_extensions) or path.lower().endswith(image_extensions):
            return link
    return None  # No suitable image found


In [3]:
import json

# Read the Google Custom Search credentials from a JSON file kept outside the
# notebook. The file must contain {"google": {"api_key": ..., "cx": ...}}.
with open('../../credentials_google_search.json', 'r') as f:
    creds = json.load(f)

# Pull out the two values the image search needs.
google_creds = creds['google']
api_key = google_creds['api_key']
cx = google_creds['cx']

In [4]:
import os

# Input table of beers, and the file where image URLs collected so far are kept.
source_csv = "../../data/clean/beer_names_breweries.csv"
images_csv = "../../data/clean/beer_names_breweries_with_images.csv"

if os.path.exists(images_csv):
    # Resume from a previous run: this file already carries collected URLs.
    db_beers_names_breweries = pd.read_csv(images_csv)
else:
    db_beers = pd.read_csv(source_csv)
    print(db_beers.columns.tolist())
    # drop_duplicates() keeps the original row labels, which leaves gaps in the
    # index. Resetting it makes label i and position i refer to the same row.
    db_beers_names_breweries = (
        db_beers[["name", "brewery"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

# Guarantee the column exists, so code that reads or fills "image_url" works
# on a fresh table as well as on a resumed one.
if "image_url" not in db_beers_names_breweries.columns:
    db_beers_names_breweries["image_url"] = None

num_of_beers = len(db_beers_names_breweries)
print("Number of beers:", num_of_beers)



Number of beers: 3197


In [None]:
import time
save_interval = 100  # Write a checkpoint file every 100 newly found image URLs
found_count = 0      # Number of new, verified image URLs stored during this run
request_delay = 1.5  # Seconds to wait between search requests
images_csv = "../../data/clean/beer_names_breweries_with_images.csv"
image_extensions = (".jpg", ".jpeg", ".png")
empty_markers = ["none", "nan", ""]  # Stored values that mean "no URL yet"


def is_valid_image_url(image_url, timeout=5):
    """Return True when a HEAD request indicates ``image_url`` serves a jpg/png.

    The URL is accepted when the final response has status 200 and either the
    URL ends in a jpg/jpeg/png extension or the Content-Type header names a
    jpeg/png image. Any failure of the request counts as "not valid".
    """
    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirecting image host would otherwise never report status 200.
        head = requests.head(image_url, timeout=timeout, allow_redirects=True)
    except Exception as e:
        print(f"Failed to verify image URL: {e}")
        return False

    content_type = head.headers.get("Content-Type", "").lower()
    looks_like_image = image_url.lower().endswith(image_extensions) or any(
        x in content_type for x in ["image/jpeg", "image/png"]
    )
    if head.status_code == 200 and looks_like_image:
        print("Valid image URL found:", image_url)
        return True
    print("URL does not point to a valid jpg/png image:", image_url)
    return False


# Make sure the target column exists and can hold strings. A column read from
# CSV with no values yet has a float dtype.
if "image_url" not in db_beers_names_breweries.columns:
    db_beers_names_breweries["image_url"] = None
db_beers_names_breweries["image_url"] = db_beers_names_breweries["image_url"].astype(object)

# Rows are addressed by position throughout (iloc / iat), so the loop does not
# depend on the frame having a 0..n-1 index.
image_col = db_beers_names_breweries.columns.get_loc("image_url")

try:
    for i in range(len(db_beers_names_breweries)):
        row = db_beers_names_breweries.iloc[i]
        beer_name = row["name"]
        company_name = row["brewery"]
        print(f"Beer: {beer_name}, Brewery: {company_name}")

        current_url = row["image_url"]
        if not (pd.isna(current_url) or str(current_url).lower() in empty_markers):
            print(f"Skipping row {i}, image_url already exists: {current_url}")
            continue

        try:
            image_url = find_beer_image_url(beer_name, company_name, api_key, cx)
        except Exception as e:
            # A failed search (network error, HTTP error status, ...) stops the
            # run. Rows not yet processed are left untouched so a later run can
            # resume from here.
            print(f"Error at row {i}: {e}")
            break

        if not image_url:
            print("No image URL found.")
            image_url = None
        elif not is_valid_image_url(image_url):
            image_url = None

        print("Image URL:", image_url)
        db_beers_names_breweries.iat[i, image_col] = image_url

        if image_url is not None:
            found_count += 1
            # Checked only right after an increment, so each multiple of
            # save_interval produces exactly one checkpoint file.
            if found_count % save_interval == 0:
                filename = f"beer_images_partial_{found_count}.csv"
                db_beers_names_breweries.to_csv(filename, index=False)
                print(f"Saved intermediate results to {filename}")

        time.sleep(request_delay)  # delay between requests
finally:
    # Always persist progress: on normal completion, after an error, and when
    # the cell is interrupted by hand.
    db_beers_names_breweries.to_csv(images_csv, index=False)
    print(f"Saved results to {images_csv}")

    


Beer: amber, Brewery: alaskan brewing co.
Skipping row 0, image_url already exists: https://tse2.mm.bing.net/th?id=OIP.p2UqLwq4WBWB4rhAdHq--gHaLO&pid=Api&P=0&h=180
Beer: double bag, Brewery: long trail brewing co.
Skipping row 1, image_url already exists: https://www.instacart.com/image-server/1200x1200/www.instacart.com/assets/domains/product-image/file/large_08cdae49-739c-44d8-beaf-be7849984cfa.jpg
Beer: long trail ale, Brewery: long trail brewing co.
Skipping row 2, image_url already exists: https://longtrail.com/wp-content/uploads/2022/06/LTB313-18-SummerAle-Rebrand-Bottle-3D-LR_1.png
Beer: doppelsticke, Brewery: uerige obergarige hausbrauerei gmbh / zum uerige
Skipping row 3, image_url already exists: https://www.bierverkostung.de/bilder_bier/4144_2020-07-25_Uerige_DoppelSticke.jpg
Beer: sleigh'r dark double alt ale, Brewery: ninkasi brewing company
Skipping row 4, image_url already exists: https://cdn.shopify.com/s/files/1/0227/0581/products/Ninkasi-Sleighr-Dark-Double-Alt-12OZ-B

In [13]:
# Show the first five rows as a quick check of the image_url column.
print(db_beers_names_breweries.head(n=5))

                           name  \
0                         amber   
1                    double bag   
2                long trail ale   
3                  doppelsticke   
4  sleigh'r dark double alt ale   

                                            brewery  \
0                               alaskan brewing co.   
1                            long trail brewing co.   
2                            long trail brewing co.   
3  uerige obergarige hausbrauerei gmbh / zum uerige   
4                           ninkasi brewing company   

                                           image_url  
0  https://tse2.mm.bing.net/th?id=OIP.p2UqLwq4WBW...  
1  https://www.instacart.com/image-server/1200x12...  
2  https://longtrail.com/wp-content/uploads/2022/...  
3  https://www.bierverkostung.de/bilder_bier/4144...  
4  https://cdn.shopify.com/s/files/1/0227/0581/pr...  
