In [1]:
import os
from googleapiclient.discovery import build
from dotenv import load_dotenv
import pickle
import random
import collections
from tqdm import tqdm

random.seed(333)

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Credentials for the Custom Search API; a missing variable raises KeyError here.
api_key, cse_id = os.environ['GOOGLE_API'], os.environ['GOOGLE_CSE']

In [3]:
# create a search query
def google_search(search_term, api_key, cse_id, **kwargs):
    """
    Run one query against a Google Custom Search engine and return its hits.

    Parameters
    ----------
    search_term : str
        Text of the query (sent as ``q``).
    api_key : str
        Developer key used to build the ``customsearch`` v1 client.
    cse_id : str
        Identifier of the custom search engine (sent as ``cx``).
    **kwargs
        Forwarded unchanged to ``service.cse().list(...)``.

    Returns
    -------
    list
        The ``items`` entry of the response, or an empty list when the
        response carries no ``items`` key.

    Raises
    ------
    Whatever ``execute()`` raises (e.g. an HTTP error when the daily quota
    is exhausted) is propagated to the caller.
    """

    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # A query without any hits comes back without an 'items' key, so indexing
    # the response directly would raise KeyError; treat that as "no results".
    return res.get('items', [])

In [None]:
# search_results = google_search(palm_species[0], api_key=api_key, cse_id=cse_id)

In [4]:
# Suffixes appended to each species name; the empty string searches the bare name.
queries = ["", "description", "diagnosis", "attributes"]

## PALMS

In [None]:
# Load the palm species names that drive the searches.
folder = "../../../data/OpenAI/Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}palm_species.pkl", 'rb') as f:
    palm_species = pickle.load(f)
# palm_species_random = random.sample(palm_species, 50)


# Resume from the URL caches of an earlier run when both files exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_palms.pkl", 'rb') as f:
        palm_species_google = pickle.load(f)
    with open(F"{folder}urls_palms.pkl", 'rb') as f:
        palm_species_urls = pickle.load(f)
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled so a
    # corrupt or unreadable cache is reported instead of silently discarded.
    palm_species_urls = collections.defaultdict(list)
    palm_species_google = collections.defaultdict(list)

# Defined on both paths (empty when starting fresh) so the cells that read
# it no longer hit a NameError on a first run.
palms_done = list(palm_species_urls.keys())

In [None]:
# Species that already have cached URLs from an earlier run.
palms_done

In [None]:
# Search Google for every palm species that has no stored URLs yet, then
# write both result dictionaries to disk.
folder = "../../../data/OpenAI/Urls/"

try:
    for palm in tqdm(palm_species):

        # Membership is checked on the dictionary itself, so species finished
        # earlier in this kernel session are skipped too and a re-run after a
        # failure neither repeats their searches nor duplicates their links.
        if palm in palm_species_urls:
            print(palm, "done")
            continue

        # Gather into temporaries and commit only once every query succeeded.
        # If a request fails half-way (e.g. HTTP 429 on an exhausted quota)
        # the species stays absent and is retried on the next run, instead of
        # being stored with partial data and treated as done.
        species_results = []
        species_links = []

        for query in queries:
            search_query = F"{palm} {query}"

            # Search results (10 per search)
            search_results = google_search(search_query, api_key=api_key, cse_id=cse_id)
            # Keep the raw results of all queries, not just the last one.
            species_results.extend(search_results)
            # Just the links
            species_links.extend(result['link'] for result in search_results)

        palm_species_google[palm] = species_results
        palm_species_urls[palm] = species_links
finally:
    # Persist whatever was collected even when the loop is interrupted; the
    # exception itself still propagates.
    with open(F"{folder}urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_urls, f)

    with open(F"{folder}google_urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_google, f)


In [None]:
# Write both palm dictionaries back to disk, one file after the other.
folder = "../../../data/OpenAI/Urls/"

links_path = F"{folder}urls_palms.pkl"
with open(links_path, 'wb') as links_file:
    pickle.dump(palm_species_urls, links_file)

raw_path = F"{folder}google_urls_palms.pkl"
with open(raw_path, 'wb') as raw_file:
    pickle.dump(palm_species_google, raw_file)


## CARIBBEAN

In [None]:
# Load the Caribbean species names that drive the searches.
folder = "../../../data/OpenAI/Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}caribbean_species.pkl", 'rb') as f:
    caribbean_species = pickle.load(f)

# Resume from the URL caches of an earlier run when both files exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_caribbean.pkl", 'rb') as f:
        caribbean_species_google = pickle.load(f)
    with open(F"{folder}urls_caribbean.pkl", 'rb') as f:
        caribbean_species_urls = pickle.load(f)
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled so a
    # corrupt or unreadable cache is reported instead of silently discarded.
    caribbean_species_urls = collections.defaultdict(list)
    caribbean_species_google = collections.defaultdict(list)

# Empty when starting fresh, otherwise the species found in the cache.
caribbean_done = list(caribbean_species_urls.keys())

In [None]:
# Species that already have cached URLs from an earlier run (empty on a fresh start).
caribbean_done

In [None]:
# Search Google for every Caribbean species that has no stored URLs yet, then
# write both result dictionaries to disk.
folder = "../../../data/OpenAI/Urls/"

try:
    for caribbean in tqdm(caribbean_species):

        # Membership is checked on the dictionary itself, so species finished
        # earlier in this kernel session are skipped too and a re-run after a
        # failure neither repeats their searches nor duplicates their links.
        if caribbean in caribbean_species_urls:
            print(caribbean, "done")
            continue

        # Gather into temporaries and commit only once every query succeeded.
        # If a request fails half-way (e.g. HTTP 429 on an exhausted quota)
        # the species stays absent and is retried on the next run, instead of
        # being stored with partial data and treated as done.
        species_results = []
        species_links = []

        for query in queries:
            search_query = F"{caribbean} {query}"

            # Search results (10 per search)
            search_results = google_search(search_query, api_key=api_key, cse_id=cse_id)
            # Keep the raw results of all queries, not just the last one.
            species_results.extend(search_results)
            # Just the links
            species_links.extend(result['link'] for result in search_results)

        caribbean_species_google[caribbean] = species_results
        caribbean_species_urls[caribbean] = species_links
finally:
    # Persist whatever was collected even when the loop is interrupted; the
    # exception itself still propagates.
    with open(F"{folder}urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_urls, f)

    with open(F"{folder}google_urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_google, f)

In [None]:
# Write both Caribbean dictionaries back to disk, one file after the other.
folder = "../../../data/OpenAI/Urls/"

links_path = F"{folder}urls_caribbean.pkl"
with open(links_path, 'wb') as links_file:
    pickle.dump(caribbean_species_urls, links_file)

raw_path = F"{folder}google_urls_caribbean.pkl"
with open(raw_path, 'wb') as raw_file:
    pickle.dump(caribbean_species_google, raw_file)

## PlantNet

In [5]:
# Load the PlantNet species names that drive the searches.
folder = "../../../data/OpenAI/Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}plantnet_species.pkl", 'rb') as f:
    plantnet_species = pickle.load(f)

# Resume from the URL caches of an earlier run when both files exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_plantnet.pkl", 'rb') as f:
        plantnet_species_google = pickle.load(f)
    with open(F"{folder}urls_plantnet.pkl", 'rb') as f:
        plantnet_species_urls = pickle.load(f)
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled so a
    # corrupt or unreadable cache is reported instead of silently discarded.
    plantnet_species_urls = collections.defaultdict(list)
    plantnet_species_google = collections.defaultdict(list)

# Empty when starting fresh, otherwise the species found in the cache.
plantnet_done = list(plantnet_species_urls.keys())

In [6]:
# Species that already have cached URLs from an earlier run (empty on a fresh start).
plantnet_done

['Acacia amythethophylla',
 'Acacia ataxacantha',
 'Acacia dudgeoni',
 'Acacia ehrenbergiana',
 'Acacia erythrocalyx',
 'Acacia gerrardii',
 'Acacia gourmaensis',
 'Acacia hockii',
 'Acacia holosericea',
 'Acacia kirkii',
 'Acacia laeta',
 'Acacia macrostachya',
 'Acacia mellifera',
 'Acacia nilotica',
 'Acacia polyacantha ssp. campylacantha',
 'Acacia senegal',
 'Acacia seyal',
 'Acacia sieberiana',
 'Acacia tortilis ssp. Raddiana',
 'Adansonia digitata',
 'Adenium obesum',
 'Aeschynomene elaphroxylon',
 'Afzelia africana',
 'Agave sisalana',
 'Albizia adianthifolia',
 'Albizia chevalieri',
 'Albizia coriaria',
 'Albizia glaberrima',
 'Albizia lebbeck',
 'Albizia malacophylla',
 'Albizia zygia',
 'Alchornea cordifolia',
 'Allophylus africanus',
 'Anacardium occidentale',
 'Ancylobotrys amoena',
 'Andira inermis',
 'Annona senegalensis',
 'Annona squamosa',
 'Anogeissus leiocarpus',
 'Anthocleista procera',
 'Antidesma venosum',
 'Aphania senegalensis',
 'Azadirachta indica',
 'Baissea

In [7]:
# Search Google for every PlantNet species that has no stored URLs yet, then
# write both result dictionaries to disk.
folder = "../../../data/OpenAI/Urls/"

try:
    for plantnet in tqdm(plantnet_species):

        # Membership is checked on the dictionary itself, so species finished
        # earlier in this kernel session are skipped too and a re-run after a
        # failure neither repeats their searches nor duplicates their links.
        if plantnet in plantnet_species_urls:
            print(plantnet, "done")
            continue

        # Gather into temporaries and commit only once every query succeeded.
        # If a request fails half-way (e.g. HTTP 429 on an exhausted quota)
        # the species stays absent and is retried on the next run, instead of
        # being stored with partial data and treated as done.
        species_results = []
        species_links = []

        for query in queries:
            search_query = F"{plantnet} {query}"

            # Search results (10 per search)
            search_results = google_search(search_query, api_key=api_key, cse_id=cse_id)
            # Keep the raw results of all queries, not just the last one.
            species_results.extend(search_results)
            # Just the links
            species_links.extend(result['link'] for result in search_results)

        plantnet_species_google[plantnet] = species_results
        plantnet_species_urls[plantnet] = species_links
finally:
    # Persist whatever was collected even when the loop is interrupted; the
    # exception itself still propagates.
    with open(F"{folder}urls_plantnet.pkl", 'wb') as f:
        pickle.dump(plantnet_species_urls, f)

    with open(F"{folder}google_urls_plantnet.pkl", 'wb') as f:
        pickle.dump(plantnet_species_google, f)

  0%|          | 0/361 [00:00<?, ?it/s]

Acacia amythethophylla done
Acacia ataxacantha done
Acacia dudgeoni done
Acacia ehrenbergiana done
Acacia erythrocalyx done
Acacia gerrardii done
Acacia gourmaensis done
Acacia hockii done
Acacia holosericea done
Acacia kirkii done
Acacia laeta done
Acacia macrostachya done
Acacia mellifera done
Acacia nilotica done
Acacia polyacantha ssp. campylacantha done
Acacia senegal done
Acacia seyal done
Acacia sieberiana done
Acacia tortilis ssp. Raddiana done
Adansonia digitata done
Adenium obesum done
Aeschynomene elaphroxylon done
Afzelia africana done
Agave sisalana done
Albizia adianthifolia done
Albizia chevalieri done
Albizia coriaria done
Albizia glaberrima done
Albizia lebbeck done
Albizia malacophylla done
Albizia zygia done
Alchornea cordifolia done
Allophylus africanus done
Anacardium occidentale done
Ancylobotrys amoena done
Andira inermis done
Annona senegalensis done
Annona squamosa done
Anogeissus leiocarpus done
Anthocleista procera done
Antidesma venosum done
Aphania senegale

 22%|██▏       | 78/361 [00:49<02:58,  1.58it/s]


HttpError: <HttpError 429 when requesting https://customsearch.googleapis.com/customsearch/v1?q=Citrus+aurantifolia+attributes&cx=13a87041f5a6df559&key=[REDACTED]&alt=json returned "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:830720044910'.". Details: "[{'message': "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:830720044910'.", 'domain': 'global', 'reason': 'rateLimitExceeded'}]">

In [8]:
# Write both PlantNet dictionaries back to disk, one file after the other.
folder = "../../../data/OpenAI/Urls/"

links_path = F"{folder}urls_plantnet.pkl"
with open(links_path, 'wb') as links_file:
    pickle.dump(plantnet_species_urls, links_file)

raw_path = F"{folder}google_urls_plantnet.pkl"
with open(raw_path, 'wb') as raw_file:
    pickle.dump(plantnet_species_google, raw_file)