In [2]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import ast
import pandas as pd

# Grab the 'menu' item from the American Kennel Club (AKC) website.
# requests waits indefinitely by default, so an explicit timeout keeps a
# stalled connection from hanging the kernel; raise_for_status() stops here on
# an HTTP error instead of handing an error page to the parser.
dog_breed_main = requests.get('https://www.akc.org/dog-breeds/', timeout=30)
dog_breed_main.raise_for_status()
soup = BeautifulSoup(dog_breed_main.content, 'html.parser')
menu = soup.find('div', {'class':'custom-select'})
# soup.find returns None when nothing matches. Left unchecked, the next cell
# would parse the text "None", find no <option> tags and produce an empty link
# list without any error, so fail loudly here instead.
if menu is None:
    raise RuntimeError(
        "Could not find <div class='custom-select'> on the AKC dog-breeds page; "
        "the page layout may have changed."
    )

In [3]:
# The menu element is serialised back to HTML and parsed on its own, so only
# the <option> tags inside it are searched.
menu_soup = BeautifulSoup(markup=str(menu), features='html.parser')
# Every <option> except the first one is kept.
breed_results = menu_soup.find_all(name='option')[1:]
# Each remaining option's "value" attribute is later requested as a breed page URL.
breed_links = [option['value'] for option in breed_results]

In [4]:
# Scrape each breed page for its ad-targeting metadata.
#
# Each page is expected to contain a <script> with lines of the form
#   googletag.pubads().setTargeting(<key>, <value>);
# The argument list of every such line is evaluated as a Python literal, giving
# one (key, value) tuple per line. dog_char_list ends up with one list of such
# tuples per successfully scraped page.

prefix = 'googletag.pubads().setTargeting'
dog_char_list = []

# One Session reuses the underlying connection across the requests instead of
# opening a new one for every page.
with requests.Session() as session:
    for link in tqdm(breed_links):
        # Explicit timeout: requests would otherwise wait forever on a stalled
        # connection and freeze the whole scrape.
        breed_html = session.get(link, timeout=30)
        breed_html.raise_for_status()
        bs = BeautifulSoup(breed_html.content, 'html.parser')

        # First <script> that mentions googletag.pubads(), or None if the page
        # has none (the previous `[...][0]` raised a bare IndexError there).
        js_ads = next(
            (script for script in bs.find_all('script') if "googletag.pubads()" in str(script)),
            None,
        )
        if js_ads is None:
            # tqdm.write prints the notice without corrupting the progress bar.
            tqdm.write(f'No googletag targeting script found, skipping: {link}')
            continue

        # Drop the call prefix and the final character of the line (assumed to
        # be the trailing ';'), leaving "(<key>, <value>)".
        pub_ads = [ad.strip()[len(prefix):-1] for ad in str(js_ads).split('\n') if prefix in ad]
        # literal_eval only accepts Python literals, so page content is never executed.
        dog_chars = [ast.literal_eval(char) for char in pub_ads]
        dog_char_list.append(dog_chars)

100%|██████████| 281/281 [02:42<00:00,  1.72it/s]


In [5]:
# Breed slugs that are left out when the rows are assembled further down.
bad_links = ['spinone-italiano']
# Collects one row (a plain list) per breed; turned into a DataFrame later.
dog_char_df_array = list()
# Empty frame with three capitalised column names.
dog_char_df = pd.DataFrame(columns=['Breed', 'Size', 'Group'])

# Characteristic name prefixes, in the order their values are returned by
# parse_characteristics.
CHARACTERISTIC_KEYS = (
    'coat-type',
    'barking-level',
    'trainability',
    'shedding',
    'activity-level',
)


def parse_characteristics(chars):
    """Extract the five tracked characteristic values from a breed's tags.

    Each tag is a string such as ``'coat-type-wire'``: a characteristic name
    from ``CHARACTERISTIC_KEYS``, one separator character, then the value.

    Args:
        chars: An iterable of tag strings. A single bare string is treated as
            a one-element list (previously it was iterated character by
            character and silently produced all ``'unknown'``), and ``None``
            is treated as "no tags" (previously a ``TypeError``).

    Returns:
        A list of five strings ordered as coat type, barking level,
        trainability, shedding, activity level. A characteristic with no
        matching tag is reported as ``'unknown'``; if several tags match, the
        first one wins. Values are not checked against any whitelist.
    """
    if chars is None:
        chars = []
    elif isinstance(chars, str):
        chars = [chars]

    values = []
    for key in CHARACTERISTIC_KEYS:
        first_match = next((tag for tag in chars if tag.startswith(key)), None)
        if first_match is None:
            values.append('unknown')
        else:
            # +1 skips the separator between the characteristic name and its value.
            values.append(first_match[len(key) + 1:])
    return values
            
        
# Turn each page's (key, value) targeting pairs into one table row, leaving
# out the breeds listed in bad_links.
for dog_char in dog_char_list:
    dog_char_dict = dict(dog_char)
    if dog_char_dict['breed'] in bad_links:
        continue
    df_row = [
        str.capitalize(dog_char_dict['breed']),
        dog_char_dict['size'],
        dog_char_dict['group'],
    ]
    # Row layout: breed, size, group, then the five parsed characteristics.
    dog_char_df_array.append(df_row + parse_characteristics(dog_char_dict['characteristic']))


In [11]:
# Assemble the final table from the per-breed rows, then relabel a few
# categories column by column (exact-value matches only).
df = pd.DataFrame(
    data=dog_char_df_array,
    columns=[
        "breed", "size", "group", "coat_type",
        "barking_level", "trainability", "shedding", "activity_level",
    ],
)
df['coat_type'] = df['coat_type'].replace({'medium': 'fair'})
# Missing ('unknown') barking levels are folded into 'average' together with 'medium'.
df['barking_level'] = df['barking_level'].replace({
    'unknown': 'average',
    'medium': 'average',
    'frequent': 'often',
    'infrequent': 'less',
})
df['activity_level'] = df['activity_level'].replace({'regular-exercise': 'consistent'})

In [12]:
# Write the cleaned table to a CSV file in the current working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as an extra, unnamed first column.
df.to_csv("dog_breed_characteristics.csv")
# Plain-text preview of the frame; pandas elides the middle rows of a long
# frame, as the `..` row in the captured output shows.
print(df)

                 breed    size                     group coat_type  \
0        Affenpinscher  xsmall                       toy      wire   
1         Afghan-hound   large                     hound      long   
2     Airedale-terrier  medium                   terrier      wire   
3                Akita  xlarge                   working      fair   
4     Alaskan-klee-kai  medium  foundation-stock-service      fair   
..                 ...     ...                       ...       ...   
275  Wirehaired-vizsla  medium                  sporting      wire   
276     Working-kelpie  medium  foundation-stock-service     short   
277     Xoloitzcuintli  medium              non-sporting  hairless   
278     Yakutian-laika  medium  foundation-stock-service      fair   
279  Yorkshire-terrier  xsmall                       toy      long   

      barking_level     trainability    shedding activity_level  
0           average    easy-training    seasonal     consistent  
1           average  may-be