In [2]:
import pandas as pd

# Used for Web Scraping
import requests 
from bs4 import BeautifulSoup 

# Visual
import matplotlib.pyplot as plt

# Ignore warning messages
# NOTE(review): filterwarnings('ignore') without a category silences every warning for the
# rest of the session, including ones raised by the libraries used below - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [153]:
def get_url_price(url):
    '''
    Gets the detail page url and the raw listing price of every whisky in a region/country listing.

    The listing is paginated: page numbers (1, 2, ...) are appended to `url` and pages are
    requested until one does not answer with HTTP 200 or contains no product boxes.

    Parameters
    ----------
    url: str
        Whisky's region/country listing url (a trailing '/' is optional).
    Returns
    -------
    urls: list of lists
        One [detail page url, price text] pair per listed whisky. The price text has its
        first character removed; it is None when no price element is found.
    Raises
    ------
    requests.exceptions.RequestException
        When a listing page cannot be fetched (connection error or timeout).
    '''
    urls = []
    base_url = url.rstrip('/') # Avoid requesting '<listing>//<page>' when url ends with '/'
    page = 1 # Initial page number

    while True:

        result = requests.get(f'{base_url}/{page}', timeout=30) # Navigate to product list page

        # Any status other than 200 is taken as the end of the listing
        if result.status_code != 200:
            break

        # Get page content and parse and process the source
        soup = BeautifulSoup(result.content, 'html.parser')

        # Find product containers that hold the listed whiskies
        products = soup.find_all('div', class_='product-box-wide')

        # A valid page without products also ends the listing (otherwise this loop never stops)
        if not products:
            break

        for product in products:

            link = product['data-product-url']

            # The container id minus its last 15 characters is the prefix of the price element ids
            # NOTE(review): 15 is taken from the original code - confirm the suffix length on live pages
            elem_id = product['id'][:-15]
            price = None

            # Get price - prices are tricky because they are located in different elements:
            # a <div id="...pricesWrapper"> or, when that is missing, a <span id="...price">
            for tag, suffix in (('div', 'pricesWrapper'), ('span', 'price')):
                elems = soup.find_all(tag, {'id': elem_id + suffix})

                if elems:
                    price = elems[0].text[1:] # Drop the first character (the '$' on a plain price)
                    break

            urls.append([link, price])

        page += 1 # Move onto next page to grab more whisky product links

    return urls

In [1]:
def get_details(urls):
    '''
    Gets whisky details, formulates, and returns whisky dataframe.

    Every detail page is downloaded and parsed. A page without an <h1 class="page_header">
    title is treated as a wrong product url and skipped; any other section that is missing
    from a page leaves its columns as None instead of aborting the whole run.

    Parameters
    ----------
    urls: list of lists
        [detail page url, raw price text or None] pairs, one per whisky.
    Returns
    -------
    df: pandas.DataFrame
        Returns whisky dataframe from specific region/country with the columns name,
        description, price, country, distillery, bottler, style, alcohol_content, volume,
        nose, palate, finish, review_count, rating and image (one row per parsed page).
    Raises
    ------
    requests.exceptions.RequestException
        When a detail page cannot be fetched (connection error or timeout).
    '''
    columns = ['name', 'description', 'price', 'country', 'distillery', 'bottler', 'style',
               'alcohol_content', 'volume', 'nose', 'palate', 'finish', 'review_count',
               'rating', 'image']
    records = []

    for link in urls:

        # Get whisky details page
        result = requests.get(link[0], timeout=30)
        soup = BeautifulSoup(result.content, 'html.parser')

        # Title - if it is missing - it means we got wrong url for whiskey product - so ignore
        title = soup.find('h1', class_='page_header')

        if title is None:
            continue # skip

        # One record per page keeps all columns aligned; every field is None by default
        record = dict.fromkeys(columns)
        record['name'] = title.text
        record['price'] = _normalize_price(link[1])
        record['image'] = _extract_image(soup)
        record['description'] = _extract_description(soup)
        record.update(_extract_tasting_notes(soup))
        record.update(_extract_key_values(soup))
        record.update(_extract_reviews(soup))
        records.append(record)

    # Create whisky dataframe with collected data
    data = {column: [record[column] for record in records] for column in columns}

    return pd.DataFrame(data=data)


def _normalize_price(price):
    '''Reduces a raw listing price text to its first amount without thousands separators.'''
    if price is None:
        return None

    if '$' in price:
        # Multi-line price text: keep the first line only and drop its leading character
        price = price.strip('\n').replace('\n', '|')[1:].split('|', 1)[0]

    return price.replace(',', '') # Remove comma ex: 2,238.45 => 2238.45


def _extract_image(soup):
    '''Returns the trimmed src of the product image, or None when the page has none.'''
    try:
        # NOTE(review): [2:-7] drops the first 2 and the last 7 characters of src - presumably
        # a leading '//' and a fixed-length suffix; confirm against live pages
        return soup.find('div', class_='productImageWrap').find('img')['src'][2:-7]
    except (AttributeError, KeyError, TypeError):
        return None


def _extract_description(soup):
    '''Returns the text of the first paragraph of the description block, or None.'''
    block = soup.find("div", attrs={"itemprop": 'description'})
    paragraph = block.select_one('p') if block is not None else None

    return paragraph.text if paragraph is not None else None


def _extract_tasting_notes(soup):
    '''Returns the nose, palate and finish notes found in the <p class="pageCopy"> paragraphs.'''
    tasting = {'nose': None, 'palate': None, 'finish': None}
    notes = [paragraph.text for paragraph in soup.find_all('p', class_='pageCopy')]

    if len(notes) == 1:
        # At times - nose, palate, and finish characteristics are in one paragraph
        # Therefore, add text to all three categories
        return dict.fromkeys(tasting, notes[0])

    for note in notes:
        # The slice removes the label and the character after it (e.g. 'Nose:')
        # NOTE(review): this assumes the label opens the paragraph - confirm on live pages
        if 'Nose' in note: # Nose (smell)
            tasting['nose'] = note[5:].strip()
        elif 'Palate' in note: # Palate (taste)
            tasting['palate'] = note[7:].strip()
        elif 'Finish' in note: # Finish (after taste)
            tasting['finish'] = note[7:].strip()

    return tasting


def _extract_key_values(soup):
    '''Returns the country, distillery, bottler, style, alcohol and volume values of the details box.'''
    column_by_title = {'country': 'country',
                       'distillery / brand': 'distillery',
                       'bottler': 'bottler',
                       'style': 'style',
                       'alcohol': 'alcohol_content',
                       'volume': 'volume'}
    wrapper = soup.find('div', {"id": 'whiskyDetailsWrapper'})
    boxes = wrapper.find_all('div') if wrapper is not None else []
    found = {}

    if boxes:
        titles = boxes[0].find_all('span', class_='kv-key')
        details = boxes[0].find_all('span', class_='kv-val')

        # zip stops at the shorter list, so a key without a value cannot raise IndexError
        for title, detail in zip(titles, details):
            column = column_by_title.get(title.text.lower())

            if column is not None:
                found[column] = detail.text

    return found


def _extract_reviews(soup):
    '''Returns the review count and star rating; both stay None when the page has no review count.'''
    reviews = {'review_count': None, 'rating': None}
    nums = soup.find_all("div", attrs={"itemprop": 'reviewCount'})

    # When there is review count retrieve it
    if nums:
        reviews['review_count'] = int(nums[0]['content'])

        try:
            stars = soup.find('div', {'class': ['starRating', 'rating-wrapper']})['title']

            # Keep digits and dots only
            # NOTE(review): all of them are concatenated, so a title with two numbers
            # (e.g. '4.5 out of 5') would be read as 4.55 - confirm the title format
            reviews['rating'] = float(''.join(char for char in stars if char.isdigit() or char == '.'))
        except (TypeError, KeyError, ValueError):
            reviews['rating'] = None

    return reviews

In [186]:
def fix_error(df):
    '''
    Repairs rows without an image and drops the rows that stay incomplete.

    Rows whose 'image' is missing are taken out in their original order and handled in
    consecutive pairs: the first row of each pair receives the second row's values for the
    columns name ... review_count. The result is merged back (last occurrence of a name
    wins) and every row still lacking 'name' or 'image' is dropped.

    Parameters
    ----------
    df: pandas.DataFrame
        Whisky dataframe with the columns produced by get_details. It is not modified.
    Returns
    -------
    df: pandas.DataFrame
        Updated and corrected whisky dataframe with a fresh 0-based index.
    '''
    # Columns copied from the second row of a pair into the first one.
    # The values are written to the same columns they are read from; the previous positional
    # target (columns[2:]) started at 'price' and shifted every value two columns to the right.
    fields = ['name', 'description', 'price', 'country', 'distillery', 'bottler', 'style',
              'alcohol_content', 'volume', 'nose', 'palate', 'finish', 'review_count']

    # Filter rows without image values (a new frame, so the caller's data is never written to)
    temp_df = df[df['image'].isna()].reset_index(drop=True)

    if temp_df.shape[0] > 0:

        # NOTE(review): pairing by adjacency (row i completed from row i + 1) is kept from the
        # original code - confirm it matches how the incomplete rows are laid out.
        # Stopping at shape[0] - 1 leaves an unpaired last row untouched instead of raising KeyError.
        for i in range(0, temp_df.shape[0] - 1, 2):
            temp_df.loc[i, fields] = temp_df.loc[i + 1, fields].tolist()

        df = pd.concat([df, temp_df]).drop_duplicates(['name'], keep='last') # Combine and remove duplicates

        # 'image' is not among the copied fields, so the rows of temp_df are removed here as well
        df = df.dropna(subset=['name', 'image']) # Drop rows that have missing values on 'name' and 'image'

    return df.reset_index(drop=True) # Reset index

## Get Whisky Product Details Per Country/Region

In [182]:
# Get [detail page url, listing price] pairs per whisky product in each country/style category.
# Every call pages through one masterofmalt.com listing with one HTTP request per page,
# so this cell is long-running and needs network access.
american_urls = get_url_price('https://www.masterofmalt.com/country/american-whisky/')
japanese_urls = get_url_price('https://www.masterofmalt.com/country/japanese-whisky/')
irish_urls = get_url_price('https://www.masterofmalt.com/country/irish-whisky/')
scotch_malt_urls = get_url_price('https://www.masterofmalt.com/country-style/scotch/single-malt-whisky/')
scotch_blended_urls = get_url_price('https://www.masterofmalt.com/country-style/scotch/blended-whisky/')
scotch_blended_malt_urls = get_url_price('https://www.masterofmalt.com/country-style/scotch/blended-malt-whisky/')
scotch_grain_urls = get_url_price('https://www.masterofmalt.com/country-style/scotch/grain-whisky/')
indian_urls = get_url_price('https://www.masterofmalt.com/country/indian-whisky/')
welsh_urls = get_url_price('https://www.masterofmalt.com/country/welsh-whisky/')
english_urls = get_url_price('https://www.masterofmalt.com/country/english-whisky/')
canadian_urls = get_url_price('https://www.masterofmalt.com/country/canadian-whisky/')
swedish_urls = get_url_price('https://www.masterofmalt.com/country/swedish-whisky/')
dutch_urls = get_url_price('https://www.masterofmalt.com/country/dutch-whisky/')
south_african_urls = get_url_price('https://www.masterofmalt.com/country/south-african-whisky/')
australian_urls = get_url_price('https://www.masterofmalt.com/country/australian-whisky/')
kiwi_urls = get_url_price('https://www.masterofmalt.com/country/kiwi-whisky/')

In [183]:
# Scrape the detail page of every American whisky url (one HTTP request per url) into a DataFrame
american_df = get_details(american_urls)

In [184]:
# Scrape the detail page of every Japanese whisky url (one HTTP request per url) into a DataFrame
japanese_df = get_details(japanese_urls)

In [187]:
# Scrape the detail page of every Irish whisky url (one HTTP request per url) into a DataFrame
irish_df = get_details(irish_urls)

In [197]:
# Scrape the detail page of every Scotch single malt url (one HTTP request per url) into a DataFrame
scotch_malt_df = get_details(scotch_malt_urls)

In [198]:
# Scrape the detail page of every blended Scotch url (one HTTP request per url) into a DataFrame
scotch_blended_df = get_details(scotch_blended_urls)

In [199]:
# Scrape the detail page of every blended malt Scotch url (one HTTP request per url) into a DataFrame
scotch_blended_malt_df = get_details(scotch_blended_malt_urls)

In [200]:
# Scrape the detail page of every Scotch grain whisky url (one HTTP request per url) into a DataFrame
scotch_grain_df = get_details(scotch_grain_urls)

In [188]:
# Scrape the detail page of every Indian whisky url (one HTTP request per url) into a DataFrame
indian_df = get_details(indian_urls)

In [189]:
# Scrape the detail page of every Welsh whisky url (one HTTP request per url) into a DataFrame
welsh_df = get_details(welsh_urls)

In [190]:
# Scrape the detail page of every English whisky url (one HTTP request per url) into a DataFrame
english_df = get_details(english_urls)

In [191]:
# Scrape the detail page of every Canadian whisky url (one HTTP request per url) into a DataFrame
canadian_df = get_details(canadian_urls)

In [192]:
# Scrape the detail page of every Swedish whisky url (one HTTP request per url) into a DataFrame
swedish_df = get_details(swedish_urls)

In [193]:
# Scrape the detail page of every Dutch whisky url (one HTTP request per url) into a DataFrame
dutch_df = get_details(dutch_urls)

In [194]:
# Scrape the detail page of every South African whisky url (one HTTP request per url) into a DataFrame
south_african_df = get_details(south_african_urls)

In [195]:
# Scrape the detail page of every Australian whisky url (one HTTP request per url) into a DataFrame
australian_df = get_details(australian_urls)

In [196]:
# Scrape the detail page of every New Zealand ("kiwi") whisky url (one HTTP request per url) into a DataFrame
kiwi_df = get_details(kiwi_urls)

In [187]:
# Resolve erroneous rows that do not have 'name' and 'image' values.
# Each frame is replaced by its cleaned version under the same name, so re-running this cell
# applies fix_error to data that has already been cleaned.
american_df = fix_error(american_df)
japanese_df = fix_error(japanese_df)
irish_df = fix_error(irish_df)
scotch_malt_df = fix_error(scotch_malt_df)
scotch_blended_df = fix_error(scotch_blended_df)
scotch_blended_malt_df = fix_error(scotch_blended_malt_df)
scotch_grain_df = fix_error(scotch_grain_df)
indian_df = fix_error(indian_df)
welsh_df = fix_error(welsh_df)
english_df = fix_error(english_df)
canadian_df = fix_error(canadian_df)
swedish_df = fix_error(swedish_df)
dutch_df = fix_error(dutch_df)
south_african_df = fix_error(south_african_df)
australian_df = fix_error(australian_df)
kiwi_df = fix_error(kiwi_df)

In [3]:
# Combine whiskey dataframes (vertical stacking)
# NOTE(review): every input frame carries its own 0-based index (fix_error resets it) and
# ignore_index is not set, so index labels repeat in the combined frame - reset the index
# before any label-based lookup on df.
df = pd.concat([american_df, japanese_df, irish_df, scotch_malt_df, scotch_blended_df, scotch_blended_malt_df, 
                scotch_grain_df, indian_df, welsh_df, english_df, canadian_df, swedish_df, dutch_df, 
                south_african_df, australian_df, kiwi_df], axis=0)

(19006, 15)

## Save Whisky Product Details Per Country/Region

In [190]:
# Write each regional frame to its own CSV file under dataset/.
# to_csv is called with its defaults, so the row index is written as the first, unnamed column.
# NOTE(review): the dataset/ directory is assumed to exist already - nothing here creates it.
american_df.to_csv('dataset/american.csv')
japanese_df.to_csv('dataset/japanese.csv')
irish_df.to_csv('dataset/irish.csv')
scotch_malt_df.to_csv('dataset/scotch_malt.csv')
scotch_blended_df.to_csv('dataset/scotch_blended.csv')
scotch_blended_malt_df.to_csv('dataset/scotch_blended_malt.csv')
scotch_grain_df.to_csv('dataset/scotch_grain.csv')
indian_df.to_csv('dataset/indian.csv')
welsh_df.to_csv('dataset/welsh.csv')
english_df.to_csv('dataset/english.csv')
canadian_df.to_csv('dataset/canadian.csv')
swedish_df.to_csv('dataset/swedish.csv')
dutch_df.to_csv('dataset/dutch.csv')
south_african_df.to_csv('dataset/south_african.csv')
australian_df.to_csv('dataset/australian.csv')
kiwi_df.to_csv('dataset/kiwi.csv')