# Script to Extract Eurorack Dataset from ModularGrid

<br>

Uses Selenium and BeautifulSoup to scrape ModularGrid and generate the dataset. It can take a considerable amount of time to retrieve a large dataset. 

<br>

- <code>get_module_links</code> gets module links from the main Eurorack browser page on modulargrid.net.
    Pass the browser URL, the number of pages to load (1 page = 40 modules), and a short delay (1.5 - 2 secs) that gives each page time to load as the webdriver scrolls.
    Returns a list of links to module pages.
  
<br>

- <code>get_module_data</code> visits each module page collected by <code>get_module_links</code>. Pass the list of links.
    Returns a pandas DataFrame containing the module information.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup
import requests

import numpy as np
import time
from IPython.display import clear_output, display

import pandas as pd

In [None]:
def get_module_links(url, pages_to_load, scroll_pause_time):
    """Collect links to module pages from a ModularGrid browser page.

    Opens ``url`` in headless Chrome, scrolls to the bottom of the page up to
    ``pages_to_load`` times so that more results are loaded, then reads the
    ``href`` of the first ``<a>`` inside every ``box-module`` element.

    Args:
        url: Address of the browser page to open.
        pages_to_load: Maximum number of scroll-to-bottom steps to perform.
        scroll_pause_time: Seconds to wait after each scroll before the page
            height is measured again.

    Returns:
        List of the ``href`` values found, in page order.
    """
    #run headless chrome
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    #the browser must be closed even if loading or scraping raises,
    #otherwise the headless chrome process is left running
    try:
        driver.get(url)

        #delay to load browser page
        time.sleep(1)

        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")

        for i in range(pages_to_load):

            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(scroll_pause_time)

            # An unchanged height means nothing new was loaded, so stop scrolling
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

            #Progress counter
            clear_output(wait=True)
            print("Pages Loaded: ", i+1)

        #get loaded modules link list
        modules = driver.find_elements(By.CLASS_NAME, 'box-module')
        links = [
            module.find_element(By.TAG_NAME, 'a').get_attribute('href')
            for module in modules
        ]
    finally:
        # close the browser
        driver.quit()

    #amount of module links
    print('Module Links Acquired:', len(links))

    return links

In [None]:
def _leading_float(tags, index):
    """Return the first whitespace-separated token of ``tags[index]`` as a float.

    Gives NaN when there is no tag at ``index``, when its text is empty, or
    when the token is not a valid number.
    """
    try:
        return float(tags[index].text.strip().split()[0])
    except (IndexError, ValueError):
        return np.nan


def get_module_data(links):
    """Scrape one row of module information from every page in ``links``.

    Each page is fetched with ``requests`` and parsed with BeautifulSoup
    (the original notes this is faster than selenium). Scraping stops at
    the first page whose layout cannot be parsed, and the rows collected so
    far are still returned.

    Args:
        links: Iterable of module page URLs.

    Returns:
        pandas DataFrame with the columns 'Name', 'Manufacturer',
        'Functions', 'HP', 'Depth', '+12V (mA)', '-12V (mA)', '+5V (mA)',
        'Price (€)', 'Racks', 'Rating', 'Votes', 'Available' and 'Approved'.
        Empty when ``links`` is empty or the first page fails.
    """
    data_dict_list = []

    #iterate through links and get data; n doubles as the progress counter
    for n, link in enumerate(links, start=1):
        try:
            page = requests.get(link) #headers=headers
            soup = BeautifulSoup(page.text, 'html')

            #create list of functions
            functions_html = soup.find_all('span', class_='label label-info')
            functions = [label.text for label in functions_html]

            #name
            name = soup.find('h1').text.strip()

            #manufacturer
            manufacturer = soup.find('span', class_='vendor-name').text.strip()

            #the first two <dd> tags on the page are read as hp and depth
            dd_tags = soup.find_all('dd')

            #hp
            #NOTE(review): only the first two characters are read, so a
            #three digit width would be truncated - confirm against the site
            hp = int(dd_tags[0].text.strip()[:2])

            #depth, only accepted when characters 3-4 of the text are 'mm'
            depth_text = dd_tags[1].text.strip()
            if depth_text[3:5] == 'mm':
                depth = int(depth_text[:2])
            else:
                depth = np.nan

            #power, read from the <dd> tags of the second <dl> on the page
            power_dd = soup.find_all('dl')[1].find_all('dd')
            pos12 = _leading_float(power_dd, 0)
            neg12 = _leading_float(power_dd, 1)
            pos5 = _leading_float(power_dd, 2)

            #price, the first character is skipped before converting
            price_html = soup.find('span', class_='price')
            if price_html:
                price = float(price_html.text.strip()[1:])
            else:
                price = np.nan

            #racks, a page without the related-racks box gives NaN
            racks_box = soup.find('div', id='related-racks')
            racks_html = racks_box.find('strong') if racks_box is not None else None
            if racks_html:
                racks = int(racks_html.text.strip())
            else:
                racks = np.nan

            #rating and votes are the last two spans of the g-descr box
            descr_box = soup.find('div', class_='g-descr')
            rating_data_html = descr_box.find_all('span') if descr_box is not None else []
            try:
                rating_html = rating_data_html[-2]
                votes_html = rating_data_html[-1]
                rating = float(rating_html.text.strip())
                votes = int(votes_html.text.strip())
            except (IndexError, ValueError):
                #both are reset together so a stale or unbound value of
                #votes can never end up in the row
                rating = np.nan
                votes = np.nan

            #available
            if soup.find('p', class_='text-success'):
                available = 'Available'
            elif soup.find('p', class_='text-error'):
                available = 'Discontinued'
            else:
                available = np.nan

            #approved
            approved = 1 if soup.find('div', class_='box-approved') else 0

            data_dict = {
                'Name': name,
                'Manufacturer': manufacturer,
                'Functions': functions,
                'HP': hp,
                'Depth': depth,
                '+12V (mA)': pos12,
                '-12V (mA)': neg12,
                '+5V (mA)': pos5,
                'Price (€)': price,
                'Racks': racks,
                'Rating': rating,
                'Votes': votes,
                'Available': available,
                'Approved': approved,
            }

            #add dict to list of dicts
            data_dict_list.append(data_dict)

            clear_output(wait=True)
            print('Modules Scraped:', n, '|', manufacturer, '-', name)

        #if something fatal happens break loop so df is created
        #AttributeError covers a required tag that is missing from the page
        except (IndexError, ValueError, AttributeError) as err:
            print('Scraping stopped at:', link, '|', repr(err))
            break

    #create dataframe from list of dicts
    df = pd.DataFrame(data_dict_list)
    return df

- <code>URL</code> target link to ModularGrid main browser.

- <code>N</code> intended length of dataset.

- <code>YEAR</code> year the dataset was generated.

- <code>PAGES_TO_LOAD</code> number of pages to load (1 page = 40 modules).

- <code>SCROLL_PAUSE_TIME</code> a short delay to load the page as webdriver scrolls (1.5 - 2 secs).

In [None]:
#ModularGrid browser query: full size modules only (no 1U), ordered by popularity with direction=asc
URL = 'https://modulargrid.net/e/modules/browser?SearchName=&SearchVendor=&SearchFunction=&SearchSecondaryfunction=&SearchHeight=f&SearchTe=&SearchTemethod=max&SearchBuildtype=&SearchLifecycle=&SearchSet=&SearchMarketplace=&SearchIsmodeled=0&SearchShowothers=0&order=popular&direction=asc'

#seconds to wait after each scroll so the next page can load, 1.5 - 2 secs
SCROLL_PAUSE_TIME = 2

#how many pages to scroll through, each page holds roughly 40 modules
PAGES_TO_LOAD = 1

#intended length of the dataset
#enough pages have to be loaded to contain this many modules
N = 40

#year the dataset was generated
YEAR = 2024

#collect the link of every module shown on the browser page
links = get_module_links(URL, PAGES_TO_LOAD, SCROLL_PAUSE_TIME)

In [None]:
#trim the list of links to the first N elements (uncomment the line below to apply)

#links = links[:N]

In [None]:
#scrape every linked module page and collect the results in a dataframe,
#one row per module; scraping stops early at the first page that fails to parse
df = get_module_data(links)

In [None]:
#preview the first rows of the scraped dataframe
df.head()

In [None]:
#save dataframe to a csv file on the hard drive

#df.to_csv(f"{N}_most_popular_eurorack_modules_{YEAR}_UNCLEANED.csv") 

## Wes Leggo-Morrell 2024

#### **[Instagram](https://www.instagram.com/modular.mooch)**

#### **[GitHub](https://github.com/WesDaMooch)**