# Data Collection

## 1. Crawl data

In [92]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urljoin, unquote

In [None]:
# Base URL: every href found while crawling is resolved against this address
BASE_URL = "https://www.norbeck.nu/abc/"

# Entry page: the crawl starts here and follows the tune-type links it finds
start_url = urljoin(BASE_URL, "index.asp")

# List to store scraped data: one dict per tune detail page, later loaded into a DataFrame
data = []


def decode_misencoded_url(url):
    """
    Repair a URL whose UTF-8 bytes were mistakenly interpreted as Latin-1.

    The text is turned back into Latin-1 bytes and those bytes are re-read as UTF-8,
    so a garbled sequence such as 'Ã¥' becomes 'å'.
    :param url: The URL text to repair
    :return: The repaired text, or the input unchanged when it cannot be
             round-tripped (not representable in Latin-1, or not valid UTF-8)
    """
    try:
        raw_bytes = url.encode('latin1')
        repaired = raw_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Either the text has characters outside Latin-1 or the bytes are not UTF-8:
        # the input was not mis-encoded in the expected way, so leave it alone
        return url
    return repaired


def get_links(url, css_selector):
    """
    Download a page and collect the links selected by a CSS selector.

    Elements without an href, and hrefs that start with "#", are ignored. Each
    remaining href is resolved against BASE_URL, percent-decoded, and passed through
    decode_misencoded_url.
    :param url: The webpage URL to scrape
    :param css_selector: The CSS selector defining which <a> tags to extract
    :return: A list of tuples [(link text, full URL)]; an empty list if the request fails
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort crawl: report the failure and let the caller carry on
        print(f"Failed to fetch {url}: {e}")
        return []

    parsed_page = BeautifulSoup(page.text, "html.parser")

    found = []
    for anchor in parsed_page.select(css_selector):
        target = anchor.get("href")
        if not target or target.startswith("#"):
            continue  # Nothing to follow: missing href or a "#" fragment link
        label = anchor.text.strip()
        absolute = decode_misencoded_url(unquote(urljoin(BASE_URL, target)))
        found.append((label, absolute))

    return found


def crawl_tune_types():
    """
    Step 1: Walk the tune-type links on the entry page.

    Links are taken from the elements matching ".col-4.col-m-6 a" on start_url
    (per the original author's note: the "Irish Tunes," "Swedish Tunes," and
    "Other Tunes" sections). Each one is announced on stdout and handed to crawl_tunes.
    """
    for first_text, first_link in get_links(start_url, ".col-4.col-m-6 a"):
        print(f"Processing: {first_text} ({first_link})")
        # Descend into the category page to reach the individual tunes
        crawl_tunes(first_text, first_link)


def crawl_tunes(first_text, tune_page_url):
    """
    Step 2: Visit every tune linked from one category page.

    :param first_text: The text of the first-level link (e.g., "Jigs", "Reels")
    :param tune_page_url: The URL of the specific tune category page
    """
    # Tune links are the anchors matching ".col-9 a" on the category page
    for tune_title, tune_url in get_links(tune_page_url, ".col-9 a"):
        crawl_tune_details(first_text, tune_title, tune_url)


def crawl_tune_details(first_text, second_text, tune_detail_url):
    """
    Step 3: Fetch one tune page and record the text inside <div id="abc">.

    On success a record is appended to the module-level `data` list. If the request
    fails, a message is printed and nothing is recorded.
    :param first_text: The text of the first-level link (e.g., "Jigs", "Reels")
    :param second_text: The text of the second-level link (e.g., "The Blarney Pilgrim")
    :param tune_detail_url: The URL of the specific tune's page
    """
    try:
        # Percent-decode and repair the encoding before the request is made
        tune_detail_url = decode_misencoded_url(unquote(tune_detail_url))
        page = requests.get(tune_detail_url, timeout=10)
        page.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTPError: {http_err} - Skipping {tune_detail_url}")
        return
    except requests.exceptions.RequestException as req_err:
        print(f"RequestException: {req_err} - Skipping {tune_detail_url}")
        return

    parsed_page = BeautifulSoup(page.text, "html.parser")

    # Text nodes of the ABC container are joined with newlines; pages without the
    # container get a placeholder string instead
    abc_div = parsed_page.find("div", {"id": "abc"})
    if abc_div:
        abc_content = abc_div.get_text(separator="\n").strip()
    else:
        abc_content = "No ABC notation found"

    record = {
        "final_page_url": tune_detail_url,
        "first_page_text": first_text,
        "second_page_text": second_text,
        "abc_notation": abc_content,
    }
    data.append(record)

In [None]:
# Start crawling: visits every tune-type page and tune page, appending one record per tune to `data`
crawl_tune_types()

Processing: airs (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=air&rhythm2=)
Processing: barn dances (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=barndance&rhythm2=)
Processing: carolan tunes (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=carolan&rhythm2=)
Processing: country dances (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=country+dance&rhythm2=)
Processing: highlands and flings (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=highland&rhythm2=fling)
Processing: hornpipes (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=hornpipe&rhythm2=)
Processing: jigs (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=jig&rhythm2=)
Processing: marches (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=march&rhythm2=)
Processing: mazurkas (https://www.norbeck.nu/abc/index2.asp?cat=i&sort=number&rhythm=mazurka&rhythm2=)
Processing: polkas (https://www.norbeck.nu/abc/ind

In [None]:
# Store the scraped records into a DataFrame with a fixed column order
df = pd.DataFrame(data, columns=["final_page_url", "first_page_text", "second_page_text", "abc_notation"])

Unnamed: 0,final_page_url,first_page_text,second_page_text,abc_notation
0,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Down the Hill,X:1\n\nT:Down the Hill\n\nR:\nair\n\nH:Origina...
1,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,"Eagle's Whistle, The","X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol..."
2,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Fead an Iolair,"X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol..."
3,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,O'Donovan's March,"X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol..."
4,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Gol na mBan san Ár,"X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol..."
...,...,...,...,...
4701,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Ojos Azules,X:18\n\nT:Ojos Azules\n\nR:\nwaynu\n\nO:Peru\n...
4702,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,A las orillas del Titicaca,X:19\n\nT:A las orillas del Titicaca\n\nR:\nwa...
4703,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Dos Palomitas,X:20\n\nT:Dos Palomitas\n\nR:\nwaynu\n\nO:Arge...
4704,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Saririway,X:21\n\nT:Saririway\n\nR:\nwaynu\n\nH:The firs...


In [16]:
# Keep the first row for each tune page URL and renumber the rows from 0
df = df.drop_duplicates(subset='final_page_url').reset_index(drop=True)
df

Unnamed: 0,final_page_url,first_page_text,second_page_text,abc_notation
0,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Down the Hill,X:1\n\nT:Down the Hill\n\nR:\nair\n\nH:Origina...
1,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,"Eagle's Whistle, The","X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol..."
2,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Tabhair dom do lámh,X:3\n\nT:Tabhair dom do lámh\n\nT:Give Me Your...
3,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,An Rogaire Dubh,"X:4\n\nT:An Rogaire Dubh\n\nT:Black Rogue, The..."
4,https://www.norbeck.nu/abc/display.asp?rhythm=...,airs,Molly MacAlpin,X:5\n\nT:Molly MacAlpin\n\nT:Carolan's Dream\n...
...,...,...,...,...
3324,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Ojos Azules,X:18\n\nT:Ojos Azules\n\nR:\nwaynu\n\nO:Peru\n...
3325,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,A las orillas del Titicaca,X:19\n\nT:A las orillas del Titicaca\n\nR:\nwa...
3326,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Dos Palomitas,X:20\n\nT:Dos Palomitas\n\nR:\nwaynu\n\nO:Arge...
3327,https://www.norbeck.nu/abc/display.asp?rhythm=...,wayñu,Saririway,X:21\n\nT:Saririway\n\nR:\nwaynu\n\nH:The firs...


In [None]:
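# Optional checkpoint of the raw crawl results (left disabled; it would write to the same path as the final export)
# df.to_csv('Data//abc_tunes.csv',encoding='utf-8-sig',index=False)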

In [None]:
import pandas as pd

def parse_abc_notation(abc_string):
    """
    Parses a single ABC notation string and stores the data in a dictionary (without using regular expressions).

    A line of the form "<field>:<value>" whose prefix is one of the supported fields starts
    a new value for that field. Any other non-empty line continues the most recent field.
    After a "K" line, unprefixed lines are collected under "Notes".

    :param abc_string: A string containing ABC notation. Non-string input (e.g. None or NaN)
                       is treated as an empty string.
    :return: A dictionary keyed by the supported fields, in order. Each value is a string, or
             None when the field is absent or has no content. Repeated fields are joined by " ".
    """
    # Define the supported fields (in order)
    fields = ["X", "S", "T", "R", "B", "H", "O", "D", "Z", "C", "M", "L", "Q", "W", "K", "Notes"]

    if not isinstance(abc_string, str):
        # Missing cells (None / NaN) should yield an all-empty row rather than crash .apply()
        return {key: None for key in fields}

    data = {key: [] for key in fields}  # Collected values per field
    last_key = None  # Field that unprefixed lines are appended to

    # Blank lines carry no meaning for the parse, so a flat line split is sufficient
    for line in abc_string.split("\n"):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        key, separator, value = line.partition(":")
        if separator and key in fields:
            data[key].append(value.strip())
            # The line after "K" starts the tune body, which is stored under "Notes"
            last_key = "Notes" if key == "K" else key
        elif last_key is not None:
            values = data[last_key]
            if not values:
                values.append(line)
            elif values[-1]:
                values[-1] += " " + line
            else:
                # The field line had no inline value (e.g. "R:" followed by "air" on the
                # next line); take the line as the value instead of prefixing it with " "
                values[-1] = line

    # Merge multiple values into a single string (joined by " "); fields with no content become None
    return {key: " ".join(v for v in data[key] if v) or None for key in fields}

In [97]:
# Expand each parsed dictionary into one column per ABC field
parsed_df = df["abc_notation"].apply(parse_abc_notation).apply(pd.Series)

# Put the parsed columns next to the crawl columns, then drop the two link-text columns
df_final = pd.concat([df, parsed_df], axis=1).drop(columns=['first_page_text', 'second_page_text'])
df_final

Unnamed: 0,final_page_url,abc_notation,X,S,T,R,B,H,O,D,Z,C,M,L,Q,W,K,Notes
0,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:1\n\nT:Down the Hill\n\nR:\nair\n\nH:Origina...,1,,Down the Hill,air,,Originally in Gdor and notated in 6/8 time. Ve...,,,id:hn-air-1,,3/4,1/8,1/4=160,,Ador,BAG | E2A2A2 | A3EAB | cBABcA | BAGABG | AGEDE...
1,https://www.norbeck.nu/abc/display.asp?rhythm=...,"X:2\n\nT:Eagle's Whistle, The\n\nT:Fead an Iol...",2,EB,"Eagle's Whistle, The Fead an Iolair O'Donovan'...",air,"Roche Collection, vol. 2, No. 321","See also march#16. The Irish title means ""Lame...",,,id:hn-air-2,,3/4,1/8,1/4=100,,G,GA | B2 dB AG | B{c}B dB AG | A2 {c}AB AG | A2...
2,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:3\n\nT:Tabhair dom do lámh\n\nT:Give Me Your...,3,,Tabhair dom do lámh Give Me Your Hand Da Mihi ...,air,,,,,id:hn-air-3,Ruaidhrí Dall Ó Catháin (c. 1570 - 1650 or 1653),3/4,1/8,1/4=160,,G,D2 | E2G2G2 | G4D2 | E2G2G2 | G4D2 | E2G2G2 | ...
3,https://www.norbeck.nu/abc/display.asp?rhythm=...,"X:4\n\nT:An Rogaire Dubh\n\nT:Black Rogue, The...",4,Sean Ryan and Ciara Ryan on Clare FM,"An Rogaire Dubh Black Rogue, The Did You See t...",air,Bunting 1840 (in key of A),,,,id:hn-air-4,,3/4,1/8,1/4=160,,G,D2 | G3ABG | A2B2d2 | G2A2B2 | d4g2 | G3ABG | ...
4,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:5\n\nT:Molly MacAlpin\n\nT:Carolan's Dream\n...,5,,Molly MacAlpin Carolan's Dream,air,,This was one of Turlough O'Carolan's favourite...,,,id:hn-air-5,William (Laurence) Connellan (c. 1645 - c. 170...,C|,1/8,1/2=50,,Am,|: AB | c2 A>A A2 GA | dc A>G G2 cd | e2de dcA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3324,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:18\n\nT:Ojos Azules\n\nR:\nwaynu\n\nO:Peru\n...,18,,Ojos Azules,waynu,,,Peru,,id:hn-waynu-18,,2/4,1/8,1/4=90,"Ojos azules, no llores, No llores ni te enamor...",Em,BB Bd | B2 GA |[M:3/4] B2 GA BB |[M:2/4] A2 FG...
3325,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:19\n\nT:A las orillas del Titicaca\n\nR:\nwa...,19,,A las orillas del Titicaca,waynu,,,Peru/Bolivia,Los Incas,id:hn-waynu-19,,3/4,1/8,1/4=90,|: A las orillas del Titicaca :| |: grabé tu n...,Em,B/BB/ e2 g2 | e/dB/ d2 e2 | G/GG/ e2 g2 | e/dB...
3326,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:20\n\nT:Dos Palomitas\n\nR:\nwaynu\n\nO:Arge...,20,,Dos Palomitas,waynu,,,Argentina,,id:hn-waynu-20,,2/4,1/8,1/4=90,Dos palomitas se lamentaban llorando y la una ...,Em,G/GF/ GA | B/BA/ BB | e2 d2 | B4 | B/ed/ BA | ...
3327,https://www.norbeck.nu/abc/display.asp?rhythm=...,X:21\n\nT:Saririway\n\nR:\nwaynu\n\nH:The firs...,21,,Saririway,waynu,,The first verse is in the Aymará language,Bolivia,,id:hn-waynu-21,,3/4,1/8,1/4=90,"Saririway sarjamiway, viditay, qhipaririwa jac...",Em,B/BB/ c/ccc/B |[M:2/4] cd e/ee/ | B/BBG/E | [M...


In [103]:
# Human-readable label for each parsed ABC field column
column_descriptions = {
    "X": "Index Number",
    "S": "Source",
    "R": "Rhythm(type)",
    "O": "Origin",
    "Z": "Id",
    "M": "Meter",
    "L": "Unit Note Length",
    "K": "Key",
    "Q": "Tempo",
    "H": "History",
    "T": "Title",
    "C": "Composer",
    "W": "Lyrics",
    "B": "Bibliography",
    "D": "Discography",
}

In [None]:
# Number of null cells in each column of the parsed table
missing_values = df_final.isna().sum()

# One summary row per column: its description (blank when none is defined),
# how many rows are null, and how many rows hold a value
missing_values_df = pd.DataFrame({
    "Column": missing_values.index,
    "Description": [column_descriptions.get(name, "") for name in missing_values.index],
    "Missing Values": missing_values.values,
    "Values": len(df_final) - missing_values.values,
})

missing_values_df

Unnamed: 0,Column,Description,Missing Values,Values
0,final_page_url,,0,3329
1,abc_notation,,0,3329
2,X,Index Number,0,3329
3,S,Source,3072,257
4,T,Title,0,3329
5,R,Rhythm(type),0,3329
6,B,Bibliography,3169,160
7,H,History,1809,1520
8,O,Origin,2698,631
9,D,Discography,1887,1442


In [107]:
# Write the parsed tune table to disk without the row index; the utf-8-sig codec
# puts a UTF-8 byte-order mark at the start of the file
df_final.to_csv('Data//abc_tunes.csv', encoding='utf-8-sig', index=False)