# Scraping song lyrics from tekstove.org

In [None]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
import csv
import urllib.request

## Prepare the download

* First, we fetch all song ids listed on the given range of pages for category 1 (pop-folk)
* Then we download the lyrics by scraping the web page of each collected id
* Finally, we write the collected lyrics to a file

In [None]:

def zero_pad_num(n):
    """Return *n* as a string left-padded with zeros to at least 3 characters.

    Intended for integers in [0; 999]; e.g. 7 -> "007", 42 -> "042",
    123 -> "123". The result is always a ``str`` (previously values >= 100
    were returned unchanged as numbers, giving an inconsistent return type).
    """
    return str(n).zfill(3)
def download_lyrics(file, min_id, max_id, mode, start):
    """Download song pages for a range of ids and write their text to *file*.

    For every ``i`` in ``range(min_id, max_id)`` the page
    ``browse.php?id=<start><i zero-padded to 3 digits>`` is fetched and all
    text inside its first ``<td valign="top">`` cell is collected. The texts
    of all pages are concatenated and written in a single ``open``/``write``
    once every page has been fetched.

    Args:
        file: Path of the output file.
        min_id: First id suffix to fetch (inclusive).
        max_id: Id suffix to stop at (exclusive).
        mode: Mode passed to ``open`` for the output file (e.g. "w" or "a").
        start: Prefix placed in front of the zero-padded suffix in the URL.

    Raises:
        requests.RequestException: If a page cannot be fetched (including a
            timeout).
        IndexError: If a page has no ``<td valign="top">`` cell.
    """
    page_texts = []
    for i in range(min_id, max_id):
        print("Fetching texts.. {0}/{1}".format(i + 1, max_id))
        url = "https://www.tekstove.org/browse.php?id={0}{1}".format(start, zero_pad_num(i))
        # Without a timeout a single stalled connection blocks the run forever.
        page_to_scrape = requests.get(url, timeout=30)
        soup = BeautifulSoup(page_to_scrape.text, "html.parser")
        text_cell = soup.find_all("td", attrs={"valign": "top"})[0]
        # Joining once avoids rebuilding an ever-growing string per fragment.
        page_texts.append("".join(text_cell.strings))
    with open(file, mode, encoding='utf-8') as f:
        f.write("".join(page_texts))

def download_lyrics_V2(pagerange, filename):
    """Scrape the text of every song linked from the given category pages.

    Works in two phases:

    1. For each page number in *pagerange*, fetch
       ``category.php?s=1&page=<n>`` and collect the ``href`` of every link
       that starts with ``browse.php``. The collected hrefs are written to
       ``ids.txt``, one per line.
    2. Fetch each collected song page, take all text inside its first
       ``<td valign="top">`` cell (each text fragment terminated by a
       newline) and write the concatenated result to *filename*.

    A song page that cannot be fetched, or that lacks the expected cell, is
    reported and skipped so that one bad page does not discard everything
    downloaded so far.

    Args:
        pagerange: Iterable of category page numbers to scan for song links.
        filename: Path of the output file; it is overwritten.

    Raises:
        requests.RequestException: If a category listing page cannot be
            fetched (including a timeout).
    """
    link_ids = []
    for i in pagerange:
        url = "https://www.tekstove.org/category.php?s=1&page={0}".format(i)
        print('fetching ids: ',url)
        # Without a timeout a single stalled connection blocks the run forever.
        page_to_scrape = requests.get(url, timeout=30)
        soup = BeautifulSoup(page_to_scrape.text, "html.parser")
        links = soup.find_all('a', href=lambda href: href and href.startswith("browse.php"))
        link_ids.extend(link['href'] for link in links)
    with open('ids.txt', "w", encoding='utf-8') as f:
        f.writelines(link + '\n' for link in link_ids)

    print("to download lyrics for {0} songs".format(len(link_ids)))
    song_texts = []
    for n, song_id in enumerate(link_ids, start=1):
        url = "https://www.tekstove.org/{0}".format(song_id)
        print("Fetching song: ", n)
        try:
            page_to_scrape = requests.get(url, timeout=30)
            soup = BeautifulSoup(page_to_scrape.text, "html.parser")
            text_cell = soup.find_all("td", attrs={"valign": "top"})[0]
        except (requests.RequestException, IndexError) as exc:
            # Keep going: the remaining songs are still worth downloading.
            print("Skipping {0}: {1!r}".format(url, exc))
            continue
        # Make sure every text fragment ends with exactly its own newline.
        song_texts.append("".join(
            piece if piece.endswith('\n') else piece + '\n'
            for piece in text_cell.strings
        ))
    with open(filename, "w", encoding='utf-8') as f:
        f.write("".join(song_texts))
# Scrape category pages 25..139 and store all lyrics in a single file.
download_lyrics_V2(pagerange=range(25, 140), filename='songs-lg-final.txt')

In [1]:
# Load the song hrefs stored in ids.txt (one per line).
with open('ids.txt', "r", encoding='utf-8') as f:
    ids = f.readlines()
# Drop the line terminators. A comprehension keeps its loop variable local,
# so the builtin `id` is no longer shadowed for the rest of the session.
processed = [line.replace('\n', '') for line in ids]

## Last one

In [2]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
import csv
import urllib.request

### Chunked lyrics download
The download is split into chunks because my Jupyter instance hangs after ~300-400 requests.

In [3]:
# Download the lyrics in slices of `chunk_size` songs. Every chunk is appended
# to the same output file, so the cell can be re-run with a different range
# to continue where a previous run stopped.
file_chunk = 'chunk_lg.txt'
chunk_size = 230
for i in range(18, 20):
    start = i * chunk_size
    end = start + chunk_size
    ids_sub = processed[start:end]
    chunk_texts = []
    print("\n\n\n\n---------------------------------------------------")
    print("CHUNK ",i)
    print("\n\n\n\n---------------------------------------------------")
    n = 0  # number of songs attempted in this chunk
    for n, song_id in enumerate(ids_sub, start=1):
        url = "https://www.tekstove.org/{0}".format(song_id)
        print("Fetching song: ", url)
        try:
            # Without a timeout a single stalled connection blocks the run.
            page_to_scrape = requests.get(url, timeout=30)
            soup = BeautifulSoup(page_to_scrape.text, "html.parser")
            text = soup.find_all("td", attrs={"valign": "top"})[0]
            # Make sure every text fragment ends with exactly its own newline.
            total = "".join(
                piece if piece.endswith('\n') else piece + '\n'
                for piece in text.strings
            )
        except Exception as exc:
            # Best effort: report the failure and move on to the next song.
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop the cell.
            print("An exception occurred: ", repr(exc))
        else:
            chunk_texts.append(total)

    text_complete = "".join(chunk_texts)
    with open(file_chunk, "a", encoding='utf-8') as f:
        f.write("\n\n\n\n")
        f.write(text_complete)





---------------------------------------------------
CHUNK  18




---------------------------------------------------
Fetching song:  https://www.tekstove.org/browse.php?id=602
Fetching song:  https://www.tekstove.org/browse.php?id=601
Fetching song:  https://www.tekstove.org/browse.php?id=600
Fetching song:  https://www.tekstove.org/browse.php?id=599
Fetching song:  https://www.tekstove.org/browse.php?id=598
Fetching song:  https://www.tekstove.org/browse.php?id=597
Fetching song:  https://www.tekstove.org/browse.php?id=596
Fetching song:  https://www.tekstove.org/browse.php?id=595
Fetching song:  https://www.tekstove.org/browse.php?id=594
Fetching song:  https://www.tekstove.org/browse.php?id=593
Fetching song:  https://www.tekstove.org/browse.php?id=592
Fetching song:  https://www.tekstove.org/browse.php?id=591
Fetching song:  https://www.tekstove.org/browse.php?id=590
Fetching song:  https://www.tekstove.org/browse.php?id=589
Fetching song:  https://www.tekstove.org/browse.php?

Fetching song:  https://www.tekstove.org/browse.php?id=446
Fetching song:  https://www.tekstove.org/browse.php?id=445
Fetching song:  https://www.tekstove.org/browse.php?id=444
Fetching song:  https://www.tekstove.org/browse.php?id=443
Fetching song:  https://www.tekstove.org/browse.php?id=442
Fetching song:  https://www.tekstove.org/browse.php?id=441
Fetching song:  https://www.tekstove.org/browse.php?id=440
Fetching song:  https://www.tekstove.org/browse.php?id=438
Fetching song:  https://www.tekstove.org/browse.php?id=437
Fetching song:  https://www.tekstove.org/browse.php?id=436
Fetching song:  https://www.tekstove.org/browse.php?id=435
Fetching song:  https://www.tekstove.org/browse.php?id=434
Fetching song:  https://www.tekstove.org/browse.php?id=433
Fetching song:  https://www.tekstove.org/browse.php?id=432
Fetching song:  https://www.tekstove.org/browse.php?id=431
Fetching song:  https://www.tekstove.org/browse.php?id=430
Fetching song:  https://www.tekstove.org/browse.php?id=4

Fetching song:  https://www.tekstove.org/browse.php?id=269
Fetching song:  https://www.tekstove.org/browse.php?id=268
Fetching song:  https://www.tekstove.org/browse.php?id=267
Fetching song:  https://www.tekstove.org/browse.php?id=266
Fetching song:  https://www.tekstove.org/browse.php?id=265
Fetching song:  https://www.tekstove.org/browse.php?id=264
Fetching song:  https://www.tekstove.org/browse.php?id=263
Fetching song:  https://www.tekstove.org/browse.php?id=262
Fetching song:  https://www.tekstove.org/browse.php?id=261
Fetching song:  https://www.tekstove.org/browse.php?id=260
Fetching song:  https://www.tekstove.org/browse.php?id=259
Fetching song:  https://www.tekstove.org/browse.php?id=258
Fetching song:  https://www.tekstove.org/browse.php?id=257
Fetching song:  https://www.tekstove.org/browse.php?id=256
Fetching song:  https://www.tekstove.org/browse.php?id=255
Fetching song:  https://www.tekstove.org/browse.php?id=254
Fetching song:  https://www.tekstove.org/browse.php?id=2

Fetching song:  https://www.tekstove.org/browse.php?id=95
Fetching song:  https://www.tekstove.org/browse.php?id=94
Fetching song:  https://www.tekstove.org/browse.php?id=93
Fetching song:  https://www.tekstove.org/browse.php?id=92
Fetching song:  https://www.tekstove.org/browse.php?id=91
Fetching song:  https://www.tekstove.org/browse.php?id=90
Fetching song:  https://www.tekstove.org/browse.php?id=89
Fetching song:  https://www.tekstove.org/browse.php?id=88
Fetching song:  https://www.tekstove.org/browse.php?id=87
Fetching song:  https://www.tekstove.org/browse.php?id=86
Fetching song:  https://www.tekstove.org/browse.php?id=85
Fetching song:  https://www.tekstove.org/browse.php?id=84
Fetching song:  https://www.tekstove.org/browse.php?id=83
Fetching song:  https://www.tekstove.org/browse.php?id=82
Fetching song:  https://www.tekstove.org/browse.php?id=81
Fetching song:  https://www.tekstove.org/browse.php?id=80
Fetching song:  https://www.tekstove.org/browse.php?id=79
Fetching song:

## Get all song titles

In [4]:
# Lines that start with this marker carry the song title right after it.
title_marker = "Припев и куплети на песента:"

with open("chunk_lg.txt", 'r', encoding='utf-8') as lyrics_file:
    lyric_lines = lyrics_file.read().split("\n")

collected_titles = []
for line in lyric_lines:
    if not line.startswith(title_marker):
        continue
    print(line)
    # Keep the text that follows the marker, one title per output line.
    collected_titles.append(line.split(title_marker)[1] + "\n")
song_titles = "".join(collected_titles)

with open("song_titles_lg.txt", 'w', encoding='utf-8') as titles_file:
    titles_file.write(song_titles)

Припев и куплети на песента:"Андреа - Никой Друг " 
Припев и куплети на песента:"Азис - Ти ме размаза 2013 " 
Припев и куплети на песента:"DannyMusic и Борис Дали - Ти реши " 
Припев и куплети на песента:"Ани Хоанг и Крум - Целувай и хапи " 
Припев и куплети на песента:"Емилия - Ако си звезда " 
Припев и куплети на песента:"Адриана и Веско - Няма драма (официална версия ) " 
Припев и куплети на песента:"Валентина - Любов отрова " 
Припев и куплети на песента:"Валентина - Слаби ангели " 
Припев и куплети на песента:"Пламена - Не е точно любов " 
Припев и куплети на песента:"Азис - Ти ме размаза " 
Припев и куплети на песента:"Адриана - Няма драмаа " 
Припев и куплети на песента:"Адриана - Няма Няма " 
Припев и куплети на песента:"Адриана и Джамайката - Ти си шефа " 
Припев и куплети на песента:"Малина - Ако ще да звъниш " 
Припев и куплети на песента:"Райна - Виновен " 
Припев и куплети на песента:"Виолета - Сърцето ми вземи " 
Припев и куплети на песента:"Сашка Васева - Уволнение (ft.С