## Using BeautifulSoup

BeautifulSoup is a library that parses HTML and builds a tree of Python objects that we can navigate, extract information from, and edit.

In [3]:
#!conda install -y bs4

#### Convert the raw HTML string to a BeautifulSoup object

In [3]:
import requests
from bs4 import BeautifulSoup

### Task 1: Extract the song titles, URLs and lyrics with Beautiful Soup

In [4]:
def search_url(url, timeout=30):
    """
    Download a web page and return its HTML as a string.

    The page is requested with a browser-like User-Agent header, the HTTP
    status code is printed, and a copy of the body is written to
    ``response_files/response.txt`` (the directory is created if needed).
    The body is returned whatever the status code is; check the printed
    code to see whether the download succeeded.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled server blocks the cell
        forever.

    Returns
    -------
    str
        The decoded body of the response (``response.text``).

    Raises
    ------
    requests.RequestException
        If the request cannot be completed (connection error, timeout, ...).
    """
    import os
    import time

    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url, headers=header, timeout=timeout)
    # Pause after the request so repeated calls do not hammer the server.
    time.sleep(3)
    print(f'The status code is: {response.status_code}')

    filename = "response_files/response.txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Write as UTF-8 explicitly: the platform default encoding may not be able
    # to represent every character found in the page.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    return response.text

In [5]:
# Download the Bob Dylan album index page; search_url prints the status code and returns the HTML string
text = search_url('https://lyrics.az/bob-dylan/allalbums.html')

The status code is: 200


In [6]:
# Parse the downloaded HTML into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(text,'html.parser')
# Display the parsed document; find_songs (defined below) reads this `soup` variable
soup


<!DOCTYPE html>

<html lang="en">
<head>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-7578349-3" type="9fc0308aabe99e8fa920e21c-text/javascript"></script>
<script type="9fc0308aabe99e8fa920e21c-text/javascript">
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-7578349-3');
</script>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="E436FF1665FAB5BD991FBCAF8D0692FB" name="msvalidate.01"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="UeIQ2XDaGMDaY5vNdbWHqWsXU683tDZpsBzFB89b" name="csrf-token"/>
<meta content="#da532c" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<title>Bob Dylan albums | AZ Lyrics.az</title>
<meta content="Bob Dylan albums | AZ Lyrics.az" name="title"/>
<link href="https://lyrics.az/amp/bob-dylan/allalbums.html" rel="amphtml"/>
<link href="https://lyric

In [15]:
def find_songs(class_of, saving=False, artist=None):
    """
    Build a DataFrame with one row per song: title, link and lyrics.

    The song list is read from the notebook-level ``soup`` object (the
    already-parsed album page), so that cell must be run first. Every
    linked song page is then downloaded once and its lyrics are taken from
    the ``<p class="song-lyrics">`` element.

    Parameters
    ----------
    class_of : str
        CSS class of the elements that hold the song links on the page.
    saving : bool, optional
        When true, each scraped lyric is also written to
        ``response_files/lyrics/<title>.txt``. Defaults to False.
    artist : str, optional
        Accepted for interface compatibility; it is not used yet.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``links`` and ``song_lyric``. A song whose page
        could not be downloaded, or has no lyrics element, keeps the
        placeholder ``'blank'``.

    Notes
    -----
    Only the text before the first newline of the link text is used as the
    title, because the link text carries extra newline-separated content.
    """
    import os
    import re
    import time

    import pandas as pd

    # Query the page once and derive both titles and links from the same
    # elements; elements without an <a> child are skipped.
    entries = [tag for tag in soup.find_all(class_=class_of) if tag.a is not None]
    titles = [tag.a.text.split('\n')[0].strip() for tag in entries]
    links = [tag.a['href'] for tag in entries]

    # Collect lyrics in a plain list and build the frame once; assigning
    # through df['song_lyric'][i] is chained indexing and is not reliable.
    lyrics = []
    for link in links:
        lyric_text = 'blank'
        try:
            page = BeautifulSoup(requests.get(link, timeout=30).text, 'html.parser')
            node = page.find('p', attrs={'class': 'song-lyrics'})
            if node is not None:
                lyric_text = node.text
        except requests.RequestException as err:
            # Keep going with the placeholder, but say which page failed.
            print(f'Could not download {link}: {err}')
        lyrics.append(lyric_text)
        # Be polite to the server between requests, including failed ones.
        time.sleep(3)

    df = pd.DataFrame({'title': titles, 'links': links, 'song_lyric': lyrics})

    if saving:
        out_dir = os.path.join('response_files', 'lyrics')
        os.makedirs(out_dir, exist_ok=True)
        # Reuse the lyrics fetched above instead of downloading each page again.
        for title, lyric_text in zip(titles, lyrics):
            if lyric_text == 'blank':
                continue
            # Titles may contain characters that are invalid in file names.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', title) or 'untitled'
            with open(os.path.join(out_dir, f'{safe_title}.txt'), 'w', encoding='utf-8') as f:
                f.write(lyric_text)
    return df

In [16]:
# find_songs takes (class_of, saving, artist); pass all three so the call works on a fresh kernel
df = find_songs('mt-3 col-sm-9 col-12 list-group mt-sm-0', saving=False, artist='Bob Dylan')
df

Unnamed: 0,title,links,song_lyric
0,As Time Goes By,https://lyrics.az/bob-dylan/triplicate/as-time...,You must remember this\nA kiss is still a kiss...
1,I Shall Be Released,https://lyrics.az/bob-dylan/blood-in-my-eye/i-...,They say every man must need protection\nThey ...
2,All Or Nothing At All,https://lyrics.az/bob-dylan/fallen-angels/all-...,All or nothing at all\nHalf a love never appea...
3,Bye Bye Johnny,https://lyrics.az/bob-dylan/live-on-air-1986/b...,Bye and bye\nI'm breathin' a lover's sigh\nWel...
4,Jet Pilot,https://lyrics.az/bob-dylan/the-bootleg-series...,"Well, she's got jet pilot eyes from her hips o..."
...,...,...,...
75,Ballad In Plain D,https://lyrics.az/bob-dylan/another-side-of-bo...,"I once loved a girl, her skin it was bronze\nW..."
76,Untitled 1 (Baby's Black),https://lyrics.az/bob-dylan/some-other-kinds-o...,baby's black\nbeen had\nain't bad\nsmokestacke...
77,Ballad Of Hollis Brown,https://lyrics.az/bob-dylan/the-times-they-are...,Hollis Brown\nHe lived on the outside of town\...
78,Bob Dylan's Blues,https://lyrics.az/bob-dylan/the-freewheelin-bo...,"Well, the Lone Ranger and Tonto\nThey are ridi..."


#### Bag of words

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
def vect_fit_transf(sample, labels=None):
    """
    Turn a collection of texts into a bag-of-words table.

    Parameters
    ----------
    sample : iterable of str
        The documents to vectorize (for example a Series of lyrics).
    labels : iterable, optional
        One label per document (for example the artist). When given it is
        returned as a Series aligned with the rows of the word-count table.

    Returns
    -------
    (pandas.DataFrame, pandas.Series or None)
        ``X_df`` holds one row per document and one column per word, with
        the word counts as values; it keeps the index of ``sample`` when
        ``sample`` is a Series. ``y`` is the label Series named ``artist``,
        or ``None`` when no labels were passed.

    Raises
    ------
    ValueError
        If ``labels`` does not have one entry per document.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sample)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old installations that lack get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        columns = vectorizer.get_feature_names_out()
    else:
        columns = vectorizer.get_feature_names()

    index = sample.index if isinstance(sample, pd.Series) else None
    X_df = pd.DataFrame(X.toarray(), columns=columns, index=index)

    y = None
    if labels is not None:
        y = pd.Series(list(labels), index=X_df.index, name='artist')
    return X_df, y

In [None]:
# Run the vectorizing helper on the whole song_lyric column
vect_fit_transf(df.song_lyric[:])

In [27]:
# Create the vectorizer at notebook level: the one inside vect_fit_transf is local to that function
vectorizer = CountVectorizer()
# Fit on the first two lyrics only, to keep the example matrix small
X = vectorizer.fit_transform(df.song_lyric[:2])
X

<2x119 sparse matrix of type '<class 'numpy.int64'>'
	with 128 stored elements in Compressed Sparse Row format>

In [31]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
# toarray() yields a plain ndarray, whereas todense() returns the discouraged np.matrix type.
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df



Unnamed: 0,above,all,always,and,any,apply,as,be,been,blame,...,west,what,when,who,will,woman,woo,world,yet,you
0,0,0,1,5,0,1,3,0,0,0,...,0,1,1,0,1,1,1,1,0,3
1,1,1,0,0,4,0,0,2,1,1,...,2,0,0,1,0,0,0,0,1,0


##### How can we remove the most common words?

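* Using a list of stop words (for example `CountVectorizer(stop_words='english')`)
* Removing the words that appear in more than X% of documents (the `max_df` parameter of `CountVectorizer`)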

In [None]:
# Display the word-count table again.
# NOTE(review): the table rendered for this cell has Beatles/Eminem rows and a different
# vocabulary from the X_df built above, so it looks like stale output — re-run to confirm.
X_df

Unnamed: 0,all,are,here,in,is,love,loyalty,more,my,submarine,than,to,trouble,us,was,we,with,worth,yellow,yesterday
Beatles,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0
Beatles,0,0,0,1,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,1
Eminem,0,1,1,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0
Eminem,0,0,0,0,2,1,1,1,0,0,1,1,0,1,0,0,0,1,0,0
