## Using BeautifulSoup

BeautifulSoup is a library that parses HTML and builds a tree of Python objects that we can navigate, extract information from, and edit.

In [2]:
#!conda install -y bs4

#### Convert the raw HTML string to a BeautifulSoup object

In [3]:
import requests
from bs4 import BeautifulSoup

### Task 1: Extract the song titles, URLs and lyrics with Beautiful Soup

In [4]:
def search_url(url):
    """
    Download a web page, cache its HTML on disk and return it as a string.

    Parameters
    ----------
    url : str
        Address of the page to request.

    Returns
    -------
    str
        The decoded body of the HTTP response (``response.text``).

    Side effects
    ------------
    * Prints the HTTP status code of the response.
    * Overwrites ``response_files/response.txt`` with the page text
      (the directory is created when it is missing).
    * Sleeps for 3 seconds after the request to throttle repeated calls.

    Raises
    ------
    requests.RequestException
        If the request fails or does not answer within the timeout.
    """
    import os
    import time

    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, headers=header, timeout=30)
    time.sleep(3)
    print(f'The status code is: {response.status_code}')
    filename = "response_files/response.txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character a web page may contain.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

    return response.text

In [5]:
def find_songs(class_of,saving,artist):
    """
    Build a DataFrame of songs (title, link, artist, lyric) from the parsed page.

    The song list is read from the module-level ``soup`` object, which must
    already hold the parsed artist page when this function is called.

    Parameters
    ----------
    class_of : str
        CSS class of the elements that wrap each song link on the page.
    saving : bool
        When truthy, every lyric is also written to
        ``response_files/lyrics/<title>.txt``.
    artist : str
        Label stored in the ``artist`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``links``, ``artist`` and ``song_lyric``. A lyric
        that could not be downloaded or located on its page stays ``'blank'``.
    """
    import os
    import time

    import pandas as pd

    song_items = soup.find_all(class_=class_of)
    # Only the first line of the anchor text is the title; the rest is
    # newline-separated padding.
    titles = [item.a.text.split('\n')[0].strip() for item in song_items]
    links = [item.a['href'] for item in song_items]

    # Collect lyrics in a plain list and assign the column once. The previous
    # chained assignment (df['song_lyric'][i] = ...) may write to a temporary
    # copy and is silently ignored under pandas copy-on-write.
    lyrics = []
    for link in links:
        lyric_text = 'blank'
        try:
            page = BeautifulSoup(requests.get(link).text, 'html.parser')
            lyric_text = page.find('p', attrs={'class': 'song-lyrics'}).text
        except (requests.RequestException, AttributeError) as error:
            # AttributeError: the page has no <p class="song-lyrics"> element.
            print(f'Could not fetch lyrics from {link}: {error}')
        lyrics.append(lyric_text)
        # Pause after every request, successful or not, to stay polite.
        time.sleep(3)

    df = pd.DataFrame({'title': titles, 'links': links})
    df['artist'] = artist
    df['song_lyric'] = lyrics

    if saving:
        lyrics_dir = 'response_files/lyrics'
        os.makedirs(lyrics_dir, exist_ok=True)
        for title, lyric_text in zip(titles, lyrics):
            # A path separator inside a title would otherwise be read as a
            # sub-directory.
            safe_title = title.replace('/', '_').replace(os.sep, '_')
            with open(f'{lyrics_dir}/{safe_title}.txt', 'w', encoding='utf-8') as f:
                f.write(lyric_text)
    return df

In [6]:

# Bob Dylan: download the album index, parse it, then collect every song.
# find_songs() reads the module-level `soup`, so `soup` has to be rebound
# to the right page before each call.
text = search_url('https://lyrics.az/bob-dylan/allalbums.html')
soup = BeautifulSoup(text,'html.parser')
df1 = find_songs('mt-3 col-sm-9 col-12 list-group mt-sm-0',True,'bob_dylan')

# Eminem: same three steps; `text` and `soup` are overwritten on purpose.
text = search_url('https://lyrics.az/eminem/allalbums.html')
soup = BeautifulSoup(text,'html.parser')
df2 = find_songs('mt-3 col-sm-9 col-12 list-group mt-sm-0',True,'eminem')

# text = search_url('https://lyrics.az/simon-and-garfunkel/allsongs.html')
# soup = BeautifulSoup(text,'html.parser')
# df3 = find_songs('px-0 mx-0 mb-5 col-12 col-sm-6 table-responsive',True,'simon&granfunkel')


The status code is: 200
The status code is: 200


In [14]:
# Persist each artist's scraped frame as a CSV file under save_frames_artists/.
import os

os.makedirs('save_frames_artists', exist_ok=True)
df1.to_csv(path_or_buf='save_frames_artists/bob_dylan.csv')
df2.to_csv(path_or_buf='save_frames_artists/eminem.csv')

# Bag of words

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def vect_fit_transf(sample, df):
    """
    Turn a collection of lyrics into a bag-of-words count matrix.

    Parameters
    ----------
    sample : iterable of str
        The documents to vectorise, e.g. ``df.song_lyric[:2]``.
    df : pandas.DataFrame
        Frame the sample came from; its ``artist`` column labels the rows.
        It must have as many rows as ``sample`` has documents.

    Returns
    -------
    X_df : pandas.DataFrame
        One row per document and one column per vocabulary term, indexed by
        artist.
    y_df : pandas.Series
        The ``artist`` column of ``df``.
    """
    # English stop words are dropped, and so is any term that appears in more
    # than 80% of the documents. ngram_range=(1, 2) could be added for bigrams.
    vectorizer = CountVectorizer(stop_words='english', max_df=0.8)
    X = vectorizer.fit_transform(sample)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # toarray() yields a plain ndarray instead of the discouraged np.matrix.
    X_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df['artist'])
    y_df = df.artist

    return X_df , y_df

##### How can we remove the most common words?

* Using a list of stop words
* Removing the words that appear in more than X% of documents

In [None]:
# Bag-of-words features and artist labels, one pair per scraped frame.
X_df, y_df = vect_fit_transf(df1['song_lyric'], df1)
X_dfb, y_dfb = vect_fit_transf(df2['song_lyric'], df2)

In [None]:
# Only the last expression of a cell is displayed, so both artists' shapes
# are combined into one expression instead of two separate lines.
(X_df.shape, y_df.shape), (X_dfb.shape, y_dfb.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Preview the first rows of the bag-of-words frame built from df1.
X_df.head()

In [None]:
from collections import Counter

# Counter(X_df) would iterate the column labels and count each word exactly
# once. Sum every column over all songs first so the counter holds the
# corpus-wide total per word.
c = Counter(X_df.sum(axis=0, numeric_only=True).to_dict())
print(c.most_common(3))

    

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF weights for df1's lyrics, one row per song, indexed by artist.
# Note: this rebinds X_df, replacing the bag-of-words frame built earlier.
vectorizer = TfidfVectorizer(tokenizer=None)
X = vectorizer.fit_transform(df1.song_lyric[:])
X_df = pd.DataFrame(
    X.toarray(),
    # get_feature_names() was removed in scikit-learn 1.2.
    columns=(vectorizer.get_feature_names_out()
             if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names()),
    index=df1['artist'],
)

In [None]:
# TF-IDF weights for df2's lyrics, one row per song, indexed by artist.
# Note: this rebinds `vectorizer` and X_dfb; the vocabulary is fitted on df2 only.
vectorizer = TfidfVectorizer(tokenizer=None)
Xb = vectorizer.fit_transform(df2.song_lyric[:])
X_dfb = pd.DataFrame(
    Xb.toarray(),
    # get_feature_names() was removed in scikit-learn 1.2.
    columns=(vectorizer.get_feature_names_out()
             if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names()),
    index=df2['artist'],
)

In [None]:
# Display the TF-IDF frame built from df1's lyrics.
X_df

In [None]:
# Display the TF-IDF frame built from df2's lyrics.
X_dfb

# Exploratory Data Analysis
1. $\color{blue}{\text{Clean data}}$

In [None]:
# dropna() returns a new frame and leaves the original untouched, so the
# result has to be bound to a name for the cleaning to take effect.
# NOTE(review): vectoriser output is not expected to contain NaN, so this is
# presumably a no-op safeguard; if rows were dropped, y_df / y_dfb would need
# the same filtering.
X_df = X_df.dropna()
X_dfb = X_dfb.dropna()

In [None]:
# One print per statement: `print(a), print(b)` builds a tuple of the two
# None return values, which the notebook then displays as `(None, None)`.
print(X_df.shape)
print(y_df.shape)
print(X_dfb.shape)
print(y_dfb.shape)

In [None]:
# Move the 'artist' index into a regular column. The guard makes the cell
# safe to re-run: an unconditional in-place reset raises KeyError the second
# time because the index is no longer named 'artist'.
if X_df.index.name == 'artist':
    X_df = X_df.reset_index()

In [None]:
# Move the 'artist' index into a regular column. The guard makes the cell
# safe to re-run: an unconditional in-place reset raises KeyError the second
# time because the index is no longer named 'artist'.
if X_dfb.index.name == 'artist':
    X_dfb = X_dfb.reset_index()

In [None]:
# How often each distinct value occurs in the column for the word "and".
X_df["and"].value_counts()

In [None]:

import matplotlib.pyplot as plt

# label= gives plt.legend() an entry to draw; without any labelled artist
# matplotlib only emits a warning and shows an empty legend.
X_df['1910'].hist(figsize=(12,6), density=True, bins=10, label='1910')
# The title names the column that is actually plotted.
plt.title("Histogram for the '1910' column of X_df")
plt.xlabel('Values')
plt.ylabel('Relative frequency of the values')
plt.legend()
plt.show()

In [None]:
# Squared L2 norm of every row. Only numeric columns are squared: once the
# 'artist' index has been reset into a column, X_df also holds strings, and
# np.square() fails on them.
np.square(X_df.select_dtypes(include='number')).sum(axis=1)

In [None]:
from sklearn.linear_model import LogisticRegression

# coef_ only exists on a fitted model, and telling two artists apart needs
# both of them in one feature space. The lyrics are therefore stacked and
# vectorised with a dedicated vectoriser, leaving `vectorizer` untouched.
lyrics_all = pd.concat([df1, df2], ignore_index=True)
clf_vectorizer = TfidfVectorizer()
model = LogisticRegression()
model.fit(clf_vectorizer.fit_transform(lyrics_all['song_lyric']), lyrics_all['artist'])

# get_feature_names() was removed in scikit-learn 1.2.
feature_names = (clf_vectorizer.get_feature_names_out()
                 if hasattr(clf_vectorizer, 'get_feature_names_out')
                 else clf_vectorizer.get_feature_names())
# Words sorted by coefficient, ascending. For a two-class model the largest
# coefficients favour model.classes_[1] and the smallest model.classes_[0].
ranked_words = np.asarray(feature_names)[np.argsort(model.coef_[0])]
print(tuple(str(word) for word in ranked_words[-20:]))
print(tuple(str(word) for word in ranked_words[:20]))