# Lab | Web Scraping Single Page (GNOD part 1)

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# find url and store it in a variable
url = "https://www.popvortex.com/music/charts/top-100-albums.php"

In [3]:
# download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK! (403 is not ok)

200

In [4]:
# parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><title>iTunes Top 100 Albums 2023</title><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="iTunes top 100 albums chart of the most popular album downloads in the U.S.A. right now. Current chart of the best selling albums 2023 is updated hourly." name="description"/><meta content="iTunes Top 100 Albums 2023" property="og:title"><meta content="iTunes top 100 albums chart of the most popular album downloads in the U.S.A. right now. Current chart of the best selling albums 2023 is updated hourly." property="og:description"><meta content="article" property="og:type"><meta content="https://www.popvortex.com/images/logo-facebook.png" property="og:image"/><meta content="PopVortex" property="og:site_name"/><meta content="https://www.popvortex.com/music/charts/top-100-albums.php" property="og:url"/><meta content="100000239962942" property="fb:admins"/><meta content="178831188827052" property="fb:app_id

In [5]:
#print(soup.prettify())

In [6]:
soup.select("p")
# get all elements of a tag
for p in soup("p"):
    print(p.get_text())

The top 100 most popular and best selling hit albums downloaded on iTunes today. To listen to the music or download any of the top 100 albums you must have Apple's iTunes player installed on your system. The chart of the current top 100 albums in the USA on iTunes is updated hourly.
The #1 most popular album on iTunes right now is Rockstar by Dolly Parton.
Chart of the top 100 albums last updated: November, 20 2023
Related Charts: iTunes Top New Albums  November 2023, iTunes Top 100 Songs, and iTunes Top 200 Albums
1
RockstarDolly Parton
2
New Blue SunAndré 3000
3
HigherChris Stapleton
4
CosmoOzuna
5
Welcome 2 Collegrove2 Chainz & Lil Wayne
6
1989 (Taylor's Version) [Deluxe]Taylor Swift
7
For All The Dogs Scary Hours EditionDrake
8
LoverTaylor Swift
9
GOLDENJung Kook
10
ChristmasCher
11
TROLLS Band Together (Original Motion Picture Soundtrack)Various Artists
12
Equal Strain On All PartsJimmy Buffett
13
The Hunger Games: The Ballad of Songbirds & Snakes (Music From & Inspired By)Olivia 

In [7]:
song_info = []

for song in soup.find_all('p', class_='title-artist'):
    song_title = song.find('cite', class_='title').get_text()
    artist_name = song.find('em', class_='artist').get_text()
    song_info.append((song_title, artist_name))

#print("Songs:", song_info)
top_chart_songs = pd.DataFrame(song_info, columns=['song_title', 'artist_name'])

top_chart_songs

Unnamed: 0,song_title,artist_name
0,Rockstar,Dolly Parton
1,New Blue Sun,André 3000
2,Higher,Chris Stapleton
3,Cosmo,Ozuna
4,Welcome 2 Collegrove,2 Chainz & Lil Wayne
...,...,...
95,Nostalgia,Rod Wave
96,The Dark Side of the Moon,Pink Floyd
97,Traveller,Chris Stapleton
98,ABBA Gold: Greatest Hits,ABBA


In [8]:
# Song recommendation
# Get User Input
user_input_song = input("Enter a song: ")
# Convert the user input and song names to lowercase for case-insensitive comparison
user_input_song_lower = user_input_song.lower()
top_chart_songs_lower = top_chart_songs.apply(lambda x: x.str.lower())
# Check if the user-inputted song is in the top chart songs list (case-insensitive)
if user_input_song_lower in top_chart_songs_lower['song_title'].values:
    # Filter out the user-inputted song from the DataFrame
    top_chart_songs_without_input = top_chart_songs[
        top_chart_songs_lower['song_title'] != user_input_song_lower
    ]
    # Recommend a random song from the filtered DataFrame
    recommended_song = top_chart_songs_without_input.sample().squeeze()
    print(f"You might also like: {recommended_song['song_title']} by {recommended_song['artist_name']}")
else:
    print("We can't provide a recommendation right now.")

Enter a song: nostalgia
You might also like: ROCK-STAR by Stray Kids


# Lab | Web Scraping Multiple Pages.

In [9]:
# Specify the URL of the webpage you want to scrape
url1 = 'https://top40weekly.com/top-100-songs-of-all-time/'

In [10]:
response1 = requests.get(url1)
response1.status_code # 200 status code means OK! (403 is not ok)

200

In [11]:
new_soup = BeautifulSoup(response1.content, "html.parser")
new_soup

<!DOCTYPE html>

<html class="no-js" lang="en-US" prefix="og: https://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://top40weekly.com/xmlrpc.php" rel="pingback"/>
<script data-cfasync="false" data-no-optimize="1">
(function(w, d) {
	w.adthrive = w.adthrive || {};
	w.adthrive.cmd = w.adthrive.cmd || [];
	w.adthrive.plugin = 'adthrive-ads-3.5.5';
	w.adthrive.host = 'ads.adthrive.com';
	w.adthrive.integration = 'plugin';

	var commitParam = (w.adthriveCLS && w.adthriveCLS.bucket !== 'prod' && w.adthriveCLS.branch) ? '&commit=' + w.adthriveCLS.branch : '';

	var s = d.createElement('script');
	s.async = true;
	s.referrerpolicy='no-referrer-when-downgrade';
	s.src = 'https://' + w.adthrive.host + '/sites/5cacd09cb13d6c3ef8c8bbde/ads.min.js?referrer=' + w.encodeURIComponent(w.location.href) + commitParam + '&cb=' + (Math.floor(Math.random() * 100) + 1) + '';
	var n = d.getElementsByTagName('script')[0];
	

In [12]:
new_soup.select('div > div > figure > table > tbody > tr:nth-child(16) > td:nth-child(2)')[0].get_text()

'YEAH!Usher Feat. Lil Jon & LudacrisUS Release: 2004'

In [13]:
#print(new_soup.prettify())

In [14]:
# Look for the song name and artist name in the soup
def scrape_music_info(url):
    response = requests.get(url)

    if response.status_code == 200:
        new_soup = BeautifulSoup(response.text, 'html.parser')

        # Find all table rows in the provided HTML snippet
        rows = new_soup.find_all('tr')

        music_info_list = []

        for row in rows:
            # Find all cells in the current row
            cells = row.find_all('td')

            # Check if there are at least two cells in the row
            if len(cells) >= 2:
                # Extract the text content of the cells
                song_name = cells[1].get_text(strip=True)
                # Extract the artist name from the link text
                artist_name = cells[1].find('br').get_text(strip=True)  

                # Append the extracted information to the list
                music_info_list.append([song_name, artist_name])

        return music_info_list
    else:
        return None

music_info_list = scrape_music_info(url1)
music_info_list


[['THE TWISTChubby CheckerUS Release: 1960', ''],
 ['SMOOTHSantanaFeat. Rob ThomasUS Release: 1999', ''],
 ['MACK THE KNIFEBobby DarinUS Release: 1959', ''],
 ['UPTOWN FUNK!Mark Ronson Feat. Bruno MarsUS Release: 2015', ''],
 ['HOW DO I LIVELeann RimesUS Release: 1997', ''],
 ['PARTY ROCK ANTHEMLMFAO Feat. Lauren Bennett & GoonRockUS Release: 2011',
  ''],
 ['I GOTTA FEELINGThe Black Eyed PeasUS Release: 2009', ''],
 ['MACARENA (BAYSIDE BOYS MIX)Los Del RioUS Release: 1996', ''],
 ['SHAPE OF YOUEd SheeranUS Release: 2017', ''],
 ['PHYSICALOlivia Newton-JohnUS Release: 1981', ''],
 ['YOU LIGHT UP MY LIFEDebby BooneUS Release: 1977', ''],
 ['HEY JUDEThe BeatlesUS Release: 1968', ''],
 ['CLOSERThe Chainsmokers Feat. HalseyUS Release: 2016', ''],
 ['WE BELONG TOGETHERMariah CareyUS Release: 2005', ''],
 ['UN-BREAK MY HEARTToni BraxtonUS Release: 1996', ''],
 ['YEAH!Usher Feat. Lil Jon & LudacrisUS Release: 2004', ''],
 ['BETTE DAVIS EYESKim CarnesUS Release: 1981', ''],
 ['ENDLESS LOVEDian

In [15]:
import re
music_info_list = [
    '- THE TWISTChubby Checker','- SMOOTHSantanaFeat. Rob Thomas','- MACK THE KNIFEBobby Darin',
    'UPTOWN FUNK! - Mark Ronson Feat. Bruno Mars','- HOW DO I LIVELeann Rimes','- PARTY ROCK ANTHEMLMFAO Feat. Lauren Bennett & GoonRock',
    '- I GOTTA FEELINGThe Black Eyed Peas','MACARENA (BAYSIDE BOYS MIX) - Los Del Rio','- SHAPE OF YOUEd Sheeran',
    'PHYSICALOLIVIA NEWTON- - John','- YOU LIGHT UP MY LIFEDebby Boone','- HEY JUDEThe Beatles',
    '- CLOSERThe Chainsmokers Feat. Halsey','- WE BELONG TOGETHERMariah Carey','UN- - BREAK MY HEARTToni Braxton',
    'YEAH! - Usher Feat. Lil Jon & Ludacris','- BETTE DAVIS EYESKim Carnes','- ENDLESS LOVEDiana Ross &Lionel Richie',
    'TONIGHT’S THE NIGHT (GONNA BE ALRIGHT) - Rod Stewart','YOU WERE MEANT FOR ME/ - FOOLISH GAMESJewel',
    '(EVERYTHING I DO) - I DO IT FOR YOUBryan Adams','I’ - LL MAKE LOVE TO YOUBoyz II Men',
    'THE THEME FROM “A SUMMER PLACE” - Percy Faith & His Orchestra','- LE FREAKChic','- HOW DEEP IS YOUR LOVEBee Gees',
    '- EYE OF THE TIGERSurvivor','- WE FOUND LOVERihanna Feat. Calvin Harris','LOWFLO RIDA FEAT. T- - Pain',
    '- JUST WANT TO BE YOUR EVERYTHINGAndy Gibb','- TOO CLOSENext','- EVERY BREATH YOU TAKEThe Police',
    '- SOMEBODY THAT I USED TO KNOWGotye Feat. Kimbra','- DESPACITOLuis Fonsi & Daddy Yankee Feat. Justin Bieber',
    'FLASHDANCE… - WHAT A FEELINGIrene Cara','- ROLLING IN THE DEEPAdele','TOSSIN’ AND TURNIN’ - Bobby Lewis',
    '- THE BATTLE OF NEW ORLEANSJohnny Horton','- ONE SWEET DAYMariah Carey & Boyz II Men',
    '- TRULY MADLY DEEPLYSavage Garden','- SILLY LOVE SONGSWings','LET’ - S GET IT ONMarvin Gaye',
    '- NIGHT FEVERBee Gees','- ANOTHER ONE BITES THE DUSTQueen','- SAY SAY SAYPaul McCartney&Michael Jackson',
    '- HOW YOU REMIND MENickelback','- TIE A YELLOW RIBBON ROUND THE OLE OAK TREEDawn Feat. Tony Orlando',
    'IT’ - S ALL IN THE GAMETommy Edwards','- I WANT TO HOLD YOUR HANDThe Beatles',
    '- SHADOW DANCINGAndy Gibb','- CALL ME MAYBECarly Rae Jepsen','BLURRED LINESROBIN THICKE FEAT. T.I. + - Pharrell',
    'CANDLE IN THE WIND ‘97/ - SOMETHING ABOUT THE WAY YOU LOOK TONIGHTElton John','- NO ONEAlicia Keys',
    '- I WILL ALWAYS LOVE YOUWhitney Houston','- END OF THE ROADBoyz II Men','- BOOM BOOM POWThe Black Eyed Peas',
    '- CALL MEBlondie','- LET ME LOVE YOUMario','STAYIN’ - ALIVEBee Gees','- LADYKenny Rogers',
    '- TIK TOKKeSha','I’ - M A BELIEVERThe Monkees','- GOLD DIGGERKanye West Feat. Jamie Foxx',
    '- APOLOGIZETimbaland Feat. OneRepublic','- THE SIGNAce Of Base','- CENTERFOLDThe J. Geils Band',
    '- ALL ABOUT THAT BASSMeghan Trainor','(JUST LIKE) - STARTING OVERJohn Lennon','- ROYALSLorde',
    '- THE BOY IS MINEBrandy & Monica','BECAUSE I LOVE YOU (THE POSTMAN SONG) - Stevie B',
    'I LOVE ROCK ’ - N ROLLNJoan Jett & the Blackhearts','AQUARIUS/LET THE SUNSHINE INTHE 5TH - Dimension',
    'WHOOMP! (THERE IT IS) - Tag Team','MOVES LIKE JAGGERMAROON 5 - Feat. Christina Aguilera',
    '- EBONY AND IVORYPaul McCartney &Stevie Wonder','- RUSH RUSHPaula Abdul','Unknown',
    '- HAPPYPharrell Williams','- UPSIDE DOWNDiana Ross','SUGAR, - SUGARThe Archies',
    '- JUST THE WAY YOU AREBruno Mars','- DILEMMANelly Feat. Kelly Rowland','- I HEARD IT THROUGH THE GRAPEVINEMarvin Gaye',
    'YOU’ - RE STILL THE ONEShania Twain','- BILLIE JEANMichael Jackson','- HOT STUFFDonna Summer',
    'ROCKSTARPOST MALONE FEAT. 21 - Savage','GANGSTA’ - S PARADISECoolioFeat. L.V.',
    '- ABRACADABRATheSteve Miller Band','- PERFECTEd Sheeran','YOU’ - RE SO VAINCarly Simon',
    '- PLAY THAT FUNKY MUSICWild Cherry','SAY YOU, - SAY MELionel Richie','- MY SHARONAThe Knack',
    'ALL NIGHT LONG (ALL NIGHT) - Lionel Richie','NOTHING COMPARES 2 USINEAD O’ - Connor',
    'I SWEARALL-4- - One','- FAMILY AFFAIRMary J. Blige','- WAITING FOR A GIRL LIKE YOUForeigner'
]

# Because the previous list elements weren't separated clearly by song title and artist:
def separate_capitals(text):
    # Find the index where the capital letters stop being consecutive
    index = next((i for i, c in enumerate(text) if c.islower()), len(text))

    # Separate the capitals until the one before becoming lowercased
    song_title = text[:index-1].strip()
    artist = text[index-1:].strip()

    return artist, song_title

def clean_table(data):
    cleaned_data = []

    for entry in data:
        # Remove numbers and dashes at the beginning
        entry = re.sub(r'^\d+\s*-\s*', '', entry)

        # Separate Artist and Song Title
        artist, song_title = separate_capitals(entry)

        # Remove unwanted characters
        artist = re.sub(r'\W', ' ', artist)
        song_title = re.sub(r'\W', ' ', song_title)

        # Append cleaned entry to the result
        cleaned_data.append({
            'artist_name': artist.strip(),
            'song_title': song_title.strip()
        })

    return cleaned_data
# Clean the data and create the DataFrame
cleaned_data = clean_table(music_info_list)
df = pd.DataFrame(cleaned_data)
df

Unnamed: 0,artist_name,song_title
0,Chubby Checker,THE TWIST
1,SantanaFeat Rob Thomas,SMOOTH
2,Bobby Darin,MACK THE KNIFE
3,Mark Ronson Feat Bruno Mars,UPTOWN FUNK
4,Leann Rimes,HOW DO I LIVE
...,...,...
95,Lionel Richie,ALL NIGHT LONG ALL NIGHT
96,Connor,NOTHING COMPARES 2 USINEAD O
97,One,I SWEARALL 4
98,Mary J Blige,FAMILY AFFAIR


In [16]:
# each list becomes a column
top_chart_songs2 = pd.DataFrame(df)
top_chart_songs2

Unnamed: 0,artist_name,song_title
0,Chubby Checker,THE TWIST
1,SantanaFeat Rob Thomas,SMOOTH
2,Bobby Darin,MACK THE KNIFE
3,Mark Ronson Feat Bruno Mars,UPTOWN FUNK
4,Leann Rimes,HOW DO I LIVE
...,...,...
95,Lionel Richie,ALL NIGHT LONG ALL NIGHT
96,Connor,NOTHING COMPARES 2 USINEAD O
97,One,I SWEARALL 4
98,Mary J Blige,FAMILY AFFAIR


In [17]:
# Concatenate with keys to differentiate the sources
top_songs = pd.concat([top_chart_songs, top_chart_songs2])
top_songs

Unnamed: 0,song_title,artist_name
0,Rockstar,Dolly Parton
1,New Blue Sun,André 3000
2,Higher,Chris Stapleton
3,Cosmo,Ozuna
4,Welcome 2 Collegrove,2 Chainz & Lil Wayne
...,...,...
95,ALL NIGHT LONG ALL NIGHT,Lionel Richie
96,NOTHING COMPARES 2 USINEAD O,Connor
97,I SWEARALL 4,One
98,FAMILY AFFAIR,Mary J Blige


In [18]:
# Getting all lower str
top_songs = top_songs.applymap(lambda x: x.lower() if isinstance(x, str) else x)
top_songs

Unnamed: 0,song_title,artist_name
0,rockstar,dolly parton
1,new blue sun,andré 3000
2,higher,chris stapleton
3,cosmo,ozuna
4,welcome 2 collegrove,2 chainz & lil wayne
...,...,...
95,all night long all night,lionel richie
96,nothing compares 2 usinead o,connor
97,i swearall 4,one
98,family affair,mary j blige


In [19]:
# Song recommendation
# Get User Input
user_input_song2 = input("Enter a song: ")
# Convert the user input and song names to lowercase for case-insensitive comparison
user_input_song_lower = user_input_song2.lower()
top_chart_songs_lower = top_chart_songs2.apply(lambda x: x.str.lower())
# Check if the user-inputted song is in the top chart songs list (case-insensitive)
if user_input_song_lower in top_chart_songs_lower['song_title'].values:
    # Filter out the user-inputted song from the DataFrame
    top_chart_songs_without_input = top_chart_songs2[
        top_chart_songs_lower['song_title'] != user_input_song_lower
    ]
    # Recommend a random song from the filtered DataFrame
    recommended_song = top_chart_songs_without_input.sample().squeeze()
    print(f"You might also like: {recommended_song['song_title']} by {recommended_song['artist_name']}")
else:
    print("We can't provide a recommendation right now.")

Enter a song: family affair
You might also like: TOSSIN  AND TURNIN by Bobby Lewis
