Importing the required modules

In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import json

I'm going to write a web crawler to extract information about NCT DREAM's discography.

Defining the URL of the page we want to scrape

In [9]:
# Address of the discography page that the later requests.get(url) call downloads.
url = 'https://kprofiles.com/nct-dream-discography/'

Fetching the web page and parsing its HTML

In [48]:
# Download the page. requests applies no timeout by default, so set one
# explicitly to keep this cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)

# Stop here with a clear message when the page was not served successfully.
# Silently skipping the parse would leave `soup` and `data_to_collect`
# undefined and make the following cells fail with an unrelated NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Request to {url} failed with HTTP status {response.status_code}"
    )

# Parse the HTML with the parser bundled with Python's standard library.
soup = BeautifulSoup(response.text, 'html.parser')

# Accumulator that the following cells append extracted text to.
data_to_collect = []

I'm going to extract the content under the "NCT DREAM Discography" section

In [50]:
# Look up the first <div> whose class attribute is exactly this string.
discography_section = soup.find("div", class_="entry-content herald-entry-content")

# find() returns None when nothing matches, so only read the text on a hit.
if discography_section:
    section_text = discography_section.get_text()
    data_to_collect.append(section_text)

print(data_to_collect)

["NCT DREAM Discography\n\nThe bolded tracks are the title tracks of said album.\n\n\nChewing Gum\n1st Pre-Release Single\n\nRelease Date: August 27, 2016Chewing GumChewing Gum (Chinese Ver.)The First\n1st Mini Album\n\nRelease Date: February 9, 2017My First and LastMy First and Last (Chinese Ver.)Dunk ShotChewing GumChewing Gum (Chinese Ver.)We Young\n2nd Mini Album\n\nRelease Date: August 17, 2017We YoungLa La LoveWalk You HomeMy PageWe Young (Chinese Ver.)Trigger the FeverJoy\n1st Single\n\nRelease date: December 15, 2017\nJoyJoy (Instrumental)NCT 2018 Empathy\n1st Studio Album\n\nRelease Date: March 14, 2018Intro: Neo Got My BackBoss (NCT U)Baby Don’t Stop (NCT U)Go (NCT DREAM)Touch (NCT 127)Yestoday (NCT U)Black On BlackTimeless (NCT U)The 7th Sense (NCT U)Without You (NCT U)Without You (Chinese Ver.) (NCT U)Dream in a DreamOutro: VisionWe Go Up\n3rd Mini Album\n\nRelease Date: September 3, 2018We Go Up1, 2, 3Beautiful TimeDrippinDear DREAMWe Go Up (Chinese Ver.)Candle Light\n2nd 

Filtering down to the song titles only

In [25]:
# Gather every <ol> element in the parsed page; later cells reuse this result.
ordered_lists = soup.find_all('ol')

# Flatten the text of each <li> under every <ol> into the shared collection,
# keeping page order.
data_to_collect.extend(
    entry.text
    for track_list in ordered_lists
    for entry in track_list.find_all('li')
)

print(ordered_lists)

[<ol><li><strong>Chewing Gum</strong></li><li>Chewing Gum (Chinese Ver.)</li></ol>, <ol><li><strong>My First and Last</strong></li><li><strong>My First and Last (Chinese Ver.)</strong></li><li>Dunk Shot</li><li><strong>Chewing Gum</strong></li><li>Chewing Gum (Chinese Ver.)</li></ol>, <ol><li>We Young</li><li>La La Love</li><li>Walk You Home</li><li>My Page</li><li><strong>We Young (Chinese Ver.)</strong></li><li>Trigger the Fever</li></ol>, <ol><li><strong>Joy</strong></li><li>Joy (Instrumental)</li></ol>, <ol><li>Intro: Neo Got My Back</li><li><strong>Boss (NCT U)</strong></li><li><strong>Baby Don’t Stop (NCT U)</strong></li><li><strong>Go (NCT DREAM)</strong></li><li><strong>Touch (NCT 127)</strong></li><li><strong>Yestoday (NCT U)</strong></li><li><strong>Black On Black</strong></li><li>Timeless (NCT U)</li><li><strong>The 7th Sense (NCT U)</strong></li><li>Without You (NCT U)</li><li>Without You (Chinese Ver.) (NCT U)</li><li>Dream in a Dream</li><li>Outro: Vision</li></ol>, <ol><

Organizing the data

In [45]:
# One list of whitespace-stripped <li> texts per <ol>, in page order.
cleaned_data = []

for track_list in ordered_lists:
    stripped_texts = (
        entry.get_text(strip=True) for entry in track_list.find_all('li')
    )
    # Entries that are empty after stripping are dropped; the list for this
    # <ol> is recorded even when nothing is left in it.
    cleaned_data.append([title for title in stripped_texts if title])

# Show each group on one line, numbered from 1, with its entries comma-separated.
for index, item in enumerate(cleaned_data, start=1):
    print(f"{index}. {', '.join(item)}")

1. Chewing Gum, Chewing Gum (Chinese Ver.)
2. My First and Last, My First and Last (Chinese Ver.), Dunk Shot, Chewing Gum, Chewing Gum (Chinese Ver.)
3. We Young, La La Love, Walk You Home, My Page, We Young (Chinese Ver.), Trigger the Fever
4. Joy, Joy (Instrumental)
5. Intro: Neo Got My Back, Boss (NCT U), Baby Don’t Stop (NCT U), Go (NCT DREAM), Touch (NCT 127), Yestoday (NCT U), Black On Black, Timeless (NCT U), The 7th Sense (NCT U), Without You (NCT U), Without You (Chinese Ver.) (NCT U), Dream in a Dream, Outro: Vision
6. We Go Up, 1, 2, 3, Beautiful Time, Drippin, Dear DREAM, We Go Up (Chinese Ver.)
7. Candle Light, Candle Light (Instrumental)
8. Don’t Need Your Love, Don’t Need Your Love (Instrumental)
9. Fireflies
10. Boom, Stronger, 119, Bye My First.., Best Friend, Dream Run
11. Ridin’, Quiet Down, 7 Days, Love Again, Puzzle Piece
12. Make A Wish (NCT U), Misfit (NCT U), Volcano (NCT U), Light Bulb (NCT U), Dancing In The Rain (NCT U), Interlude: Past to Present, Deja Vu (N

Separating each album's songs into a numbered list.

In [46]:
# One formatted string per non-empty <ol>: its entries numbered from 1 and
# joined with newlines.
song_list = []

for ol_element in ordered_lists:
    stripped_texts = (
        li_element.get_text(strip=True) for li_element in ol_element.find_all('li')
    )
    titles = [title for title in stripped_texts if title]

    # An <ol> with no usable entries contributes nothing to song_list.
    if titles:
        # enumerate(start=1) yields the display number directly, so no manual
        # "+ 1" offset is needed.
        song_list.append(
            "\n".join(
                f"{number}. {title}" for number, title in enumerate(titles, start=1)
            )
        )

# Print each formatted block as-is; no running index is needed here.
for formatted_tracks in song_list:
    print(formatted_tracks)

1. Chewing Gum
2. Chewing Gum (Chinese Ver.)
1. My First and Last
2. My First and Last (Chinese Ver.)
3. Dunk Shot
4. Chewing Gum
5. Chewing Gum (Chinese Ver.)
1. We Young
2. La La Love
3. Walk You Home
4. My Page
5. We Young (Chinese Ver.)
6. Trigger the Fever
1. Joy
2. Joy (Instrumental)
1. Intro: Neo Got My Back
2. Boss (NCT U)
3. Baby Don’t Stop (NCT U)
4. Go (NCT DREAM)
5. Touch (NCT 127)
6. Yestoday (NCT U)
7. Black On Black
8. Timeless (NCT U)
9. The 7th Sense (NCT U)
10. Without You (NCT U)
11. Without You (Chinese Ver.) (NCT U)
12. Dream in a Dream
13. Outro: Vision
1. We Go Up
2. 1, 2, 3
3. Beautiful Time
4. Drippin
5. Dear DREAM
6. We Go Up (Chinese Ver.)
1. Candle Light
2. Candle Light (Instrumental)
1. Don’t Need Your Love
2. Don’t Need Your Love (Instrumental)
1. Fireflies
1. Boom
2. Stronger
3. 119
4. Bye My First..
5. Best Friend
6. Dream Run
1. Ridin’
2. Quiet Down
3. 7 Days
4. Love Again
5. Puzzle Piece
1. Make A Wish (NCT U)
2. Misfit (NCT U)
3. Volcano (NCT U)
4. Li

Now that I have succeeded in tidying up the data, I'm going to save it to a CSV file and a JSON file.

In [47]:
# CSV export: one row per entry of song_list, with that entry as the only column.
# newline='' lets the csv module control line endings itself.
with open('nct_dream_discography.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv.writer(csvfile).writerows([entry] for entry in song_list)

# JSON export: the same list written as an indented array; ensure_ascii=False
# keeps non-ASCII characters readable instead of emitting \u escapes.
with open('nct_dream_discography.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(song_list, jsonfile, ensure_ascii=False, indent=4)