# 0.Import library: 

In [1]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import time
import datetime as dt

import csv

from concurrent.futures import ThreadPoolExecutor

# 1. Data collection: 

## 1.1. Initialize: 

In [2]:
# Initialize webdriver
def initialize_driver(url):
    chrome_options = webdriver.ChromeOptions()
    prefs={"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096 }
    chrome_options.add_experimental_option("prefs", prefs) # Manage image loading and run on disk cache
    chrome_options.add_argument("--headless") # Runs Chrome in headless mode
#     chrome_options.add_argument("--kiosk");
    chrome_options.add_argument('--no-sandbox') # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage') # overcome limited resource problems
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
    driver.get(url)
    return driver

In [3]:
def scrollWebpage(n_times, driver):
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(n_times):
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # Wait to load page
        time.sleep(2)
        
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        # compare 2 height (to check whether reach the end of website)
        if new_height == last_height:
            # webpage may be loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
        
        # update height
        last_height = new_height

In [4]:
def accept_cookies(driver):
    try:
      WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,
                                                        '//button[@id="onetrust-accept-btn-handler"]')))
    except Exception as err:
      print("there's no cookies")
    else:
      ck = driver.find_element_by_xpath('//button[@id="onetrust-accept-btn-handler"]') # accept cookies
      driver.execute_script("arguments[0].click();", ck)

In [5]:
def string2number(numstr):
    if numstr == "Repost" or numstr == "Like":
        return 0

    numstr = numstr.replace(',', '')
    mul = 1
    if numstr[-1] == 'K':
        numstr = numstr.replace('K', '')
    elif numstr[-1] == 'M':
        numstr = numstr.replace('M', '')
        mul = 1000000
    elif numstr[-1] == 'B':
        numstr = numstr.replace('B', '')
        mul = 1000000000
    return int(float(numstr) * mul)

In [6]:
def write_csv_file(filename, header, content):
    with open(filename, 'w', encoding='utf-8-sig', newline='') as file:
        writter = csv.writer(file, delimiter=',')
        writter.writerow(header)
        for l in content:
            writter.writerow(l)

In [7]:
# Initialize some ultilities
sleep = 3

searchStr = 'abcdefghijklmnoprstwuyxyz123456789'
base_link = 'https://www.soundcloud.com'

In [8]:
# Some variable for assigning and storing
users_id = dict() # url : id
users_list = [] # all data
users_id_assign = 0 # use for assigning id
unknow_users = dict() # url : [id, username] --> use temporary store un-stored users

playlists_id = dict()
playlists_list = []
playlists_id_assign = 0

tracks_id = dict() # url : [id, trackname]
tracks_list = []
tracks_id_assign = 0

## 1.2. Crawling users: 

In [9]:
def CrawlUsersData_from_search(page_source):
    soup = BeautifulSoup(page_source, 'html.parser')
    users = soup.find_all('div', class_="userItem sc-media g-flex-row-centered sc-px-2x sc-py-1x m-horizontal m-verified")
    
    global users_id_assign
    for user in users:
        user_url = base_link + user.find('a', class_="userItem__coverArt sc-media-image sc-mr-2x")['href']
        if user_url in users_id.keys():
            continue
        
        username = user.find('a', class_="sc-link-dark sc-link-primary").text.strip().encode('utf-8').decode('utf-8')
        
        confirm_owner = user.find('span', class_="sc-status-icon sc-status-icon-verified sc-status-icon-small sc-ir")
        if confirm_owner:
            confirm_owner = 'Yes'
        else:
            confirm_owner = 'Not yet'
        
        user_followers = user.find('li', class_="sc-ministats-item")
        if user_followers:
            user_followers = int(user.find('li', class_="sc-ministats-item")['title'].split(' ')[0].replace(',', ''))
        else:
            user_followers = 0
        
        user_detail = user.find('h3', \
                                class_='userItem__details sc-type-light sc-text-secondary sc-text-h4 sc-mt-0.5x')\
                                .find_all('div')
        detail_name, country = tuple(map(lambda x : x.text.strip().encode('utf-8').decode('utf-8'), user_detail))
        
        user_id = users_id_assign
        users_id_assign += 1
        users_id[user_url] = user_id
        
        users_list.append([user_id, username, user_url, confirm_owner, detail_name, country, user_followers])

In [10]:
def CrawlUsersData_from_page(url):
    driver = initialize_driver(url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,
                                                            '//button[@id="onetrust-accept-btn-handler"]')))
    ck = driver.find_element_by_xpath('//button[@id="onetrust-accept-btn-handler"]')
    driver.execute_script("arguments[0].click();", ck)
    page_source = driver.page_source
    driver.quit()
    soup = BeautifulSoup(page_source, 'html.parser')
    
    username = unknow_users[url][0]
    
    if soup.find('span', class_="verifiedBadge userDropbar__verifiedBadge"):
        confirm_owner = 'Yes'
    else:
        confirm_owner = 'Not yet'
        
    user_followers = int(soup.find('a', class_="infoStats__statLink sc-link-light sc-link-primary")['title']\
                         .split(' ')[0].replace(',', ''))

    detail_name = soup.find('h3', 
                            class_="profileHeaderInfo__additional g-type-shrinkwrap-block theme-dark g-type-shrinkwrap-large-secondary")
    if detail_name:
        detail_name = detail_name.text.strip().encode('utf-8').decode('utf-8')
    else:
        detail_name = ''
        
    country = soup.find('h3', 
                        class_='profileHeaderInfo__additional g-type-shrinkwrap-block theme-dark g-type-shrinkwrap-large-secondary sc-mt-1x')
    if country:
        country = country.text.strip().encode('utf-8').decode('utf-8')
    else:
        country = ''

    user_id = unknow_users[url][1]

    users_list.append([user_id, username, url, confirm_owner, detail_name, country, user_followers])

### 1.3. Crawling playlists: 

In [11]:
# This funciton is temporarily unavailable
def CrawlPlaylistData_from_page(url):
    global playlists_id_assign
    global users_id_assign
    global tracks_id_assign
    
    # open driver
    driver = initialize_driver(url)
    accept_cookies(driver)
    scrollWebpage(1000, driver)
    page_source = driver.page_source
    driver.close()
    driver.quit()
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # crawl
    pll_name = soup.find('h1', 
                         class_='soundTitle__title sc-font g-type-shrinkwrap-inline g-type-shrinkwrap-large-primary theme-dark')\
                        .text.strip().encode('utf-8').decode('utf-8')
    #crawl user information
    pll_user = soup.find('a', 
                         class_='userBadge__usernameLink sc-link-dark sc-link-primary sc-truncate sc-mr-0.5x')
    pll_username = pll_user.text.strip().encode('utf-8').decode('utf-8')
    pll_user_url = pll_user['href']
    if pll_user_url not in users_id.keys() and pll_user_url not in unknow_users.keys():
        unknow_users['pll_url'] = [users_id_assign, pll_username]
        users_id_assign += 1
        
    pll_time = soup.find('time', class_='relativeTime')['datetime']
    
    pll_tags = soup.find_all('span', class_='sc-truncate sc-tagContent')
    if pll_tags:
        pll_mainTag = pll_tags[0].text.strip().encode('utf-8').decode('utf-8')
        pll_detailTags = list(map(lambda x : x.text.strip().encode('utf-8').decode('utf-8'), pll_tags[1:]))
    else:
        pll_mainTag = ''
        pll_detailTags = []
    
    pll_stat = soup.find_all('li', class_='sc-ministats-item')
    pll_like = int(pll_stat[0]['title'].strip().split(' ')[0].replace(',', ''))
    pll_repost = int(pll_stat[1]['title'].strip().split(' ')[0].replace(',', ''))
    
    # crawl tracks
    tracks = soup.find_all('div', class_='trackItem__content sc-truncate')
    pll_tracks_id = []
    for track in tracks:
        track_url = base_link + track.find('a',
                                          class_='trackItem__trackTitle sc-link-dark sc-link-primary sc-font-light')['href']\
                                            .split('?')[0]
        if track_url not in tracks_id.keys():
            track_id = tracks_id_assign
            tracks_id_assign += 1
            tracks_id[track_url] = track_id
        else:
            track_id = tracks_id[track_url]
        
        pll_tracks_id.append(track_id)
        
        pll_url = str(url)
    # playlists_list.append([pll_name, pll_url, playlists_id[url], pll_username, pll_time, pll_mainTag, 
    #                  pll_detailTags, pll_like, pll_repost, len(pll_tracks_id), pll_tracks_id])
    # print([pll_name, pll_url, playlists_id[url], pll_username, pll_time, pll_mainTag, 
    #                  pll_detailTags, pll_like, pll_repost, len(pll_tracks_id), pll_tracks_id])
    print(playlists_id[url])

In [12]:
def CrawlPlaylistData_from_search(page_source):
    soup = BeautifulSoup(page_source, 'html.parser')
    playlists = soup.find_all('div', class_='sound__body')

    global playlists_id_assign
    global tracks_id_assign
    global users_id_assign
    # print(len(playlists))

    for playlist in playlists:
        playlist_base = playlist.find('a', class_='sc-link-primary soundTitle__title sc-link-dark sc-text-h4')
        playlist_url = base_link + playlist_base['href']
        if playlist_url in playlists_id.keys():
            continue
        
        playlist_available = playlist.find('span', class_='g-geoblocked-icon')
        if playlist_available:
            continue
        
        playlist_id = playlists_id_assign
        playlists_id[playlist_url] = playlist_id
        playlists_id_assign += 1
        
        playlist_name = playlist_base.text.strip().encode('utf-8').decode('utf-8')
        
        #-------------------------------------------------------------------------------
        # playlist_name = playlist_base.text.strip().encode('utf-8').decode('utf-8')
        # print(playlist_name)
        # if playlist_url in playlists_id:
        #     print(playlist_url)
        #     print('playlist is existed')
        #     continue
        
        # playlist_available = playlist.find('span', class_='g-geoblocked-icon')
        # if playlist_available:
        #     print(playlist_url)
        #     print('playlist is available')
        #     continue

        # playlist_id = playlists_id_assign
        # playlists_id[playlist_url] = playlist_id
        # playlists_id_assign += 1
        #-------------------------------------------------------------------------------


        playlist_time = playlist.find('time', class_="relativeTime sc-text-secondary sc-text-captions")['datetime']

        playlist_tag = playlist.find('span', class_='sc-truncate sc-tagContent')
        if playlist_tag:
            playlist_tag = playlist_tag.text.strip().encode('utf-8').decode('utf-8')
        else:
            playlist_tag = ''

        playlist_like = string2number(playlist.find('button', class_="sc-button-like sc-button-secondary sc-button sc-button-small sc-button-responsive").text)

        playlist_repost = string2number(playlist.find('button', class_="sc-button-repost sc-button-secondary sc-button sc-button-small sc-button-responsive").text)

        # crawl user
        playlist_username = playlist.find('span', class_='soundTitle__usernameText').text.strip().encode('utf-8').decode('utf-8')
        playlist_user_url = base_link + playlist.find('span', class_='soundTitle__usernameText').parent['href']

        # if playlist_user_url not in users_id.keys() and playlist_user_url not in unknow_users.keys():
        if playlist_user_url not in users_id:
            user_id = users_id_assign
            users_id[playlist_user_url] = user_id
            users_id_assign += 1

            users_list.append([user_id, playlist_username, playlist_user_url, '', '', '', ''])
        else:
            user_id = users_id[playlist_user_url]

        # crawl tracks
        playlist_tracks = playlist.find_all('div', class_='compactTrackListItem sc-media sc-border-light-bottom clickToPlay m-interactive m-playable')

        print(playlist_name, len(playlist_tracks), sep='\t')

        playlist_tracks_id = []
        for track in playlist_tracks:
            track_name = track.find('span', class_='compactTrackListItem__trackTitle sc-text-primary sc-text-h4').text.strip().encode('utf-8').decode('utf-8')

            track_username = track.find('span', class_='compactTrackListItem__user sc-text-secondary sc-text-h4 sc-mr-0.5x')
            if track_username:
                track_username = track_username.text.strip().replace('-', '').strip().encode('utf-8').decode('utf-8')
            else:
                track_username = ''
                
            track_url = base_link + track.find('span', class_='compactTrackListItem__trackTitle sc-text-primary sc-text-h4')['data-permalink-path'].split('?')[0]
            print(track_url)
            if track_url in tracks_id.keys():
                track_id = tracks_id[track_url]
            else:
                track_id = tracks_id_assign
                tracks_id[track_url] = track_id
                tracks_id_assign += 1

            playlist_tracks_id.append(track_id)

            track_play = track.find('span', class_="compactTrackListItem__plays sc-ministats sc-ministats-small  sc-ministats-plays")
            if track_play:
                track_play = string2number(track_play.text.strip())
            else:
                track_play = 0

            tracks_list.append([track_id, track_name, track_url, track_username, track_play])

        playlists_list.append([playlist_id, playlist_name, playlist_url, user_id, playlist_username, playlist_time, playlist_tag, 
                                playlist_like, playlist_repost, len(playlist_tracks_id), playlist_tracks_id])

### 1.4. Crawling tracks (This function is temporarily unavailable):

In [13]:
def CrawlingTracksData_from_page(url):
    driver = initialize_driver(url)
    accept_cookies(driver)
    page_source = driver.page_source
    driver.close()
    driver.quit()
    soup = BeautifulSoup(page_source, 'html.parser')

    track_name = soup.find('h1', class_="soundTitle__title sc-font g-type-shrinkwrap-inline g-type-shrinkwrap-large-primary theme-dark")\
                    .text.strip().encode('utf-8').decode('utf-8')
    track_username = soup.find('a', class_="sc-link-secondary").text
    
    track_time = soup.find('time', class_='relativeTime')['datetime']

    track_tag = soup.find('a', class_="sc-tag sc-tag-large").text.strip().encode('utf-8').decode('utf-8')

    

### 1.5. Crawling:

In [14]:
people_search_str = [base_link + '/search/people?q=' + c for c in searchStr]
# page_sources = []
def get_users(url):
    driver = initialize_driver(url)
    accept_cookies(driver)
    scrollWebpage(10, driver)
    page_source = driver.page_source
    driver.close()
    driver.quit()
    
    CrawlUsersData_from_search(page_source)
    
with ThreadPoolExecutor(max_workers=15) as executor:
    executor.map(get_users, people_search_str)

  driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
  ck = driver.find_element_by_xpath('//button[@id="onetrust-accept-btn-handler"]') # accept cookies


there's no cookies
there's no cookies
there's no cookies
there's no cookies
there's no cookies
there's no cookies


In [15]:
playlists_search_str = [base_link + '/search/sets?q=' + c for c in searchStr]

def get_playlists_url(url):
    driver = initialize_driver(url)
    accept_cookies(driver)
    scrollWebpage(10, driver)
    # try:
    #     more_tracks_button = WebDriverWait(driver, 10).until(
    #         EC.presence_of_all_elements_located((By.XPATH, '//a[@class="compactTrackList__moreLink sc-link-light sc-link-primary sc-border-light sc-text-h4"]')))
    # except Exception as err:
    #     print('Something went wrong!')
    # else:
    #     for bt in more_tracks_button:
    #         driver.execute_script("arguments[0].click();", bt)
    page_source = driver.page_source
    driver.close()
    driver.quit()

    CrawlPlaylistData_from_search(page_source)

with ThreadPoolExecutor(max_workers=15) as executor:
    executor.map(get_playlists_url, playlists_search_str)
# for url in playlists_search_str:
    # get_playlists_url(url)

  driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
  ck = driver.find_element_by_xpath('//button[@id="onetrust-accept-btn-handler"]') # accept cookies


there's no cookiesthere's no cookies

there's no cookies
there's no cookies
there's no cookies
there's no cookies
there's no cookies
there's no cookies
Khiloo	5
https://www.soundcloud.com/doha-refaat/x7c3jdasffzp
https://www.soundcloud.com/mahmed-badawya/aaprv4ekhfl5
https://www.soundcloud.com/khaled-diesel-3/jgmfck6uxqbm
https://www.soundcloud.com/princc-midoo12/2016-1
https://www.soundcloud.com/sameh_sa3eed/2016-1
Kent Jones	0
Kendrick Lamar - Mr. Morale & The Big Steppers (album + playlist)	5
https://www.soundcloud.com/kendrick-lamar-music/united-in-grief
https://www.soundcloud.com/ramaj-eroc/on-my-mind-alt-version
https://www.soundcloud.com/kendrick-lamar-music/n95
https://www.soundcloud.com/kendrick-lamar-music/worldwide-steppers
https://www.soundcloud.com/ramaj-eroc/i-was-never-loved
KL	5
https://www.soundcloud.com/topdawgent/kendrick-lamar-swimming-pools
https://www.soundcloud.com/ericbellinger/girls-love-rihanna
https://www.soundcloud.com/octobersveryown/lil-wayne-she-will-feat

In [21]:
len(playlists_list)

2637

In [16]:
len(users_list)

5595

In [17]:
len(tracks_list)

11886

In [20]:
playlists_list

[[0,
  'Khiloo',
  'https://www.soundcloud.com/user954636599/sets/khiloo',
  3392,
  'user954636599',
  '2017-04-21T01:45:31.000Z',
  '',
  4,
  0,
  5,
  [0, 1, 2, 3, 4]],
 [1,
  'Kent Jones',
  'https://www.soundcloud.com/user-349244966/sets/kent-jones',
  3393,
  'User 349244966',
  '2016-11-15T12:10:26.000Z',
  '',
  10,
  5,
  0,
  []],
 [2,
  'Kendrick Lamar - Mr. Morale & The Big Steppers (album + playlist)',
  'https://www.soundcloud.com/lovemenoww/sets/kendrick-lamar-mr-morale-and-the-big-steppers-album-playlist',
  3394,
  'Best Playlist Ever :)',
  '2015-07-04T01:25:55.000Z',
  'Hip-hop & Rap',
  17,
  1078,
  5,
  [5, 6, 7, 8, 9]],
 [3,
  'KL',
  'https://www.soundcloud.com/leonora_18/sets/kl',
  3395,
  'leonora_18',
  '2015-02-13T11:51:58.000Z',
  '',
  9,
  2,
  5,
  [10, 11, 12, 13, 14]],
 [4,
  'Keep Calm & Listen R3sizze Records',
  'https://www.soundcloud.com/rosesvoque/sets/7eolett7ibes',
  3396,
  'Samantha Voque',
  '2016-02-15T07:35:18.000Z',
  'R3SIZZE',
  10,
 

In [18]:
user_header = ['user_id', 'username', 'user_url', 'confirm_owner', 'detail_name', 'country', 'user_followers']
write_csv_file('./user.csv', user_header, users_list)

In [24]:
playlist_header = ['playlist_id', 'playlist_name', 'playlist_url', 'owner_id', 'username', 'playlist_time', 'playlist_tag', 
                                'like', 'repost', 'size', 'tracks']
write_csv_file('./playlist.csv', playlist_header, playlists_list)

In [26]:
track_header = ['track_id', 'track_name', 'url', 'username', 'play_time']
write_csv_file('./track.csv', track_header, tracks_list)