In [2]:
import selenium, os, re
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from langdetect import detect

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
options.headless = True
options.add_argument("start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


  options.headless = True


## Scrape Post Data

In [8]:
df = pd.DataFrame(columns=['keyword','title', 'author', 'body', 'time', 'date', 'status', 'votes', 'num_comments', 'num_comment_pages', 'link', 'has_status_update', 'status_message', 'status_update_date'])
no_author = []
idx = 0

link_dir = 'idea_links'

# Go thorugh all files in directory
for link_file in os.listdir(link_dir):
    # Format keyword from file name
    keyword = link_file.split('_')[0]
    
    # Open file
    with open(os.path.join(link_dir, link_file)) as f:
        links = f.read().split('\n')
        
    # Iterate over all links in file
    for link in tqdm(links):
        if link == '': continue
            
        # Parse linkG
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # ONLY FOR IMPLEMENTED POSTS
        # fetch official status

        try:
            status_component = soup.find('div', class_="lia-status-comment-body")
            has_status_update = True
            status_message = status_component.text.strip()
            status_date = status_message.partition('Updated on')[2].partition('\n')[0]

        except:
            has_status_update = False
            status_message = ''
            status_date = ''

        # Retrieve the number of comment pages
        try:
            num_comment_pages = int(soup.find('li', class_=re.compile('lia-paging-page-last lia-js-data-pageNum-[0-9]+'))['class'][-1].split('-')[-1])
        except:
            num_comment_pages = 1

        # Retrieve the title
        try:
            title = ' '.join(soup.find('div', class_='lia-message-subject').getText().split())
        except:
            # No permission to view link
            continue

        body = ' '.join(soup.find('div', class_='lia-message-body-content').getText().split())

        # Locate author section
        try:
            author_span = soup.find('span', {'class':
                                             ['lia-message-byline lia-message-byline-author-date lia-component-byline-author-date lia-component-message-view-widget-byline-author-date',
                                              re.compile('lia-user-name lia-user-rank-.* lia-component-message-view-widget-author-username'),
                                             ]})

            # If there is an author, find here
            try:
                author = author_span.find('a', class_='lia-link-navigation lia-page-link lia-user-name-link')['aria-label']
                author = author.split('View Profile of ')[1]
            # Otherwise, author was removed. Value is 'user-removed'
            except:
                author = author_span.find('span', class_='anon-user').getText()
        except:
            continue

        # Locate date and time
        try:
            date = soup.find('span', class_='local-date').getText()
            time = soup.find('span', class_='local-time').getText()
        except:
            stamp = soup.find('span', class_='local-friendly-date')['title']
            date = stamp.split()[0]
            time = stamp.split()[1] + ' ' + stamp.split()[2]
        
        # Locate post status
        try:
            status_span = soup.find('span', class_=re.compile('MessageStatus lia-status lia-status-idea-.* lia-status-completed lia-component-message-status lia-component-message-view-widget-message-status'))
                                                    
            status = status_span.find('a', class_='lia-link-navigation message-status-link').getText()
        except:
            status = soup.find('span', class_='lia-img-message-type-solved lia-fa-message lia-fa-type lia-fa-solved lia-fa')
            if status != None:
                status = status['title']

        # If there is a vote count, find it
        try:
            votes = soup.find('span', class_='MessageKudosCount lia-component-kudos-widget-message-kudos-count').getText()
            votes = int(votes.replace(',', ''))
        # Otherwise, set votes to -1
        except:
            votes = -1

        # Update dataframe with new data
        df_new = pd.DataFrame({
            'keyword':keyword, 'title': title, 'author': author, 'body': body, 'time': time, 'date': date,
            'status': keyword, 'votes': votes, 'num_comments':num_comment_pages*10, 'num_comment_pages':num_comment_pages, 'link': link,
            'has_status_update': has_status_update, 'status_message': status_message, 'status_update_date': status_date
        }, index=[idx])

        df = pd.concat([df, df_new])

        idx = idx + 1
        



100%|██████████| 320/320 [08:46<00:00,  1.64s/it] 
100%|██████████| 392/392 [06:36<00:00,  1.01s/it]
100%|██████████| 648/648 [13:02<00:00,  1.21s/it]


#### Filter out non-english comments

In [9]:
def get_lang(row):
    try:
        return detect(row['body'])
    except:
        print(row['link'])
        return None
    
df['lang'] = df.apply(lambda row: get_lang(row), axis=1)

https://community.spotify.com/t5/Implemented-Ideas/Mobile-Playlist-Customization-Edit-Cover-and-Caption/idi-p/1552722?search-action-id=347302393013&search-result-uid=1552722
https://community.spotify.com/t5/Implemented-Ideas/WP8-Bring-Spotify-Free-to-Windows-Phone/idi-p/615580?search-action-id=347302527089&search-result-uid=615580
https://community.spotify.com/t5/Closed-Ideas/Names-doesnt-fit/idi-p/1447145


In [10]:
df = df.loc[df['lang'] == 'en']
df

Unnamed: 0,keyword,title,author,body,time,date,status,votes,num_comments,num_comment_pages,link,has_status_update,status_update_text,status_update_date,status_message,status_date,lang
0,implemented,[Mobile] Click lyric to go to lyric time,marcosw,To be able to go to the lyric part of a song a...,02:43 AM,‎2022-02-08,implemented,311,10,1,https://community.spotify.com/t5/Implemented-I...,True,,,"Updated on 2022-12-06\nHey folks,\n \n\n\nThan...",status_date,en
1,implemented,"[Mobile][Your Library] ""Spotify Playlists"" folder",Peter,Idea: Create a separate folder in Your Library...,08:52 AM,‎2022-01-04,implemented,102,10,1,https://community.spotify.com/t5/Implemented-I...,True,,,"Updated on 2022-10-12\nHey folks,\n \n\n\nThan...",status_date,en
2,implemented,[AutoPlay] Option to toggle Autoplay on/off ac...,nirshtuhl,The Autoplay setting isn't working for connect...,07:39 PM,‎2021-10-26,implemented,1667,370,37,https://community.spotify.com/t5/Implemented-I...,True,,,"Updated on 2023-02-07\n \nHey folks,\n \nThe ""...",status_date,en
3,implemented,[All Platforms] Support for Apple Shareplay,jtylerhartley,I would love to see Spotify adopt support for ...,07:32 PM,‎2021-10-25,implemented,106,10,1,https://community.spotify.com/t5/Implemented-I...,True,,,"Updated on 2022-05-24Hey folks,\nThanks for co...",status_date,en
4,implemented,[Desktop][Playlists] Enhance Playlist on Desktop,caduceusmi7,Please introduce the Enhance Playlist on Deskt...,03:22 PM,‎2021-10-21,implemented,620,40,4,https://community.spotify.com/t5/Implemented-I...,True,,,Updated on 2022-10-07\n Hey folks!\n\n\nThanks...,status_date,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,live,[Partnerships] Kodi Plugin (XBMC),Crazy-S,"Hey, how about an official Spotify XBMC Plugin...",12:53 AM,‎2012-07-27,live,997,70,7,https://community.spotify.com/t5/Live-Ideas/Pa...,True,,,"Updated on 2020-10-14Hey folks, We wanted to p...",status_date,en
1353,live,[Shows] TV Shows & Movies,jrjessen7,"I would love to have a feature where, similar ...",03:51 AM,‎2019-01-23,live,740,20,2,https://community.spotify.com/t5/Live-Ideas/Sh...,True,,,"Updated on 2019-09-18Hey folks, \n \nThanks f...",status_date,en
1354,live,[All Platforms] Streaming Service On Spotify f...,user-removed,It would be nice if they was a free and premiu...,12:15 AM,‎2019-05-15,live,672,10,1,https://community.spotify.com/t5/Live-Ideas/Al...,True,,,"Updated on 2019-09-18Hey folks, \n \nThanks f...",status_date,en
1355,live,[Subscription] Discount for National Service m...,lillykera,The student discount for Spotify premium helps...,10:19 PM,‎2018-08-25,live,686,10,1,https://community.spotify.com/t5/Live-Ideas/Su...,True,,,"Updated on 2019-09-16Hey folks,\nWe wanted to ...",status_date,en


# Write to file

Dataframe contains one column per useful attribute. If there is no status, the value is None. If there is no vote count, the value is -1.

In [11]:
from datetime import date

today = date.today()

df = df.drop_duplicates(subset='body')
df.to_csv('posts-en-' + today.strftime("%b-%d-%Y") + '.csv')

In [None]:
import numpy as np

df = pd.DataFrame(columns=['keyword','title', 'author', 'body', 'time', 'date', 'status', 'votes', 'num_comments', 'num_comment_pages', 'link', 'has_status_update', 'status_update_text', 'status_update_date'])
no_author = []
idx = 0

link_dir = 'idea_links'

# Go thorugh all files in directory
for link_file in os.listdir(link_dir):
    # Format keyword from file name
    keyword = link_file.split('_')[0]
    
    # Open file
    with open(os.path.join(link_dir, link_file)) as f:
        links = f.read().split('\n')
        
    # Iterate over all links in file
    for link in tqdm(links):
        if link == '': continue
            
        # Parse linkG
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # ONLY FOR IMPLEMENTED POSTS
        # fetch official status

        try:
            status_component = soup.find('div', class_="lia-status-comment-body")
            has_status_update = True
            status_message = status_component.text.strip()
            status_date = status_message.partition('Updated on')[2].partition('\n')[0]

        except:
            has_status_update = False
            status_message = ''
            status_date = ''

        # Retrieve the number of comment pages
        try:
            num_comment_pages = int(soup.find('li', class_=re.compile('lia-paging-page-last lia-js-data-pageNum-[0-9]+'))['class'][-1].split('-')[-1])
        except:
            num_comment_pages = 1

        
        ## data model for comments
        ## array of {comment_date, comment_time, poster_rank, comment_text, like_count, is_official_comment}
        # Comments scraping logic

        for comment_page in range(num_comment_pages):
            comment_page_url = f'${link}/page/${comment_page + 1}#comments'

            driver.get(comment_page_url)
            comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
            comments_list = comment_soup.findAll('lia-quilt-idea-reply-message')
            
            for comment in comments_list:
                print(comment)
                print("8" * 60)

            print(comment_page_url)


        # Retrieve the title
        try:
            title = ' '.join(soup.find('div', class_='lia-message-subject').getText().split())
        except:
            # No permission to view link
            continue

        body = ' '.join(soup.find('div', class_='lia-message-body-content').getText().split())

        # Locate author section
        try:
            author_span = soup.find('span', {'class':
                                             ['lia-message-byline lia-message-byline-author-date lia-component-byline-author-date lia-component-message-view-widget-byline-author-date',
                                              re.compile('lia-user-name lia-user-rank-.* lia-component-message-view-widget-author-username'),
                                             ]})

            # If there is an author, find here
            try:
                author = author_span.find('a', class_='lia-link-navigation lia-page-link lia-user-name-link')['aria-label']
                author = author.split('View Profile of ')[1]
            # Otherwise, author was removed. Value is 'user-removed'
            except:
                author = author_span.find('span', class_='anon-user').getText()
        except:
            continue

        # Locate date and time
        try:
            date = soup.find('span', class_='local-date').getText()
            time = soup.find('span', class_='local-time').getText()
        except:
            stamp = soup.find('span', class_='local-friendly-date')['title']
            date = stamp.split()[0]
            time = stamp.split()[1] + ' ' + stamp.split()[2]
        
        # Locate post status
        try:
            status_span = soup.find('span', class_=re.compile('MessageStatus lia-status lia-status-idea-.* lia-status-completed lia-component-message-status lia-component-message-view-widget-message-status'))
                                                    
            status = status_span.find('a', class_='lia-link-navigation message-status-link').getText()
        except:
            status = soup.find('span', class_='lia-img-message-type-solved lia-fa-message lia-fa-type lia-fa-solved lia-fa')
            if status != None:
                status = status['title']

        # If there is a vote count, find it
        try:
            votes = soup.find('span', class_='MessageKudosCount lia-component-kudos-widget-message-kudos-count').getText()
            votes = int(votes.replace(',', ''))
        # Otherwise, set votes to -1
        except:
            votes = -1

        # Update dataframe with new data
        df_new = pd.DataFrame({
            'keyword':keyword, 'title': title, 'author': author, 'body': body, 'time': time, 'date': date,
            'status': keyword, 'votes': votes, 'num_comments':num_comment_pages*10, 'num_comment_pages':num_comment_pages, 'link': link,
            'has_status_update': has_status_update, 'status_message': status_message, 'status_date': 'status_date'
        }, index=[idx])

        df = pd.concat([df, df_new])

        idx = idx + 1
        

