In [None]:
import selenium, os, re
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from langdetect import detect

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Start one shared Chrome session that every scraping cell below reuses.
options = Options()
# The `Options.headless` property was deprecated and later removed in Selenium 4;
# on current releases assigning to it only sets a plain attribute and Chrome opens
# a visible window. The command-line switch is the supported way to run headless.
options.add_argument("--headless=new")
options.add_argument("start-maximized")
# webdriver_manager downloads (or reuses a cached) chromedriver binary and returns its path.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


## Scrape Post Data

In [None]:
# Scrape every idea link listed in the files under `link_dir`.
#
# Produces two tables:
#   df  - one row per post (title, author, body, votes, status update, ...)
#   df2 - one row per comment found on the post's comment pages
#
# Rows are collected as plain dicts and converted to DataFrames once, at the end.
# Appending with pd.concat inside the loop re-copies every accumulated row on each
# iteration and triggers pandas warnings about concatenating empty frames.
POST_COLUMNS = ['keyword', 'title', 'author', 'body', 'time', 'date', 'status', 'votes', 'num_comments', 'num_comment_pages', 'link', 'has_status_update', 'status_message', 'status_update_date']
COMMENT_COLUMNS = ['keyword', 'title', 'comment_date', 'username', 'rank', 'text', 'likes', 'is_official_comment']

post_records = []
comment_records = []
comments_list = []
no_author = []

link_dir = 'idea_links'

try:
    # Go through all files in the directory. Sorted so repeated runs visit links in the same order.
    for link_file in sorted(os.listdir(link_dir)):
        link_path = os.path.join(link_dir, link_file)

        # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store)
        # that would otherwise make open() fail.
        if link_file.startswith('.') or not os.path.isfile(link_path):
            continue

        # The keyword is the part of the file name before the first underscore
        keyword = link_file.split('_')[0]

        # Read the links, one per line; blank lines and stray whitespace/'\r' are dropped
        with open(link_path) as f:
            links = [line.strip() for line in f if line.strip()]

        # Iterate over all links in the file
        for link in tqdm(links, desc=keyword):
            # Load and parse the post page
            driver.get(link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Official status update block (original note: only for implemented posts)
            status_component = soup.find('div', class_="lia-status-comment-body")
            if status_component is not None:
                has_status_update = True
                status_message = status_component.text.strip()
                # Text between 'Updated on' and the end of that line
                status_date = status_message.partition('Updated on')[2].partition('\n')[0]
            else:
                has_status_update = False
                status_message = ''
                status_date = ''

            # Retrieve the title; when it is missing the page could not be viewed
            # (no permission), so the link is skipped entirely.
            subject_div = soup.find('div', class_='lia-message-subject')
            if subject_div is None:
                continue
            title = ' '.join(subject_div.getText().split())

            # Retrieve the body. A missing body is recorded as an empty string
            # instead of aborting the whole scrape.
            body_div = soup.find('div', class_='lia-message-body-content')
            body = ' '.join(body_div.getText().split()) if body_div is not None else ''

            # Retrieve the number of comment pages from the "last page" pager item;
            # fall back to a single page when the pager is absent or unparseable.
            try:
                last_page_item = soup.find('li', class_=re.compile('lia-paging-page-last lia-js-data-pageNum-[0-9]+'))
                num_comment_pages = int(last_page_item['class'][-1].split('-')[-1])
            except (TypeError, KeyError, IndexError, ValueError):
                num_comment_pages = 1

            # Comments scraping logic.
            # For "implemented" links the query string is dropped before the page suffix is added.
            comments_base_url = link.split('?')[0] if keyword == "implemented" else link

            for comment_page in range(num_comment_pages):
                comment_page_url = f'{comments_base_url}/page/{comment_page + 1}#comments'

                driver.get(comment_page_url)
                comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
                comments_list = comment_soup.findAll('div', class_='lia-message-view-idea-reply-message')

                print("Scraping ", len(comments_list) , " comments")

                for comment_div in comments_list:
                    # Flatten the comment's text, remove the action-menu labels and
                    # left-to-right marks, then split on runs of 2+ whitespace characters.
                    raw_text = " ".join(comment_div.getText().strip().split("\n"))
                    raw_text = raw_text.replace("Mark as Read Mark as New     Bookmark     Permalink Print     Report Inappropriate Content", "")
                    raw_text = raw_text.replace("\u200e", "")
                    parsed_text = [p.strip() for p in re.split(r'\s{2,}', raw_text) if len(p.strip()) > 0]

                    # The final chunk is discarded, as before.
                    # NOTE(review): presumably trailing UI text - confirm against a live page.
                    parsed_text = parsed_text[:-1]

                    # Expected layout: username, rank, timestamp, text..., likes.
                    # Fewer than four chunks cannot be indexed; skip instead of crashing.
                    if len(parsed_text) < 4:
                        print("Skipping unparseable comment on", comment_page_url)
                        continue

                    username, rank, timestamp = parsed_text[:3]
                    likes = parsed_text[-1]

                    # Official comments come from a 'Spotify'-ranked user or announce a status change
                    is_official_comment = rank == 'Spotify' or parsed_text[3].startswith("Status changed to")

                    # Everything between the timestamp and the likes chunk is the comment text.
                    # With exactly four chunks the fourth one serves as both text and likes,
                    # which matches the previous behaviour.
                    if len(parsed_text) > 4:
                        comment_text = " ".join(parsed_text[3:-1])
                    else:
                        comment_text = parsed_text[3]

                    comment_records.append({
                        'keyword': keyword, 'title': title, 'comment_date': timestamp, 'username': username,
                        'rank': rank, 'text': comment_text, 'likes': likes, 'is_official_comment': is_official_comment
                    })

            # Locate author section; posts without one are skipped
            author_span = soup.find('span', {'class':
                                             ['lia-message-byline lia-message-byline-author-date lia-component-byline-author-date lia-component-message-view-widget-byline-author-date',
                                              re.compile('lia-user-name lia-user-rank-.* lia-component-message-view-widget-author-username'),
                                             ]})
            if author_span is None:
                continue

            # If there is an author, the profile link's aria-label holds the name
            author_link = author_span.find('a', class_='lia-link-navigation lia-page-link lia-user-name-link')
            try:
                author = author_link['aria-label'].split('View Profile of ')[1]
            except (TypeError, KeyError, IndexError):
                # Otherwise, author was removed. Value is 'user-removed'
                anon_span = author_span.find('span', class_='anon-user')
                if anon_span is None:
                    continue
                author = anon_span.getText()

            # Locate date and time; fall back to the "friendly date" tooltip when the
            # explicit date/time spans are absent.
            date_span = soup.find('span', class_='local-date')
            time_span = soup.find('span', class_='local-time')
            if date_span is not None and time_span is not None:
                post_date = date_span.getText()
                post_time = time_span.getText()
            else:
                friendly_span = soup.find('span', class_='local-friendly-date')
                stamp_parts = friendly_span.get('title', '').split() if friendly_span is not None else []
                if len(stamp_parts) >= 3:
                    post_date = stamp_parts[0]
                    post_time = stamp_parts[1] + ' ' + stamp_parts[2]
                else:
                    # No usable timestamp on the page; keep the post with empty date/time
                    post_date = None
                    post_time = None

            # Locate post status.
            # NOTE(review): this scraped value is NOT written to the output - the 'status'
            # column below is filled from `keyword`, as before. Confirm which is intended.
            status_span = soup.find('span', class_=re.compile('MessageStatus lia-status lia-status-idea-.* lia-status-completed lia-component-message-status lia-component-message-view-widget-message-status'))
            status_link = status_span.find('a', class_='lia-link-navigation message-status-link') if status_span is not None else None
            if status_link is not None:
                status = status_link.getText()
            else:
                solved_icon = soup.find('span', class_='lia-img-message-type-solved lia-fa-message lia-fa-type lia-fa-solved lia-fa')
                status = solved_icon.get('title') if solved_icon is not None else None

            # If there is a vote count, find it; otherwise, set votes to -1
            try:
                votes = soup.find('span', class_='MessageKudosCount lia-component-kudos-widget-message-kudos-count').getText()
                votes = int(votes.replace(',', ''))
            except (AttributeError, ValueError):
                votes = -1

            # Record the post.
            # NOTE(review): num_comments is the page count times 10, i.e. an estimate that
            # assumes 10 comments per page, not a count of the comments actually scraped.
            post_records.append({
                'keyword': keyword, 'title': title, 'author': author, 'body': body, 'time': post_time, 'date': post_date,
                'status': keyword, 'votes': votes, 'num_comments': num_comment_pages*10, 'num_comment_pages': num_comment_pages, 'link': link,
                'has_status_update': has_status_update, 'status_message': status_message, 'status_update_date': status_date
            })
finally:
    # Build the tables even when the scrape is interrupted or fails part-way,
    # so whatever was collected so far is still available in df / df2.
    df = pd.DataFrame(post_records, columns=POST_COLUMNS)
    df2 = pd.DataFrame(comment_records, columns=COMMENT_COLUMNS)
    # Row counters kept for compatibility with any cell that still reads them
    idx = len(df)
    idx2 = len(df2)
        



#### Filter out non-English posts

In [None]:
def get_lang(row):
    """Return the language detected for a post's body text.

    Args:
        row: a mapping-like row (e.g. a DataFrame row) with a 'body' entry
            holding the text to classify and a 'link' entry used for reporting.

    Returns:
        Whatever `detect` returns for ``row['body']``, or None when detection
        raises; in that case the row's link is printed so the post can be
        inspected by hand.
    """
    try:
        return detect(row['body'])
    # Catch ordinary errors only: a bare `except:` would also swallow
    # KeyboardInterrupt, making a long-running apply impossible to stop.
    except Exception:
        print(row['link'])
        return None
    
# Tag each post row with the result of get_lang (None where detection failed).
df['lang'] = df.apply(get_lang, axis=1)

In [None]:
# Keep only the posts whose detected language is 'en', then display the result.
english_mask = df['lang'] == 'en'
df = df.loc[english_mask]
df

# Write to file

The posts dataframe contains one column per useful attribute. The `status` column is filled from the keyword in the link file's name. If there is no vote count, the value is -1.

In [None]:
from datetime import date

# Both output files carry the run date in their name, formatted like "Jan-05-2024".
today = date.today()
date_suffix = today.strftime("%b-%d-%Y")

# Posts: keep the first row for each distinct body text, then save.
df = df.drop_duplicates(subset='body')
df.to_csv(f'posts-en-{date_suffix}.csv')

# Comments are saved exactly as collected in df2.
df2.to_csv(f'comments-en-{date_suffix}.csv')