In [1]:
import random
import time
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
from selenium.webdriver.support.wait import WebDriverWait
def document_initialised(driver):
    """Evaluate the JavaScript expression ``initialised`` in the current page.

    The signature (a single ``driver`` argument) matches what
    ``WebDriverWait(...).until`` expects of a wait condition, although no
    visible cell passes it to one.

    Args:
        driver: object exposing ``execute_script(script)``.

    Returns:
        Whatever ``driver.execute_script("return initialised")`` returns.
    """
    script = "return initialised"
    result = driver.execute_script(script)
    return result

In [3]:
# Start a Chrome WebDriver session; this module-level `driver` is the object
# handed to PostScraper in a later cell.
driver = webdriver.Chrome()

In [4]:
class Post():
    """Scrapes one post element from a Quora search-results page.

    The instance keeps the post's WebElement and the driver that owns it.
    ``master_scrape`` runs every step in order and stores the results on the
    instance; ``get_post_details`` returns them as a dict.

    Attributes are ``None`` until the corresponding ``scrape_*`` method ran:
        text: text of the expanded post.
        community_name: text of the community ("tribe") name element.
        author: href of the first profile link in the post ("" if none).
        commenters: profile hrefs found in the comment container, author excluded.
        upvoters: profile hrefs found in the upvote popup ([] if unavailable).
    """

    # CSS selectors are tied to Quora's current markup and may need updating.
    _VIDEO_SELECTOR = "div.q-box.qu-mt--small.standalone_featurable"
    _TEXT_SELECTOR = ".q-text.qu-display--block.qu-wordBreak--break-word.qu-textAlign--start"
    _COMMUNITY_SELECTOR = ".q-text.puppeteer_test_tribe_name"
    _UPVOTE_POPUP_SELECTOR = (
        "div.q-box.qu-overflowY--auto.qu-display--flex.qu-flexDirection--column"
        ".ScrollBox___StyledBox-sc-1t8bc7j-0.eEjJKQ"
    )

    # JavaScript that detaches the element passed as arguments[0] from the DOM.
    _REMOVE_NODE_SCRIPT = """
            var element = arguments[0];
            element.parentNode.removeChild(element);
            """

    def __init__(self, post_element, driver):
        """Store the post element and its driver; nothing is scraped yet.

        Args:
            post_element: WebElement of the post container.
            driver: WebDriver instance that ``post_element`` belongs to.
        """
        self.post_element = post_element
        self.driver = driver
        self.text = None
        self.community_name = None
        self.author = None
        self.commenters = None
        self.upvoters = None

    def wait(self):
        """Sleep for a random whole number of seconds (1 or 2)."""
        time.sleep(random.randint(1,2))

    @staticmethod
    def _profile_links(hrefs):
        """Return the unique hrefs containing "profile", in first-seen order.

        ``None`` and empty hrefs are skipped instead of raising ``TypeError``.
        """
        return list(dict.fromkeys(href for href in hrefs if href and "profile" in href))

    def _profile_links_in(self, container):
        """Collect the profile hrefs of every ``<a href>`` below ``container``."""
        anchors = container.find_elements(By.XPATH, ".//a[@href]")
        # Each href is fetched exactly once (one WebDriver round trip per link).
        return self._profile_links(anchor.get_attribute("href") for anchor in anchors)

    def _remove_elements(self, elements):
        """Detach each element from the DOM, pausing after every removal."""
        for element in elements:
            self.driver.execute_script(self._REMOVE_NODE_SCRIPT, element)
            self.wait()

    def move_into_view(self):
        """Scroll the page so the post element is visible, then pause."""
        self.driver.execute_script("arguments[0].scrollIntoView();", self.post_element)
        self.wait()

    def remove_videos(self):
        """Remove the video containers inside the post from the DOM."""
        self._remove_elements(self.post_element.find_elements(By.CSS_SELECTOR, self._VIDEO_SELECTOR))

    def remove_images(self):
        """Remove every ``<img>`` inside the post from the DOM."""
        self._remove_elements(self.post_element.find_elements(By.TAG_NAME, "img"))

    def click(self):
        """Click on the post to get its expanded view, then pause."""
        # Use the driver this post was created with, not the notebook global.
        actions = webdriver.ActionChains(self.driver)
        # NOTE(review): the original comment says "click on top"; where this
        # offset lands depends on the offset origin used by the installed
        # Selenium version (top-left corner vs. element centre) - confirm.
        actions.move_to_element_with_offset(self.post_element, 0, self.post_element.size["height"]/2-10)
        actions.click()
        actions.perform()
        self.wait()

    def scrape_text(self):
        """Wait up to 10 s for the text element and store its text in ``self.text``.

        NOTE(review): the lookup is document-wide (first match on the page),
        not scoped to this post - confirm that is intended.
        """
        # `until` returns the located element, so no second lookup is needed.
        text_element = WebDriverWait(self.driver, timeout=10).until(
            lambda d: d.find_element(By.CSS_SELECTOR, self._TEXT_SELECTOR)
        )
        self.text = text_element.text
        self.wait()

    def scrape_community_name(self):
        """Wait up to 10 s for the community name element and store its text."""
        name_element = WebDriverWait(self.driver, timeout=10).until(
            lambda d: d.find_element(By.CSS_SELECTOR, self._COMMUNITY_SELECTOR)
        )
        self.community_name = name_element.text
        self.wait()

    def scrape_author(self):
        """Store the first profile link found in the post as ``self.author``.

        ``self.author`` becomes "" when the post contains no profile link.
        """
        anchors = self.post_element.find_elements(By.XPATH, ".//a[@href]")
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        # Lazy: stops querying hrefs at the first profile link.
        self.author = next((href for href in hrefs if href and "profile" in href), "")

    def scrape_upvotes(self):
        """Open the upvote popup and store the upvoters' profile links.

        Best effort: if the "View upvotes" link or the popup cannot be found,
        ``self.upvoters`` is set to ``[]``.
        """
        upvoter_links = []
        popup_found = False
        try:
            # ".//" keeps the search inside this post; a leading "//" would
            # search the whole document even when called on an element.
            upvote_text = self.post_element.find_element(By.XPATH, ".//*[text()='View upvotes']")
            upvote_text.click()

            # wait for the upvote popup to load
            time.sleep(2)

            upvote_popup = self.driver.find_element(By.CSS_SELECTOR, self._UPVOTE_POPUP_SELECTOR)
            popup_found = True
            upvoter_links = self._profile_links_in(upvote_popup)

            time.sleep(2)
        except Exception:
            # Deliberately best effort (e.g. the post has no "View upvotes"
            # link). `Exception` rather than a bare except so that a kernel
            # interrupt is not swallowed.
            pass

        if popup_found:
            # Closing is handled separately so that a failure here does not
            # throw away the links that were already collected.
            try:
                close_button = self.driver.find_element(By.CLASS_NAME, "q-click-wrapper")
                close_button.click()
                self.wait()
            except Exception as exc:
                print("error closing upvote popup:", repr(exc))

        self.upvoters = upvoter_links

    def scrape_comments(self):
        """Expand the comments and store the commenters' profile links.

        The author's own profile link is excluded from ``self.commenters``.

        Raises:
            Whatever Selenium raises when the comment container is missing or
            cannot be clicked; only the optional "View more comments" step is
            best effort.
        """
        comments_container = self.post_element.find_element(By.CLASS_NAME, "comment_and_ad_container")
        comments_container.click()

        # now, comments container is expanded. Click on view more comments
        self.wait()

        try:
            # Relative XPath: only look inside this post's comment container.
            view_more_comments = comments_container.find_element(By.XPATH, ".//*[text()='View more comments']")
            self.wait()
            view_more_comments.click()
            self.wait()
        except Exception:
            # No "View more comments" control (or it is not clickable):
            # continue with the comments that are already shown.
            pass

        # now get the profile links of all the commenters
        comment_links = self._profile_links_in(comments_container)

        # remove the profile link of the post author
        self.commenters = [link for link in comment_links if link != self.author]

        self.wait()

    def master_scrape(self):
        """Run every scraping step for this post in the required order."""
        self.move_into_view()
        self.remove_videos()
        self.click()
        self.remove_images()
        self.scrape_community_name()
        self.scrape_author()
        self.scrape_text()
        self.scrape_upvotes()
        self.scrape_comments()

    def get_post_details(self):
        """Return the scraped fields as a dict (values are None if not scraped)."""
        return {
            "text": self.text,
            "community_name": self.community_name,
            "author": self.author,
            "commenters": self.commenters,
            "upvoters": self.upvoters
        }

In [5]:
class PostScraper():
    """Drives a Quora post search and scrapes the posts it lists.

    Attributes:
        driver: WebDriver used for every page interaction.
        scraped_posts: list of dicts returned by ``Post.get_post_details``.
        visible_posts: post elements found by the latest ``get_new_posts``.
        search_query: query string for the Quora search.
        total_post_count: number of posts to process before ``run`` stops.
        scraped_post_count: number of posts processed so far; a post whose
            scrape failed is still counted.
        epoch: number of completed iterations of the ``run`` loop.
    """

    _POST_SELECTOR = ".q-box.qu-borderBottom.qu-px--medium.qu-pt--medium"

    def __init__(self, driver, search_query, total_post_count):
        self.driver = driver
        self.scraped_posts = []
        self.visible_posts = []
        self.search_query = search_query

        self.total_post_count = total_post_count
        self.scraped_post_count = 0
        self.epoch = 0

    def wait(self):
        """Sleep for a random whole number of seconds (1 or 2)."""
        time.sleep(random.randint(1, 2))

    def open_search_page(self):
        """Load the search page, retrying until at least one post is visible."""
        # Encode the query so spaces, "&", "#", ... cannot corrupt the URL.
        search_url = "https://www.quora.com/search?q=" + quote_plus(self.search_query) + "&type=post"
        while len(self.visible_posts) == 0:
            try:
                self.driver.get(search_url)
                self.wait()
                self.get_new_posts()
            except Exception as exc:
                # `Exception` (not a bare except) keeps the retry loop
                # interruptible from the notebook.
                print("error opening search page:", repr(exc))
                self.wait()

    def scrape_single_post(self, post):
        """Scrape one post element and append its details to ``scraped_posts``.

        Failures are reported and skipped so one bad post does not stop the run.
        """
        try:
            post_wrapper = Post(post, self.driver)
            post_wrapper.master_scrape()
            self.scraped_posts.append(post_wrapper.get_post_details())
        except Exception as exc:
            print("error scraping post:", repr(exc))
            self.wait()

    def scrape_visible_posts(self):
        """Scrape and then remove each visible post, up to ``total_post_count``."""
        for post in self.visible_posts:
            # Stop as soon as the requested number of posts has been processed.
            if self.scraped_post_count >= self.total_post_count:
                break
            self.scrape_single_post(post)
            self.scraped_post_count += 1
            self.remove_post(post)

    def remove_post(self, post):
        """Detach the post element from the DOM, then pause."""
        self.driver.execute_script("""
        var element = arguments[0];
        element.parentNode.removeChild(element);
        """, post)
        self.wait()

    def get_new_posts(self):
        """Refresh ``visible_posts`` with the post elements currently on the page."""
        try:
            # Use the driver given to this scraper, not the notebook global.
            self.visible_posts = self.driver.find_elements(By.CSS_SELECTOR, self._POST_SELECTOR)
        except Exception as exc:
            print("error finding posts:", repr(exc))
            # Do not keep the previous (already removed) elements around.
            self.visible_posts = []
            self.wait()

    def run(self, max_idle_epochs=10):
        """Scrape posts until ``total_post_count`` posts have been processed.

        Args:
            max_idle_epochs: stop after this many consecutive epochs in which
                no post was found on the page; ``None`` disables the check
                and keeps polling indefinitely.
        """
        self.open_search_page()
        idle_epochs = 0
        while self.scraped_post_count < self.total_post_count:
            self.get_new_posts()
            idle_epochs = 0 if self.visible_posts else idle_epochs + 1
            self.scrape_visible_posts()
            self.epoch += 1
            if max_idle_epochs is not None and idle_epochs >= max_idle_epochs:
                print(
                    "no new posts found; stopping after processing",
                    self.scraped_post_count, "of", self.total_post_count, "posts",
                )
                break
            self.wait()

In [6]:
# Run the scraper on the Quora post search for "jeeadvanced" with a target of
# 20 posts; results accumulate in post_scraper.scraped_posts.
post_scraper = PostScraper(driver, "jeeadvanced", 20)
post_scraper.run()

{'text': '" Crack JEE Advanced with Maths Dada\'s Ultimate Challenge! Can you solve it? #\nJEEAdvanced\n#Mathematics\n#ChallengeAccepted\n"', 'community_name': None, 'author': None, 'commenters': ['https://www.quora.com/profile/Shreyas-Srivastava-46', 'https://www.quora.com/profile/Maths-Dada', 'https://www.quora.com/profile/Abhinavan-OS', 'https://www.quora.com/profile/ARYAN-GUPTA-2856', 'https://www.quora.com/profile/Yuvan-Sujit', 'https://www.quora.com/profile/Uday-Veer-200'], 'upvoters': ['https://www.quora.com/profile/Naman-Sharma-1348', 'https://www.quora.com/profile/Arman-Karn', 'https://www.quora.com/profile/Fahad-Ibrar-4', 'https://www.quora.com/profile/Aditya-6678', 'https://www.quora.com/profile/Whiteshadow-24', 'https://www.quora.com/profile/Young-Dumb-Stupid', 'https://www.quora.com/profile/Sho-680', 'https://www.quora.com/profile/Study-Account-187', 'https://www.quora.com/profile/Srashti-20', 'https://www.quora.com/profile/Shabnam-Devi-8', 'https://www.quora.com/profile/P

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=116.0.5845.140)
Stacktrace:
0   chromedriver                        0x0000000104f8265c chromedriver + 4318812
1   chromedriver                        0x0000000104f7ad00 chromedriver + 4287744
2   chromedriver                        0x0000000104bac644 chromedriver + 296516
3   chromedriver                        0x0000000104beb6ac chromedriver + 554668
4   chromedriver                        0x0000000104bdff90 chromedriver + 507792
5   chromedriver                        0x0000000104bdf858 chromedriver + 505944
6   chromedriver                        0x0000000104c22d28 chromedriver + 781608
7   chromedriver                        0x0000000104bde178 chromedriver + 500088
8   chromedriver                        0x0000000104bdefc0 chromedriver + 503744
9   chromedriver                        0x0000000104f42c40 chromedriver + 4058176
10  chromedriver                        0x0000000104f47160 chromedriver + 4075872
11  chromedriver                        0x0000000104f0ae68 chromedriver + 3829352
12  chromedriver                        0x0000000104f47c4c chromedriver + 4078668
13  chromedriver                        0x0000000104f1ff08 chromedriver + 3915528
14  chromedriver                        0x0000000104f64140 chromedriver + 4194624
15  chromedriver                        0x0000000104f642c4 chromedriver + 4195012
16  chromedriver                        0x0000000104f744d0 chromedriver + 4261072
17  libsystem_pthread.dylib             0x00000001854dbfa8 _pthread_start + 148
18  libsystem_pthread.dylib             0x00000001854d6da0 thread_start + 8


In [7]:
# Show the post dictionaries collected so far.
post_scraper.scraped_posts

[{'text': '" Crack JEE Advanced with Maths Dada\'s Ultimate Challenge! Can you solve it? #\nJEEAdvanced\n#Mathematics\n#ChallengeAccepted\n"',
  'community_name': None,
  'author': None,
  'commenters': ['https://www.quora.com/profile/Shreyas-Srivastava-46',
   'https://www.quora.com/profile/Maths-Dada',
   'https://www.quora.com/profile/Abhinavan-OS',
   'https://www.quora.com/profile/ARYAN-GUPTA-2856',
   'https://www.quora.com/profile/Yuvan-Sujit',
   'https://www.quora.com/profile/Uday-Veer-200'],
  'upvoters': ['https://www.quora.com/profile/Naman-Sharma-1348',
   'https://www.quora.com/profile/Arman-Karn',
   'https://www.quora.com/profile/Fahad-Ibrar-4',
   'https://www.quora.com/profile/Aditya-6678',
   'https://www.quora.com/profile/Whiteshadow-24',
   'https://www.quora.com/profile/Young-Dumb-Stupid',
   'https://www.quora.com/profile/Sho-680',
   'https://www.quora.com/profile/Study-Account-187',
   'https://www.quora.com/profile/Srashti-20',
   'https://www.quora.com/profil