#### **Importing libraries**

In [1]:
import os
import re
import time
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### **Initialize Driver and open the url**

In [2]:
# Initialize the Chrome WebDriver
def initialize_driver():
    """Build the Chrome WebDriver used by every scraping cell.

    Returns:
        A ``webdriver.Chrome`` instance created with two Chrome flags set:
        ``--start-maximized`` and
        ``--disable-blink-features=AutomationControlled``.
    """
    chrome_options = webdriver.ChromeOptions()
    startup_flags = (
        '--start-maximized',  # open the browser window maximized
        "--disable-blink-features=AutomationControlled",  # make automation less detectable
    )
    for flag in startup_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Function to open a URL
def open_url(driver, url):
    """Navigate ``driver`` to ``url`` and print a confirmation line.

    Args:
        driver: Object exposing a ``get(url)`` method (the Selenium driver).
        url: Address to load.
    """
    driver.get(url)
    confirmation = f"Opened URL: {url}"
    print(confirmation)

# Start the single browser session that every later cell reuses through the
# module-level `driver`, then load the Quora answer to be scraped.
driver = initialize_driver()

# Target page: one specific Quora answer (share link with tracking parameters).
url = "https://www.quora.com/Why-was-the-film-Downfall-controversial-because-it-was-made-by-Germans-instead-of-British-or-Americans/answer/Ian-Kummer?ch=10&oid=1477743811257630&share=b4167e93&srid=3e7J1q&target_type=answer"
open_url(driver, url)

Opened URL: https://www.quora.com/Why-was-the-film-Downfall-controversial-because-it-was-made-by-Germans-instead-of-British-or-Americans/answer/Ian-Kummer?ch=10&oid=1477743811257630&share=b4167e93&srid=3e7J1q&target_type=answer


#### **Extract Views**

In [3]:
# Function to extract views
def extract_views(driver):
    """Return the view count shown on the currently loaded page.

    Scans the text of every element with class ``c1h7helg`` (a site-generated
    class name; adjust it if the page markup changes) and returns the first
    number that is immediately followed by the word "view" or "views".

    The count is returned as displayed, without conversion: digits with
    optional ``.``/``,`` separators and an optional ``K``, ``M`` or ``B``
    suffix (for example ``"96.1K"``, ``"1.2M"``, ``"12,345"``, ``"1"``).

    Args:
        driver: Selenium driver (or any object with a compatible
            ``find_elements(by, value)`` method).

    Returns:
        The matched count as a string, or ``None`` when no element contains
        a view count or when the lookup raises.
    """
    # Must start with a digit so stray punctuation ("... views") is not
    # reported as a count; singular "view" and M/B suffixes are accepted too.
    view_pattern = re.compile(r'(\d[\d.,]*[KMB]?)\s+views?\b', re.IGNORECASE)
    try:
        for span in driver.find_elements(By.CLASS_NAME, 'c1h7helg'):
            match = view_pattern.search(span.text)
            if match:
                views = match.group(1)
                print(f"Views: {views}")
                return views
        print("No views information found.")
    except Exception as e:
        print(f"Error extracting views: {e}")
    return None

# Read the view count from the loaded answer page. `views` keeps the site's
# display string (or None if nothing matched); it is reused by the CSV cell.
views = extract_views(driver)

Views: 96.1K


#### **Extract Comments**

In [4]:
# Function to extract comments
def extract_comments(driver):
    """Collect the non-blank text of every ``q-text`` element on the page.

    NOTE(review): ``q-text`` is a generic class, not a comment-specific one.
    The sample output of this notebook contains entries such as 'Sign In'
    and the answer body itself, so the result is broader than "comments".
    Replace the class name once the real comment container is known.

    Args:
        driver: Selenium driver (or any object with a compatible
            ``find_elements(by, value)`` method).

    Returns:
        A list with the ``.text`` of each matching element whose text is not
        blank, in document order; an empty list if the lookup raises.
    """
    try:
        element_texts = (
            node.text for node in driver.find_elements(By.CLASS_NAME, 'q-text')
        )
        # Keep the original (unstripped) text; only all-whitespace entries are dropped.
        collected = [text for text in element_texts if text.strip()]
        print("Comments extracted" if collected else "No comments found.")
        return collected
    except Exception as e:
        print(f"Error extracting comments: {e}")
    return []

# Extract the text of every `q-text` element and display the resulting list.
# `comments` is reused by the CSV cell at the end of the notebook.
comments = extract_comments(driver)
comments

Comments extracted


['Sign In',
 'Sign In',
 "Why was the film Downfall controversial because it was made by Germans instead of British or Americans?\nIan Kummer\nAmerican Tourist in RussiaUpvoted by\nRoger Atkins\n, Masters Professional Ed History & Politics, Deakin University (2008)Author has 1.7K answers and 9.7M answer viewsSep 2\nMaybe the problem with Downfall was that it was too truthful.\nThe movie depicts Nazi Germany in its final days and hours, which is fine. But I could not help but feel something bubbling beneath the surface. Angry resentment.\nDownfall, as the name directly implies, is supposedly about the end of the war that destroyed Germany. But that is not quite it. Even the word “downfall” depicts not the end of something bad, but the end of something good, and the beginning of something bad. Downfall is about the beginning of Germany's century of humiliation.\nMost of the footage depicting Russians was cut from the theatrical release and that was probably for the best because it comes 

#### **Extract Images**

In [5]:
# Function to extract image URLs
def extract_image_urls(driver):
    """Gather the ``src`` of every ``<img>`` element that looks like a web URL.

    Args:
        driver: Selenium driver (or any object with a compatible
            ``find_elements(by, value)`` method whose elements expose
            ``get_dom_attribute(name)``).

    Returns:
        A list of ``src`` attribute values that are non-empty and start with
        ``"http"``, in document order; an empty list if the lookup raises.
    """
    try:
        # get_dom_attribute returns the attribute as written in the markup.
        raw_sources = (
            tag.get_dom_attribute('src')
            for tag in driver.find_elements(By.TAG_NAME, 'img')
        )
        image_urls = [
            source for source in raw_sources
            if source and source.startswith('http')
        ]
        print("Image URLs extracted" if image_urls else "No image URLs found.")
        return image_urls
    except Exception as e:
        print(f"Error extracting image URLs: {e}")
    return []

# Collect every absolute image URL on the page and list them one per line.
# `images` feeds both the download cell and the CSV cell.
images = extract_image_urls(driver)
for image_url in images:
    print(image_url)

Image URLs extracted
https://qph.cf2.quoracdn.net/main-thumb-292320004-50-dwsoacytqgcegjifhzphyhcbaqagiyne.jpeg
https://qph.cf2.quoracdn.net/main-qimg-364c716d657055c4eb6a92e1f0c4df37
https://qph.cf2.quoracdn.net/main-thumb-740714891-200-jqsfuatqkvhnojgjmsqlqvmmynqhprmo.jpeg
https://qph.cf2.quoracdn.net/main-thumb-690620103-200-eikrjewllqsbwinzgnnxoauabgexhspe.jpeg
https://qph.cf2.quoracdn.net/main-thumb-7398331-200-HOEAI5SoALVRITYQNCVQ78Wjr45D0v6J.jpeg
https://qph.cf2.quoracdn.net/main-thumb-354179271-200-ihamwdnafeovgntqssetbztnoxltddun.jpeg
https://qph.cf2.quoracdn.net/main-thumb-63182131-200-ikmfskgpfnrpvzmpvamnkyzrhpiyzaha.jpeg
https://qph.cf2.quoracdn.net/main-thumb-1432312610-200-endrvzyzffbkaqfjrmttsgujfnjnntbk.jpeg
https://qph.cf2.quoracdn.net/main-thumb-2001580205-200-gbdhtcnkwedjleigmqmbwjwgedhdccld.jpeg
https://qsf.cf2.quoracdn.net/-4-images.new_grid.profile_default.png-26-688c79556f251aa0.png
https://qph.cf2.quoracdn.net/main-thumb-264208325-200-otykfaahmwxohyuxvvovybraial

#### **Downloading Images**

In [6]:


# Directory to save images
output_dir = "downloaded_images"
os.makedirs(output_dir, exist_ok=True)

# Seconds to wait for the server before giving up on one image; without a
# timeout a stalled connection would hang the cell indefinitely.
download_timeout_s = 30

# Download each image. The loop variable is `image_url` (not `url`) so the
# page address stored in `url` by the navigation cell is not overwritten.
for i, image_url in enumerate(images):
    try:
        # Take the extension from the URL *path* only, so a query string or
        # fragment never leaks into the file name; default to .jpeg when the
        # path has no extension.
        file_extension = os.path.splitext(urlparse(image_url).path)[-1] or '.jpeg'

        # Construct file name
        file_name = f"image_{i+1}{file_extension}"
        file_path = os.path.join(output_dir, file_name)

        # The context manager releases the streamed connection even when the
        # status check or the file write fails.
        with requests.get(image_url, stream=True, timeout=download_timeout_s) as response:
            response.raise_for_status()  # Check for HTTP request errors

            # Save the image
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)

        print(f"Downloaded: {file_name}")
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        print(f"Failed to download {image_url}. Error: {e}")

print("Download complete!")

Downloaded: image_1.jpeg
Downloaded: image_2.jpeg
Downloaded: image_3.jpeg
Downloaded: image_4.jpeg
Downloaded: image_5.jpeg
Downloaded: image_6.jpeg
Downloaded: image_7.jpeg
Downloaded: image_8.jpeg
Downloaded: image_9.jpeg
Downloaded: image_10.png
Downloaded: image_11.jpeg
Downloaded: image_12.png
Downloaded: image_13.jpeg
Downloaded: image_14.jpeg
Downloaded: image_15.jpeg
Downloaded: image_16.jpeg
Download complete!


#### **Share counts**

In [7]:
# Function to extract the share count
def get_share_count(driver):
    """Open the shares popup and return the text of its heading.

    Waits up to 10 seconds for the "View ..." links, clicks the first one
    whose text contains "shares" (case-insensitive), then waits up to 10
    seconds for the large bold heading and returns its text.

    The popup is left open when this function returns.

    Args:
        driver: Selenium driver with the answer page loaded.

    Returns:
        The heading text as displayed (e.g. ``"97 Shares"``), or ``None``
        when no shares link exists or any step raises.
    """
    try:
        view_links = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'q-click-wrapper') and contains(text(), 'View')]")
            )
        )
        # First "View ..." link that mentions shares, if any.
        share_link = next(
            (link for link in view_links if "shares" in link.text.lower()),
            None,
        )
        if not share_link:
            print("No share count element found.")
            return None

        share_link.click()
        heading = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'q-text qu-dynamicFontSize--large qu-bold')]")
            )
        )
        share_count = heading.text
        print(f"Share count: {share_count}")
        return share_count
    except Exception as e:
        print(f"Error extracting share count: {e}")
        return None


# Opens the shares popup and reads its heading. The popup stays open
# afterwards; the next cell (profile names) waits for that popup and closes it.
share_count = get_share_count(driver)

Share count: 97 Shares


#### **Get share profile names**

In [8]:
# Function to extract profile names
def get_profile_names(driver):
    """Scroll the open popup to the end and return the listed profile names.

    Precondition: the popup must already be open (this function waits up to
    10 seconds for its container but never opens it itself).

    The popup is scrolled repeatedly, pausing 2 seconds after each scroll,
    until its ``scrollHeight`` stops growing. Names are then read from the
    ``puppeteer_test_tribe_name`` spans. Finally the function tries to close
    the popup by clicking the first 24x24 ``svg`` on the page and waits 10
    seconds.

    Args:
        driver: Selenium driver with the popup open.

    Returns:
        A list with one entry per matching element: its text, or ``np.nan``
        when the text is empty. Returns ``[]`` if any step raises.
    """
    try:
        # Locate the popup container for scrolling
        popup = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[2]/div/div/div/div/div[2]/div/div[2]/div/div')
            )
        )

        # Scroll to the bottom of the popup until no more content loads
        last_height = driver.execute_script("return arguments[0].scrollHeight", popup)
        while True:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", popup)
            time.sleep(2)  # give newly requested entries time to render
            new_height = driver.execute_script("return arguments[0].scrollHeight", popup)
            if new_height == last_height:
                break
            last_height = new_height

        # Extract profile names
        profile_name_elements = driver.find_elements(
            By.XPATH, "//span[@class='q-text puppeteer_test_tribe_name']//span//span"
        )

        # Read .text once per element (each read is a browser round trip) and
        # mark empty names with NaN.
        element_texts = (profile.text for profile in profile_name_elements)
        profile_names = [text if text else np.nan for text in element_texts]

        print("Profile Names extracted")
        return profile_names

    except Exception as e:
        print(f"Error extracting profile names: {e}")
        return []
    finally:
        # Closing the popup is best effort. An exception raised here would
        # replace the value being returned above, so it is reported instead.
        try:
            svg_element = driver.find_element(By.XPATH, "//*[local-name()='svg' and @width='24' and @height='24']")
            svg_element.click()
            time.sleep(10)
        except Exception as e:
            print(f"Could not close the popup: {e}")

# Relies on the shares popup opened by get_share_count in the previous cell:
# get_profile_names only waits for the popup, scrolls it and then closes it.
profile_names = get_profile_names(driver)
profile_names

Profile Names extracted


['Spirit of America',
 "Mr Ajang's Space",
 "Bishal Majumdar's Space F2",
 'Carrer life',
 "Aklima Khatun's Space 4",
 "Vikash24's Space 1",
 'Healthy4U',
 'Shamim’s',
 "Hukam Singh's Space 9",
 "Rahul Hussain's Space",
 "Tobin Jerry's Space of comedy",
 "WAHEDUZ JAMAN's Space 2",
 'Jkk 2',
 "Jahidul Islam's Space 28",
 'Filmography & Entertainment',
 'TheBlue',
 "Firoj Uddin Akand's Space",
 'Gmn 2',
 "Dore Mon's Space 6",
 "Asraful Alam's Space 5",
 'Nusrat Jahan 8',
 "FULBAR ALI's Space",
 "Dark Foji's",
 "Mokibul Islam's Space 9",
 'Rx rashid',
 "Mokibul Hoque's Space 2",
 'N+K',
 'Not monetized 6',
 "Bibhuti BD's 04",
 "Shahadot Ali's Space 4",
 'I love travel***',
 "Joy Khn's Space3",
 'Altaf A',
 "Boys Team Pk's 2",
 "Engineer's Space 4",
 'Global Brainwave',
 "Jon Moni's Space 2",
 "Ankitjaat Ankit's 1",
 'Mirajul Rohman 1',
 "Masuma Masuma's Space2",
 "Shahanur Islam's Space 3",
 "Anowar Hussain's Space 20",
 "Akram Islam's Space",
 "Shahil's Shohidul islam",
 "Saidul Islam's 

#### **Get share user names**

In [9]:
# Function to extract usernames
def get_usernames(driver):
    """Open the shares popup, scroll it to the end and return the user names.

    Waits up to 10 seconds for the "View ..." links and clicks the first one
    whose text contains "shares" (case-insensitive). If none is found,
    nothing is clicked and the function still waits up to 10 seconds for a
    popup container. The popup is scrolled, pausing 2 seconds per step,
    until its ``scrollHeight`` stops growing. Finally the function tries to
    close the popup by clicking the first 24x24 ``svg`` on the page and
    waits 10 seconds.

    Args:
        driver: Selenium driver with the answer page loaded.

    Returns:
        A list with the text of every matching name element (empty strings
        included); ``[]`` if any step raises.
    """
    try:
        # Find the "View ... shares" link that opens the popup
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'q-click-wrapper') and contains(text(), 'View')]")
            )
        )
        share_element = None
        for element in elements:
            if "shares" in element.text.lower():
                share_element = element
                break

        if share_element:
            share_element.click()

        # Locate the popup container for scrolling
        popup = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[2]/div/div/div/div/div[2]/div/div[2]/div/div')
            )
        )

        # Scroll to the bottom of the popup until no more content loads
        last_height = driver.execute_script("return arguments[0].scrollHeight", popup)
        while True:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", popup)
            time.sleep(2)  # give newly requested entries time to render
            new_height = driver.execute_script("return arguments[0].scrollHeight", popup)
            if new_height == last_height:
                break
            last_height = new_height

        # Extract usernames
        user_name_elements = driver.find_elements(
            By.XPATH, "//div[contains(@class, 'q-inlineFlex qu-alignItems--center qu-wordBreak--break-word')]//span//span"
        )

        user_names = [user.text for user in user_name_elements]

        print("Usernames extracted")
        return user_names

    except Exception as e:
        print(f"Error extracting usernames: {e}")
        return []
    finally:
        # Closing the popup is best effort. An exception raised here would
        # replace the value being returned above, so it is reported instead.
        try:
            svg_element = driver.find_element(By.XPATH, "//*[local-name()='svg' and @width='24' and @height='24']")
            svg_element.click()
            time.sleep(10)
        except Exception as e:
            print(f"Could not close the popup: {e}")

# Re-opens the shares popup (the previous cell closed it), collects the user
# names and displays them. `user_names` is reused by the CSV cell.
user_names = get_usernames(driver)
user_names

Usernames extracted


['Jude',
 'Mr Ajang',
 'Bishal Majumdar',
 'Healthy Advice',
 'Aklima Khatun',
 'Vikash24',
 'Alex Radetskiy',
 "Hunter's World",
 'Shamim Mondal',
 'Hukam Singh',
 'Rahul Hussain',
 'Tobin Jerry',
 'WAHEDUZ JAMAN',
 'Kelley Elder',
 'Jeli Yolik Ke',
 'Jahidul Islam',
 'Amy',
 'David Jhon',
 'Firoj Uddin Akand',
 'Golamnabi Mostofa',
 'Dore Mon',
 'Asraful Alam',
 'Nusrat Jahan 8',
 'FULBAR ALI',
 'Dark Foji',
 'Mokibul Islam',
 'Rashidul Islam',
 'Mokibul Hoque',
 'Nazrul Islam',
 'Sahadul Islam',
 'Bibhuti BD',
 'Shahadot Ali',
 'Nural Hoque',
 'Joy Khn',
 'Momotaz Begom',
 'Boys Team Pk',
 'Engineer',
 'Ghulam Murtaza',
 'Jon Moni',
 'Ankitjaat Ankit',
 'Mirajul Rohman',
 'Masuma Masuma',
 'Shahanur Islam',
 'Anowar Hussain',
 'Akram Islam',
 'Shahil Khan',
 'Saidul Islam',
 'Saddam Hussain',
 'Faijur Rahman',
 'Washim Walif',
 'Ak Ariyan',
 'Lovely Queen',
 'Alfas Uddin Ahmed',
 'Arju Love',
 'Alfas Uddin Ahmed',
 'Yogibard',
 'MrProfGenius',
 'Mizanur Rahman',
 'Eunus Ali',
 'Base

#### **Extract upvote count**

In [10]:
# Extract upvote count
def extract_upvote_count(driver):
    """Open the upvoters popup and return the text of its heading.

    Waits up to 10 seconds for the clickable upvote link, clicks it, pauses
    10 seconds, then waits up to 40 seconds for the large bold heading that
    contains "Upvotes" and returns its text.

    The popup is left open when this function returns.

    Args:
        driver: Selenium driver with the answer page loaded.

    Returns:
        The heading text as displayed (e.g. ``"412 Upvotes"``), or ``None``
        if any step raises.
    """
    try:
        # The element is identified by several classes at once, which is a
        # CSS compound selector. By.CLASS_NAME is meant for a single class,
        # so the selector is spelled out explicitly with a leading ".".
        upvote_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".q-click-wrapper.c1nud10e.qu-display--inline-block.qu-tapHighlight--white.qu-cursor--pointer.qu-hover--textDecoration--underline")
            )
        )
        upvote_element.click()
        time.sleep(10)  # fixed pause kept from the original before polling for the heading
        upvote_count_element = WebDriverWait(driver, 40).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'q-text qu-dynamicFontSize--large qu-bold') and contains(text(),'Upvotes')]")
            )
        )
        upvote_count = upvote_count_element.text
        print(f"Upvote count: {upvote_count}")
        return upvote_count
    except Exception as e:
        print(f"Error extracting upvote count: {e}")
        return None


# Opens the upvoters popup and reads its heading. The popup stays open
# afterwards; the next cell (upvoter names) scrolls and closes it.
upvote_count = extract_upvote_count(driver)

Upvote count: 412 Upvotes


#### **Upvote names**

In [11]:
# Extract upvoter names (with scrolling functionality)
def extract_upvote_names(driver):
    """Scroll the open upvoters popup to the end and return the listed names.

    Precondition: the popup must already be open; its container is looked up
    immediately, without waiting.

    The popup is scrolled repeatedly, pausing 2 seconds after each scroll,
    until its ``scrollHeight`` stops growing. The function then waits up to
    10 seconds for the name elements. Finally it tries to close the popup by
    clicking the first 24x24 ``svg`` on the page and waits 10 seconds.

    Args:
        driver: Selenium driver with the popup open.

    Returns:
        A list with the text of every name element whose text is not blank;
        ``[]`` if any step raises.
    """
    try:
        # Find the popup container to scroll
        popup = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[2]/div/div/div/div/div[2]/div/div[2]/div/div')

        # Scroll the popup to load all names
        last_height = driver.execute_script("return arguments[0].scrollHeight", popup)

        while True:
            # Scroll to the bottom of the popup
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", popup)
            time.sleep(2)  # Allow time for new names to load

            # Check the new scroll height
            new_height = driver.execute_script("return arguments[0].scrollHeight", popup)
            if new_height == last_height:  # If the height doesn't change, we've reached the bottom
                break
            last_height = new_height
        print("Finished scrolling.")

        # Extract the names from the popup
        name_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'q-inlineFlex qu-alignItems--center qu-wordBreak--break-word')]//span//span")
            )
        )
        # Read .text once per element (each read is a browser round trip).
        element_texts = (name.text for name in name_elements)
        names = [text for text in element_texts if text.strip()]
        return names
    except Exception as e:
        print(f"Error extracting upvoter names: {e}")
        return []
    finally:
        # Closing the popup is best effort. An exception raised here would
        # replace the value being returned above, so it is reported instead.
        try:
            svg_element = driver.find_element(By.XPATH, "//*[local-name()='svg' and @width='24' and @height='24']")
            svg_element.click()
            time.sleep(10)
        except Exception as e:
            print(f"Could not close the popup: {e}")

# Relies on the upvoters popup opened by extract_upvote_count in the previous
# cell: extract_upvote_names looks the popup up, scrolls it and then closes it.
upvote_names = extract_upvote_names(driver) 
upvote_names

Finished scrolling.


['Roger Atkins',
 'Luke Baringer',
 'Matthew Gage McCloud',
 'John Doe',
 'Chris',
 'Dzaky Faisa',
 'Gryffindor',
 'Shaun Lawson',
 'Alvin Boss',
 'Michael Lucas',
 'Muhammad Bagoes Bachtiar',
 'Keith Dawe',
 'Samuel Truncale',
 'Heke Huuskonen',
 'Sage Grant',
 'Tugberk Tugcan',
 'Avis Ekalavya',
 'Alex Magot',
 'Flávio',
 'Bill Arrundale',
 'Jeroen Westmaas',
 'Fiona',
 'Elmo Sesam',
 'Michael Hansen',
 'Bryan Foltinek',
 'Olybrius',
 'Avram Singh',
 'Meijin',
 'Harsh Dharaiya',
 'Sam Argon',
 'Dylan Chester',
 'Faizan Anwar',
 'Revi Soekatno',
 'A. Gauß',
 'Aleksandr Pulnikov',
 'Santosh Kumar',
 'Christian Krill',
 'Frkn Mtl',
 'Jaichand Bihar Wale (former Qwe Rty)',
 'Aniekan Thomas',
 'Saxon',
 'Jean Paul Jones',
 'Skye Zhang',
 'Adrian Oprea',
 'Lorand Pora',
 'Hải Anh Lê',
 'Fluuton',
 'Alex McGuire',
 'Parth Shekhar',
 'Uros Markovic',
 'Yacine El Baitar',
 'Ron Szczypkowski',
 'Slyack',
 'Shubhang Jha',
 'Alex Radetskiy',
 'Igor S. R. Gleb',
 'Mehdi Fes',
 'Tudor Cornea',
 'N

#### **Downvote counts**

In [12]:
# Function to extract downvote count
def extract_downvote_count(driver):
    """Open the downvoters popup and return the text of its heading.

    Waits up to 10 seconds for the "View ..." links and clicks the first one
    whose text contains "downvotes" (case-insensitive), then waits up to 10
    seconds for the large bold heading that contains "Downvotes".

    When a popup is opened it is left open on return.

    Args:
        driver: Selenium driver with the answer page loaded.

    Returns:
        The heading text as displayed, or ``None`` when the page has no
        "View downvotes" link or any step raises.
    """
    try:
        # Locate the "View downvotes" element
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'q-click-wrapper') and contains(text(),'View')]")
            )
        )
        downvote_element = None
        for element in elements:
            if 'downvotes' in element.text.lower():
                downvote_element = element
                break

        if downvote_element:
            downvote_element.click()
            # Wait for the downvote count to appear in the modal
            downvote_count_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[contains(@class, 'q-text qu-dynamicFontSize--large qu-bold') and contains(text(),'Downvotes')]")
                )
            )
            downvote_text = downvote_count_element.text
            print(f"Downvote count: {downvote_text}")
            return downvote_text
        else:
            # This branch runs when NO "View downvotes" link exists; the
            # previous message claimed the element had been found.
            print("No 'View downvotes' element found; the answer appears to have no downvotes.")
            return None
    except Exception as e:
        print(f"Error extracting downvote count: {e}")
        return None

# Try to open the downvoters popup and read its heading. `downvote_count` is
# None when the page has no "View downvotes" link (nothing is displayed then).
downvote_count = extract_downvote_count(driver)
downvote_count

Downvote element is found but there are no downvotes.


#### **Downvote names**

In [13]:
# Function to extract downvote names
def extract_downvote_names(driver, quit_driver=True):
    """Scroll the open downvoters popup to the end and return the listed names.

    Precondition: the popup must already be open (this function waits up to
    10 seconds for its container but never opens it itself).

    The popup is scrolled repeatedly, pausing 2 seconds after each scroll,
    until its ``scrollHeight`` stops growing; the function then waits up to
    10 seconds for the name elements.

    Side effect: with the default ``quit_driver=True`` the browser session
    is shut down via ``driver.quit()`` before returning, whatever the
    outcome, so ``driver`` cannot be used afterwards.

    Args:
        driver: Selenium driver, ideally with the downvoters popup open.
        quit_driver: When true (default, the original behaviour) the driver
            is quit on exit; pass ``False`` to keep the session alive.

    Returns:
        A list with the text of every name element; ``[]`` when the popup or
        the names do not appear in time or any other step raises.
    """
    try:
        # Locate the popup container for scrolling
        popup = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[2]/div/div/div/div/div[2]/div/div[2]/div/div')
            )
        )

        # Scroll to the bottom of the popup until no more content loads
        last_height = driver.execute_script("return arguments[0].scrollHeight", popup)
        while True:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", popup)
            time.sleep(2)  # give newly requested entries time to render
            new_height = driver.execute_script("return arguments[0].scrollHeight", popup)
            if new_height == last_height:
                break
            last_height = new_height

        # Extract names of users who downvoted
        name_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'q-inlineFlex qu-alignItems--center qu-wordBreak--break-word')]//span//span")
            )
        )
        names = [name.text for name in name_elements]

        if not names:
            print("No names found")
        else:
            print(f"Downvote names: {names}")
        return names
    except TimeoutException:
        # Neither the popup nor any name appeared in time: nothing to list.
        print("No downvotes")
        return []
    except Exception as e:
        # Any other failure is a real error and must not be reported as
        # "No downvotes".
        print(f"Error extracting downvote names: {e}")
        return []
    finally:
        if quit_driver:
            driver.quit()

# Last scraping step. extract_downvote_names only waits for an already-open
# downvoters popup, and it also quits the browser on exit, so `driver` is no
# longer usable after this cell; re-run the notebook from the top to scrape again.
downvote_names = extract_downvote_names(driver)
downvote_names

No downvotes


[]

#### **Data saved to csv file**

In [14]:
import pandas as pd
import numpy as np

# Collect every scraped field as one column. The columns are independent
# lists of different lengths: scalar results are wrapped in one-element
# lists, and values that share a row are NOT related to each other (e.g. the
# n-th share profile name does not necessarily belong to the n-th user name).
data = {
    "image_url": images,
    "comments": comments,
    "views":[views],
    "share_count":[share_count],
    "share_profile_names": profile_names,
    "share_usernames": user_names,
    "upvote_count":[upvote_count],
    "upvote_names": upvote_names,
    "downvote_count":[downvote_count],
    "downvote_names": downvote_names
}

# Normalize fields with lists to strings for CSV compatibility
for key, value in data.items():
    data[key] = [str(item) if isinstance(item, list) else item for item in value]

# Convert to DataFrame; wrapping each column in a Series lets pandas pad the
# shorter columns with NaN instead of rejecting the unequal lengths.
df = pd.DataFrame({column: pd.Series(values) for column, values in data.items()})

# Save to CSV
output_csv = "output.csv"
df.to_csv(output_csv, index=False)
print("Data saved to output.csv")

# os.startfile exists only on Windows; on other platforms the attribute is
# missing, so opening the file in the default application is skipped there.
if hasattr(os, "startfile"):
    os.startfile(output_csv)

Data saved to output.csv


In [15]:
# Preview the first rows of the combined table; columns shorter than the
# longest one are padded with NaN.
df.head()

Unnamed: 0,image_url,comments,views,share_count,share_profile_names,share_usernames,upvote_count,upvote_names,downvote_count,downvote_names
0,https://qph.cf2.quoracdn.net/main-thumb-292320...,Sign In,93.1K,96 Shares,Mr Ajang's Space,Mr Ajang,408 Upvotes,Roger Atkins,,
1,https://qph.cf2.quoracdn.net/main-qimg-364c716...,Sign In,,,Bishal Majumdar's Space F2,Bishal Majumdar,,Luke Baringer,,
2,https://qph.cf2.quoracdn.net/main-thumb-740714...,Why was the film Downfall controversial becaus...,,,Carrer life,Healthy Advice,,Gryffindor,,
3,https://qph.cf2.quoracdn.net/main-thumb-690620...,Why was the film Downfall controversial becaus...,,,Aklima Khatun's Space 4,Aklima Khatun,,Shaun Lawson,,
4,https://qph.cf2.quoracdn.net/main-thumb-739833...,Why was the film Downfall controversial becaus...,,,Vikash24's Space 1,Vikash24,,Alvin Boss,,
