In [1]:
# Standard Imports for all notebooks
import pandas as pd
import numpy as np

# Scraping Libraries
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from time import sleep
import re
from random import randint
from selenium.webdriver.firefox.options import Options

# Specific Imports as a result of ChatGPT's Suggested Code Functions
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [10]:
def click_with_retry(driver, css_selector, max_attempts=5, wait_time=5, extra_scroll=150):
    """
    Find the element matching a CSS selector, scroll it into view and click it.

    Each attempt waits for the element to be visible, scrolls the window to the
    element's page position minus ``extra_scroll`` pixels, waits for the element
    to be clickable and clicks the element handed back by that second wait.
    An attempt that ends in StaleElementReferenceException,
    NoSuchElementException or TimeoutException is dropped and the whole
    sequence is repeated, up to ``max_attempts`` times.

    :param driver: The Selenium WebDriver.
    :param css_selector: CSS selector of the element to be clicked.
    :param max_attempts: Maximum number of attempts to try clicking the element.
    :param wait_time: Seconds each wait (visible, then clickable) may take.
    :param extra_scroll: Pixels subtracted from the scroll target so the element
        is not flush with the top edge of the window.
    :return: True if click was successful, False if it failed after max_attempts.
    """
    scroll_script = (
        "window.scroll(0, arguments[0].getBoundingClientRect().top"
        " + window.pageYOffset - arguments[1]);"
    )

    for _ in range(max_attempts):
        try:
            locator = (By.CSS_SELECTOR, css_selector)

            # Wait for the element to be visible
            visible_element = WebDriverWait(driver, wait_time).until(
                EC.visibility_of_element_located(locator)
            )

            # Scroll the element into view with extra space
            driver.execute_script(scroll_script, visible_element, extra_scroll)

            # Wait for the element to be clickable and keep the handle this wait
            # returns: the page may have re-rendered since the first lookup, so
            # the earlier handle can already be stale.
            clickable_element = WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable(locator)
            )

            # Click the element
            clickable_element.click()
            return True

        except (StaleElementReferenceException, NoSuchElementException, TimeoutException):
            # Transient lookup/timing failure: start the attempt over.
            continue

    return False

In [11]:
def get_text_with_retry(driver, css_selector, max_attempts=5, wait_time=3, extra_scroll=150):
    """
    Return the text of the element matched by a CSS selector.

    Every attempt waits until the element is visible, scrolls the window to the
    element's page position minus ``extra_scroll`` pixels, and reads its text.
    StaleElementReferenceException, NoSuchElementException and TimeoutException
    end the current attempt and trigger the next one.

    :param driver: The Selenium WebDriver.
    :param css_selector: CSS selector of the element.
    :param max_attempts: Maximum number of attempts to try getting the text.
    :param wait_time: Time to wait for the element to become visible.
    :param extra_scroll: Pixels subtracted from the scroll target.
    :return: The text of the element if successful, None if it failed after max_attempts.
    """
    scroll_script = "window.scroll(0, arguments[0].getBoundingClientRect().top + window.pageYOffset - arguments[1]);"

    for _ in range(max_attempts):
        try:
            # Block until the target is rendered and visible.
            waiter = WebDriverWait(driver, wait_time)
            target = waiter.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, css_selector))
            )

            # Bring the target into the viewport, leaving some room above it.
            driver.execute_script(scroll_script, target, extra_scroll)

            found_text = target.text
        except (StaleElementReferenceException, NoSuchElementException, TimeoutException):
            continue
        else:
            return found_text

    return None

In [4]:
# Open the Selenium Window
# Starts a Firefox session with default options and loads the Facebook home page.
# Every later cell drives the browser through `fun_driver`.
# NOTE(review): presumably the login is completed by hand in this window before
# the scraping cells run - confirm.
fun_options = Options()
fun_driver = webdriver.Firefox(options=fun_options)
fun_driver.get("https://www.facebook.com/")

In [5]:
# Load the colon-separated sample of profiles to scrape. The file has no header
# row, so column names are supplied here; the trailing "1".."9" columns are
# unnamed extra fields.
# NOTE(review): the displayed rows suggest a timestamp field that itself contains
# ':' and therefore lands split across columns "5"-"7" - confirm against the file.
df_sample = pd.read_csv(
    "../Webscrape_Data/sample_edited.txt",
    sep=":",
    header=None,
    names=["Other_Number", "Url_Key", "Name_First", "Name_Last", "Gender",
           "1","2","3","4","5","6","7","8","9"],
    # Read the key as text from the start: letting pandas infer a number first
    # turns the column into floats as soon as one value is blank, and the later
    # string cast would then produce keys such as "1.00028359581007e+14".
    dtype={"Url_Key": str},
)
# Kept so that missing keys still become strings, as they did before.
df_sample['Url_Key'] = df_sample['Url_Key'].astype(str)

df_sample.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Other_Number,Url_Key,Name_First,Name_Last,Gender,1,2,3,4,5,6,7,8,9
0,12102965600,596897507,Sarah,Murphy,female,"San Antonio, Texas","San Antonio, Texas",Married,Stay-at-home mom,12/5/2018 12,0.0,00 AM,,
1,12102965601,100028359581007,Victor,Ortiz,male,,,,,1/1/0001 12,0.0,00 AM,,
2,12102965607,1456394131,Alida,Canion,female,,,,USAA,12/7/2015 12,0.0,00 AM,,
3,12102965613,100016005185942,Austin,Duerr,male,"Somerset, Texas","San Antonio, Texas",,AAA,11/29/2018 12,0.0,00 AM,,
4,12102965614,100009325868432,George,Gomez,male,"San Antonio, Texas","San Antonio, Texas",Single,,11/19/2018 12,0.0,00 AM,,


In [6]:
# Load the results gathered so far and show the last five rows.
# The scraping loop uses the row count of this frame to decide where to resume.
df_scraped = pd.read_csv("../Webscrape_Data/scraped_data.csv")
df_scraped.tail(5)

Unnamed: 0,Url_Key,Profile Picture Link,Name,Num_Friends,Intro 1,Intro 2,Intro 3,Intro 4,Intro 5,Overview (About),...,Details About Joel,Details About Odalyz,Details About David,Details About Paco,Details About Jamiya,Details About Rudy,Details About Jaiden,Details About Arianna,Details About Brenda,Details About Aaron
296,100002991248201,,Scraper 3 has dibs!,,,,,,,,...,,,,,,,,,,
297,100002991248201,https://scontent-sjc3-1.xx.fbcdn.net/v/t1.6435...,Aaron Mesa,486.0,Went to Theodore Roosevelt High School San Ant...,"Lives in Windcrest, Texas","From San Antonio, Texas",Married to Esperanza Mesa,Joined October 2011,No workplaces to show\nWent to Theodore Roosev...,...,,,,,,,,,,Relationship\nEsperanza Mesa\nMarried to Esper...
298,100004959281000,,Scraper 3 has dibs!,,,,,,,,...,,,,,,,,,,
299,100004959281000,,Scrape Failed,,,,,,,,...,,,,,,,,,,
300,100002985889624,,Scraper 3 has dibs!,,,,,,,,...,,,,,,,,,,


In [None]:
# Resume the scrape: visit every sample profile that is not yet listed in the
# results CSV and append whatever could be collected for it.
SCRAPED_CSV = "../Webscrape_Data/scraped_data.csv"


def _read_scraped_csv(path=SCRAPED_CSV):
    """Load the results CSV, warning about and skipping malformed rows."""
    try:
        # Current pandas spelling; "warn" keeps the warn-and-skip behaviour that
        # error_bad_lines=False used to give.
        return pd.read_csv(path, on_bad_lines="warn")
    except TypeError:
        # Older pandas does not accept on_bad_lines and needs the legacy flag.
        return pd.read_csv(path, error_bad_lines=False)


def _parse_friend_count(text):
    """Turn text such as "444 friends", "1,234 friends" or "1.2K friends" into an int.

    Returns None when the text is missing or does not start with a number, so a
    hidden or oddly formatted friend count no longer fails the whole profile.
    """
    if not text:
        return None
    match = re.match(r"\s*(\d[\d,]*(?:\.\d+)?)\s*([KkMm]?)\b", text)
    if match is None:
        return None
    number = float(match.group(1).replace(",", ""))
    scale = {"": 1, "k": 1_000, "m": 1_000_000}[match.group(2).lower()]
    return int(round(number * scale))


def _append_row(frame, row):
    """Return ``frame`` with the dict ``row`` appended as one new record."""
    return pd.concat([frame, pd.DataFrame(row, index=[0])], ignore_index=True)


# Skip ahead in the sample: start ten rows before the number of rows already in
# df_scraped. The start is clamped at 0 because a negative start would make iloc
# count from the END of the sample. The slice goes into its own name so that
# re-running this cell does not slice df_sample a second time.
# NOTE(review): presumably a rough shortcut - the membership check below is what
# actually prevents double scraping.
resume_from = max(0, df_scraped.shape[0] - 10)
df_sample_pending = df_sample.iloc[resume_from:]

for Url_Key in df_sample_pending['Url_Key']:
    # Randomised pause of roughly 0.5-1.5 seconds between profiles.
    sleep(0.5 + randint(10,9999999)/10000000)

    # Re-read the results file on every pass and check whether this key is
    # already listed in it.
    df_scraped = _read_scraped_csv()
    df_scraped['Url_Key'] = df_scraped['Url_Key'].astype(str)
    Url_Key = str(Url_Key)
    if Url_Key in df_scraped['Url_Key'].values:
        # Skip the data gathering process
        print(f"URL key {Url_Key} already exists in df_scraped, skipping data gathering.")
        continue

    # Calling Dibs on this profile: a placeholder row is written before the slow
    # page work starts.
    # NOTE(review): presumably so other scraper copies sharing the file skip this
    # key; rows they write while this profile is being scraped are overwritten by
    # the save at the bottom of the loop - confirm whether that matters.
    dict_person = {"Url_Key": Url_Key, "Name" : "Scraper Copy3 has dibs!"}
    df_scraped = _append_row(df_scraped, dict_person)
    df_scraped.to_csv(SCRAPED_CSV, index = False)

    fun_driver.get("https://www.facebook.com/" + Url_Key)
    # Gather data using Url_Key
    dict_person = {"Url_Key" : Url_Key}
    try:
        # Obtain Profile Picture
        dict_person["Profile Picture Link"] = fun_driver.find_element(by="css selector",
            value = "a.xzsf02u > div:nth-child(1) > svg:nth-child(1) > g:nth-child(2) > image:nth-child(1)"
            ).get_attribute("xlink:href")

        # Obtain Name
        dict_person["Name"] = get_text_with_retry(fun_driver, ".x14qwyeo > h1:nth-child(1)")

        # Obtain Number of Friends (None when absent or unparseable)
        dict_person["Num_Friends"] = _parse_friend_count(
            get_text_with_retry(fun_driver, "a.xi81zsa"))

        # Obtain Intro Lines from a static snapshot of the current page
        html_current = BeautifulSoup(fun_driver.page_source, "lxml")
        list_intro_el = html_current.select(
            "div.x7wzq59:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) >"
            + " div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) >"
            + " ul:nth-child(1) > div")
        for intro_number, intro_el in enumerate(list_intro_el, start=1):
            dict_person["Intro " + str(intro_number)] = intro_el.text

        # Click About Tab
        click_with_retry(fun_driver, "a.x1i10hfl:nth-child(3) > div:nth-child(1) > span:nth-child(1)")
        # Get Overview Section First
        dict_person["Overview (About)"] = get_text_with_retry(fun_driver, ".xqmdsaz")

        # Obtain About Section: click each sub-section link (children 3..8 of the
        # menu) and store the panel text under the link's own label.
        for num in range(3, 9):
            css_selector_to_click = ".x16jcvb6 > div:nth-child(" + str(num) + ") > a:nth-child(1)"
            click_with_retry(fun_driver, css_selector_to_click)
            # Re-find the link after the click to avoid a stale reference
            el_temp = fun_driver.find_element(by="css selector", value=css_selector_to_click)
            el_temp_text = el_temp.text
            css_path_content = ".xqmdsaz"
            content_text = get_text_with_retry(fun_driver, css_path_content)
            if content_text is not None:
                dict_person[el_temp_text] = content_text
            else:
                print("Failed to retrieve text for element:", el_temp_text)
    except Exception as exc:
        # Any scraping error marks the profile as failed. Catching Exception
        # rather than everything lets Ctrl-C (KeyboardInterrupt) stop the loop.
        print(f"Scrape failed for {Url_Key}: {exc!r}")
        dict_person = {"Url_Key": Url_Key, "Name" : "Scrape Failed"}

    # Append the new data to df_scraped
    df_scraped = _append_row(df_scraped, dict_person)
    df_scraped.to_csv(SCRAPED_CSV, index = False)


b'Skipping line 182: expected 31 fields, saw 34\nSkipping line 183: expected 31 fields, saw 34\nSkipping line 184: expected 31 fields, saw 34\nSkipping line 185: expected 31 fields, saw 34\nSkipping line 186: expected 31 fields, saw 34\nSkipping line 187: expected 31 fields, saw 34\nSkipping line 188: expected 31 fields, saw 34\nSkipping line 189: expected 31 fields, saw 34\nSkipping line 190: expected 31 fields, saw 34\nSkipping line 191: expected 31 fields, saw 34\n'


# Original Test Scraping

In [60]:
# Profile key used for the single-profile walkthrough in the following cells.
Url_Key = "596897507"

In [61]:
# Load the profile page for the chosen key in the open browser window.
fun_driver.get("https://www.facebook.com/" + Url_Key)

In [62]:
# Collects every scraped field for this profile; the cells below fill it in.
dict_person = {}
# Need to obtain the following:
# Profile Picture


In [63]:
# Store the key first so the resulting row can be matched to the sample.
dict_person["Url_Key"] = Url_Key

In [64]:
# Obtain Profile Picture
# The picture URL is the xlink:href attribute of the <image> node nested in the
# link's SVG.
profile_image_selector = "a.xzsf02u > div:nth-child(1) > svg:nth-child(1) > g:nth-child(2) > image:nth-child(1)"
profile_image_el = fun_driver.find_element(by="css selector", value=profile_image_selector)
dict_person["Profile Picture Link"] = profile_image_el.get_attribute("xlink:href")

In [65]:
# Obtain Name
# Text of the <h1> matched by the selector; None if it never became visible.
dict_person["Name"] = get_text_with_retry(fun_driver, ".x14qwyeo > h1:nth-child(1)")

In [66]:
# Obtain Number of Friends
# The count is the first space-separated token of the link text.
friends_link_text = get_text_with_retry(fun_driver, "a.xi81zsa")
friends_first_token = friends_link_text.split(" ")[0]
dict_person["Num_Friends"] = int(friends_first_token)

In [67]:
# Obtain Intro Lines
# Work on a parsed snapshot of the page source rather than live elements.
html_current = BeautifulSoup(fun_driver.page_source, "lxml")

# Same selector as before, assembled from adjacent string literals.
intro_items_selector = (
    "div.x7wzq59:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) >"
    " div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) >"
    " ul:nth-child(1) > div"
)
list_intro_el = html_current.select(intro_items_selector)

# One "Intro N" entry per matched item, numbered from 1.
for num, intro_el in enumerate(list_intro_el):
    intro_text = intro_el.text
    dict_person[f"Intro {num + 1}"] = intro_text
    print(intro_text)

Went to Ashland High School
Lives in San Antonio, Texas
From San Antonio, Texas
Married to Matt Murphy
Joined October 2007


In [68]:
# Click About Tab
about_tab_selector = "a.x1i10hfl:nth-child(3) > div:nth-child(1) > span:nth-child(1)"
click_with_retry(fun_driver, about_tab_selector)

# Get Overview Section First: read the content panel right after the tab click.
overview_text = get_text_with_retry(fun_driver, ".xqmdsaz")
dict_person["Overview (About)"] = overview_text

In [69]:

# Obtain About Section
# Children 3..8 of the About menu are the sub-section links; each one is clicked
# and the content panel is stored under the link's own label.
for num in range(3, 9):
    print("---------------------------------")

    # Selector of the sub-section link for this pass
    css_selector_to_click = f".x16jcvb6 > div:nth-child({num}) > a:nth-child(1)"

    # Open the sub-section
    click_with_retry(fun_driver, css_selector_to_click)

    # Look the link up again after the click to avoid a stale reference
    el_temp = fun_driver.find_element(by="css selector", value=css_selector_to_click)
    el_temp_text = el_temp.text
    print(f"temp el: {el_temp_text} clicked")

    # Read the content panel
    css_path_content = ".xqmdsaz"
    content_text = get_text_with_retry(fun_driver, css_path_content)
    if content_text is None:
        print("Failed to retrieve text for element:", el_temp_text)
        continue

    dict_person[el_temp_text] = content_text
    print("element's text hath been saved!")


---------------------------------
temp el: Work and education clicked
element's text hath been saved!
---------------------------------
temp el: Places lived clicked
element's text hath been saved!
---------------------------------
temp el: Contact and basic info clicked
element's text hath been saved!
---------------------------------
temp el: Family and relationships clicked
element's text hath been saved!
---------------------------------
temp el: Details About Sarah clicked
element's text hath been saved!
---------------------------------
temp el: Life events clicked
element's text hath been saved!


In [70]:
# Wrap the collected fields in a one-row DataFrame (this rebinds df_scraped).
df_scraped = pd.DataFrame(dict_person, index=[0])
df_scraped

Unnamed: 0,Url_Key,Profile Picture Link,Name,Num_Friends,Intro 1,Intro 2,Intro 3,Intro 4,Intro 5,Overview (About),Work and education,Places lived,Contact and basic info,Family and relationships,Details About Sarah,Life events
0,596897507,https://scontent-sjc3-1.xx.fbcdn.net/v/t39.308...,Sarah Murphy,444,Went to Ashland High School,"Lives in San Antonio, Texas","From San Antonio, Texas",Married to Matt Murphy,Joined October 2007,No workplaces to show\nWent to Ashland High Sc...,No workplaces to show\nWent to Ashland High Sc...,Work\nNo workplaces to show\nCollege\nNo schoo...,"Places lived\nSan Antonio, Texas\nCurrent city...",Contact info\nNo contact info to show\nWebsite...,Relationship\nMatt Murphy\nMarried to Matt Mur...,2017\nMarried Matt Murphy\n2011\nMatt Murphy a...


In [71]:
# Write the frame to the results CSV without the index column.
# NOTE(review): this replaces the file's contents - only run it to start a fresh
# results file.
df_scraped.to_csv("../Webscrape_Data/scraped_data.csv", index = False)

In [72]:
# Read the file back to check that the row survived the round trip.
pd.read_csv("../Webscrape_Data/scraped_data.csv")

Unnamed: 0,Url_Key,Profile Picture Link,Name,Num_Friends,Intro 1,Intro 2,Intro 3,Intro 4,Intro 5,Overview (About),Work and education,Places lived,Contact and basic info,Family and relationships,Details About Sarah,Life events
0,596897507,https://scontent-sjc3-1.xx.fbcdn.net/v/t39.308...,Sarah Murphy,444,Went to Ashland High School,"Lives in San Antonio, Texas","From San Antonio, Texas",Married to Matt Murphy,Joined October 2007,No workplaces to show\nWent to Ashland High Sc...,No workplaces to show\nWent to Ashland High Sc...,Work\nNo workplaces to show\nCollege\nNo schoo...,"Places lived\nSan Antonio, Texas\nCurrent city...",Contact info\nNo contact info to show\nWebsite...,Relationship\nMatt Murphy\nMarried to Matt Mur...,2017\nMarried Matt Murphy\n2011\nMatt Murphy a...


# Code Recycling

from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def fun_extended_wait(css_selector, driver):
    """
    Wait up to 10 seconds for an element to be present, then pause briefly.

    If the first wait raises StaleElementReferenceException the wait is run once
    more. After the element is found the function sleeps for a randomised
    1-2 seconds before returning.

    :param css_selector: CSS selector of the element to wait for.
    :param driver: The Selenium WebDriver.
    :return: The located element (earlier versions returned None, so callers
        that ignore the result are unaffected).
    :raises TimeoutException: propagated from the wait when the element does not
        appear in time.
    """
    locator = (By.CSS_SELECTOR, css_selector)
    try:
        # Wait for the element to be available
        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
    except StaleElementReferenceException:
        # Element went stale during the wait: look it up once more
        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
    # Randomised pause before handing the element back
    sleep(1 + randint(10,9999999)/10000000)
    return element

# Earlier version of the About walk-through: fixed waits instead of retries.
for num in range(3,8):
    print("---------------------------------")

    # Wait for the next About sub-section link, pause, then click it
    css_path = f".x16jcvb6 > div:nth-child({num}) > a:nth-child(1)"
    fun_extended_wait(css_path, fun_driver)
    sleep(3 + randint(10,9999999)/10000000)
    el_temp = fun_driver.find_element(by="css selector", value=css_path)
    el_temp.click()
    print(f"temp el: {el_temp.text} clicked")

    # Read the content panel and file it under the link's label
    css_path = ".xqmdsaz"
    fun_extended_wait(css_path, fun_driver)
    section_text = fun_driver.find_element(by="css selector", value=css_path).text
    dict_person[el_temp.text] = section_text
    print("element's text hath been saved!")

In [2]:
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline
#import warnings
#warnings.filterwarnings('ignore')

def open_browser(alt_user_name = 'Thank you for your website'):
    """
    Start a Chrome session that sends ``alt_user_name`` as its user-agent string.

    This replaces two same-named definitions; the first ignored
    ``alt_user_name`` and was silently shadowed by the second.

    :param alt_user_name: Value for the browser's user-agent header.
    :return: The Chrome WebDriver.
    """
    # Build Chrome options explicitly: the notebook-level `Options` name is the
    # Firefox options class and must not be handed to Chrome.
    opts = webdriver.ChromeOptions()
    opts.add_argument("user-agent=" + str(alt_user_name))
    path = '../Garage/chromedriver'         # Path to Chromedriver

    try:
        # Selenium 4 passes the driver location through a Service object.
        from selenium.webdriver.chrome.service import Service
    except ImportError:
        # Older Selenium: only the executable_path keyword is available.
        return webdriver.Chrome(executable_path = path, options=opts)
    return webdriver.Chrome(service=Service(path), options=opts)

# Intermediate version of the About walk-through: retrying click, direct read.
for num in range(3,8):
    print("---------------------------------")

    # Selector of the sub-section link for this pass
    css_selector_to_click = f".x16jcvb6 > div:nth-child({num}) > a:nth-child(1)"

    # Open the sub-section
    click_with_retry(fun_driver, css_selector_to_click)

    # Look the link up again after the click to avoid a stale reference
    el_temp = fun_driver.find_element(by="css selector", value=css_selector_to_click)
    print(f"temp el: {el_temp.text} clicked")

    # Read the content panel and file it under the link's label
    css_path = ".xqmdsaz"
    panel_text = fun_driver.find_element(by="css selector", value=css_path).text
    dict_person[el_temp.text] = panel_text
    print("element's text hath been saved!")

# Hand-unrolled first draft of the About scrape. Each step clicks one link and
# then reads the shared content panel (".xqmdsaz"), so the statements must run
# in exactly this order against the live browser.

# Open the About tab.
fun_driver.find_element(by="css selector",
                        value = "a.x1i10hfl:nth-child(3) > div:nth-child(1) > span:nth-child(1)").click()

# The panel text read right after the tab click is stored as the overview.
about_overview = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Overview (About)"] = about_overview
about_overview

##### #2 Work & Education

# Menu child 3.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(3) > a:nth-child(1)").click()

about_workanded = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Work & Education (About)"] = about_workanded
about_workanded

##### #3 Places Lived

# Menu child 4.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(4) > a:nth-child(1)").click()

about_placeslived = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Places Lived (About)"] = about_placeslived
about_placeslived
# Should be placed into a list

##### #4 Contact and basic info

# Menu child 5.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(5) > a:nth-child(1)").click()

about_contact = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Contact and basic info (About)"] = about_contact
about_contact

##### #5 Family and Relationships

# Menu child 6.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(6) > a:nth-child(1)").click()

about_family = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Family & Relationships (About)"] = about_family
about_family

##### #6 Details About

# Menu child 7.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(7) > a:nth-child(1)").click()

about_details = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Details About Profile (About)"] = about_details
about_details

##### #7 Life Events

# Menu child 8. Only the click happens here; the panel is read in a later cell.
fun_driver.find_element(by="css selector", value = ".x16jcvb6 > div:nth-child(8) > a:nth-child(1)").click()

In [None]:
# Scratch copy: read whatever the content panel currently shows and store it as
# the overview. The result depends on which About link was clicked last.
about_overview = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Overview (About)"] = about_overview
about_overview

In [None]:
        # Calling Dibs on this profile
        # Template fragment, indented for pasting inside the scraping loop: it
        # appends a placeholder row for Url_Key and saves the results CSV.
        # Replace '#' in the name with the scraper's own number.
        dict_person = {"Url_Key": Url_Key, "Name" : "Scraper # has dibs!"}
        df_scraped = pd.concat([df_scraped, 
            pd.DataFrame(dict_person, index=[0])], ignore_index=True)
        df_scraped.to_csv("../Webscrape_Data/scraped_data.csv", index = False)

# Scratch copy: read the content panel and store it as the Life Events text.
# NOTE(review): presumably meant to run right after the Life Events link is
# clicked - confirm.
about_lifeEvents = fun_driver.find_element(by="css selector", value = ".xqmdsaz").text
dict_person["Life Events (About)"] = about_lifeEvents
about_lifeEvents