In [1]:
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time

# 1 Simulate the Interface of SpeakEV with Selenium
### 1-1 Set custom headers to pose as a regular visitor

In [11]:
#### Request headers sent with every `requests` call so the scraper looks like a browser
# the user agent is the one of a desktop Chrome browser on Windows
headers = dict([
    ("accept", "*/*"),
    ("User-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"),
])

### 1-2 Simulate the interface and enter the main page

In [14]:
#### Launch a Chrome session and open SpeakEV's main page
# the browser presents the same desktop-Chrome user agent as the request headers
opts = Options()
opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=opts)
driver.get("https://www.speakev.com/")
time.sleep(3)

# Click the cookies button only when it is actually shown.
# find_elements returns an empty list when nothing matches, whereas find_element
# would raise NoSuchElementException and abort the cell if the banner is absent
# (e.g. consent already given, or the generated class name changed).
cookie_buttons = driver.find_elements(By.XPATH,'//*[@class="sc-ifAKCX ljEJIv"]')
if cookie_buttons:
    cookie_button = cookie_buttons[0]
    cookie_button.click()
    time.sleep(3)
else:
    print("cookie button not found, continuing without clicking it")

# 2 Record URLs Linked to Posts
### Two Approaches: 
### 1st - via the specific forum navigator 
### 2nd - via the insite keyword searching

## 2-1 Approach 1: Find the forums under the targeted topic through the navigator
### Jump to the navigator first and record the urls of the related forums that include the keyword (brand name)

In [15]:
#### Locate the navigator link and follow it to the result page
navigator_xpath = '//a[@class="new-navigation vl-middle wayfinding-2  button"]'
search_button = driver.find_element(By.XPATH, navigator_xpath)
search_button.click()
# give the page time to load before the next cell queries it
time.sleep(3)

In [16]:
#### Record all topic links to the forums
# Nested lookups use find_element(By.CSS_SELECTOR, ...): the find_element_by_*
# helpers were removed in Selenium 4.3, and the rest of this notebook already
# uses the By-based API.
l_topic = driver.find_elements(By.XPATH,'//h3[@class="node-title"]')

# find the links that include keywords
l_topic_link = [i for i in l_topic if "Fiat" in i.find_element(By.CSS_SELECTOR, "a").text]
for i in l_topic_link:
    print(i.find_element(By.CSS_SELECTOR, "a").text)

# Read the topic urls now, while the elements are still attached to the page.
# The loop below navigates away, after which these WebElements become stale and
# can no longer be queried (this broke the run whenever more than one topic matched).
# The topics are sometimes intercepted when clicked, so they are opened via their urls.
topic_urls = [
    topic.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
    for topic in l_topic_link
    if topic.is_enabled()
]
time.sleep(3)

# build up a list of urls of each forum under the topics
forums_urls = []

#### move to the topic page that contains forums with the keywords
for url_topic in topic_urls:
    driver.get(url_topic)
    time.sleep(3)

    #### Store the urls of each forum under the topic, page by page
    while True:
        # identify and store urls into the list
        forums = driver.find_elements(By.XPATH,'//*[@class="node-title"]')
        print(str(len(forums)) + " forums found")
        for forum in forums:
            forum_url = forum.find_element(By.CSS_SELECTOR, "a")
            forums_urls.append(forum_url.get_attribute('href'))

        # stop when there is no usable "next page" link
        next_arrows = driver.find_elements(By.XPATH,'//a[@class="pageNav-jump pageNav-jump--next california-page-nav-jump-next"]')
        if not next_arrows or not next_arrows[0].is_enabled():
            break
        # follow the link's url instead of clicking it, as the other pagination
        # loops in this notebook do, because clicks can be intercepted
        url_next_page = next_arrows[0].get_attribute('href')
        if not url_next_page:
            break
        driver.get(url_next_page)
        time.sleep(3)
    print(forums_urls[0:2])

Fiat EV Forum
2 forums found
['https://www.speakev.com/forums/general-fiat-discussion.360/', 'https://www.speakev.com/forums/fiat-500.361/']


## 2-2 Approach 2: Use the keyword to find the posts that have relevant titles
### Jump to the search page and specify the search settings, then jump to the result page where posts have titles that include the keyword (brand name)

In [12]:
#### Use the search page to navigate to the posts relevant to the keyword
# keyword (brand name) that post titles must contain
search_keyword = "Fiat"

# go to the search page
search_url = "https://www.speakev.com/search/?type=post"
driver.get(search_url)
time.sleep(3)

# input the keyword into the search box
search_box = driver.find_element(By.XPATH,'//*[@class="input search-form__field"]')
search_box.send_keys(search_keyword)

# select the "title only" checkbox to target posts whose title contains the keyword
# (clicked through JavaScript, like the other controls on this form)
titleonly_box = driver.find_element(By.XPATH,'//*[@qid="title-only-checkbox"]')
if not titleonly_box.is_selected():
    driver.execute_script("arguments[0].click();", titleonly_box)

# unselect the "subforum" checkbox to exclude sub forums
subforum_box = driver.find_element(By.XPATH,'//*[@qid="include-subforums-check-box"]')
if subforum_box.is_selected():
    driver.execute_script("arguments[0].click();", subforum_box)

# click the "search" button to jump to the result page
search_button = driver.find_element(By.XPATH,'//*[@qid="search-button"]')
driver.execute_script("arguments[0].click();", search_button)

# The scripted click returns immediately; wait for the result page to load so
# the cell that collects the search results does not read the old page.
time.sleep(3)

# 3 Find posts within each forum and record the urls
### This phase is logically connected to phase 2, and the two approaches aren't expected to be executed together. Each approach is expected to be executed independently, within a single simulated browser session.

### For Tesla, there are too many forums and comments, so only the most popular general forum was scraped

In [20]:
# accumulator for the url of every post found by either approach below
posting_urls = list()

### 3-1 Approach 1

In [21]:
#### Store the urls of each post within each forum (When following Approach 1)
# Manually set the forum for Tesla
# forums_urls = [""]

for forum in forums_urls:
    # enter each forum:
    driver.get(forum)
    time.sleep(5)

    # walk through the forum's pages, collecting the thread links on each one
    while True:
        postings = driver.find_elements(By.XPATH,'//*[@class="structItem-title"]')
        print(str(len(postings)) + " postings found")
        for posting in postings:
            # find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
            # which was removed in Selenium 4.3
            posting_url = posting.find_element(By.CSS_SELECTOR, "a")
            posting_urls.append(posting_url.get_attribute('href'))

        # find_elements returns an empty list when there is no "next page" link,
        # so no exception handling is needed to detect the last page
        next_arrows = driver.find_elements(By.XPATH,'//a[@class="pageNavSimple-el pageNavSimple-el--next"]')
        if not next_arrows or not next_arrows[0].is_enabled():
            break
        # button is not interactable here, jump to the next page via the url directly
        url_next_page = next_arrows[0].get_attribute('href')
        if not url_next_page:
            # a link without href cannot be followed; treat it as the last page
            break
        driver.get(url_next_page)
        time.sleep(3)

print(len(posting_urls), posting_urls[0:5])

5 postings found
35 postings found
35 postings found
6 postings found
81 ['https://www.speakev.com/threads/fiat-doesnt-like-transparency.176618/', 'https://www.speakev.com/threads/fiat-600e-500x-replacement-to-be-announced-in-a-few-weeks.177628/', 'https://www.speakev.com/threads/new-fiat-panda-coming-most-affordable-ev.165895/', 'https://www.speakev.com/threads/fiat-all-electrified.169789/', 'https://www.speakev.com/threads/should-this-forum-move-to-be-part-of-a-stellantis-group-section.164981/']


### 3-2 Approach 2

In [20]:
#### Store the urls of each post in the result page (When following Approach 2)

# walk through the search result pages, collecting the result links on each one
while True:
    postings = driver.find_elements(By.XPATH,'//*[@qid="search-results-title"]')
    print(str(len(postings)) + " postings found")
    for posting in postings:
        posting_urls.append(posting.get_attribute('href'))

    # find_elements returns an empty list when there is no "next page" link,
    # so no exception handling is needed to detect the last page
    next_arrows = driver.find_elements(By.XPATH,'//a[@class="pageNavSimple-el pageNavSimple-el--next"]')
    if not next_arrows or not next_arrows[0].is_enabled():
        break
    # button is not interactable here, jump to the next page via the url directly
    url_next_page = next_arrows[0].get_attribute('href')
    if not url_next_page:
        # a link without href cannot be followed; treat it as the last page
        break
    driver.get(url_next_page)
    time.sleep(3)

print(len(posting_urls), posting_urls[0:5])

20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
20 postings found
2 postings found
202 ['https://www.speakev.com/threads/peugeot-satnav-questions.177812/post-3441711', 'https://www.speakev.com/threads/peugeot-e208-wait-times.166986/post-3186684', 'https://www.speakev.com/threads/peugeot-e-208-charging-issue.165019/post-3143967', 'https://www.speakev.com/threads/peugeot-e-208-and-wallbox-wallbox-com.169166/post-3236693', 'https://www.speakev.com/threads/peugeot-e-traveller-electric-van.151120/post-2836498']


## 4 Store urls of posts and comments separately
### Urls of posts and comments are stored separately, and each comment is assigned a feature value called 'post_id' that identifies its parent post.
### The reason is that, for posts with more than one page of comments, Beautiful Soup can't scrape all the data: the post url only links to the 1st page of the post, and Beautiful Soup can't jump to the next page automatically. As a result, every page of a post (aka 'comment url') is visited and recorded.

In [22]:
#### The urls of posts are already stored in 'posting_urls'
# The urls of every page of every post ("comment urls") are collected next
comment_urls = []
for post in posting_urls:
    # enter each post:
    driver.get(post)
    time.sleep(3)

    # the post url itself is the 1st page of the post
    comment_urls.append(post)

    # follow the "next page" link until the last page of the post is reached
    while True:
        # find_elements returns an empty list when there is no "next page" link,
        # so no exception handling is needed to detect the last page
        next_arrows = driver.find_elements(By.XPATH,'//a[@class="pageNav-jump pageNav-jump--next california-page-nav-jump-next"]')
        if not next_arrows or not next_arrows[0].is_enabled():
            break
        # button is not interactable here, jump to the next page via the url directly
        url_next_page = next_arrows[0].get_attribute('href')
        if not url_next_page:
            # a link without href cannot be followed or recorded; stop here
            break
        comment_urls.append(url_next_page)
        driver.get(url_next_page)
        time.sleep(3)

print(len(comment_urls), comment_urls[0:5])

92 ['https://www.speakev.com/threads/fiat-doesnt-like-transparency.176618/', 'https://www.speakev.com/threads/fiat-600e-500x-replacement-to-be-announced-in-a-few-weeks.177628/', 'https://www.speakev.com/threads/new-fiat-panda-coming-most-affordable-ev.165895/', 'https://www.speakev.com/threads/new-fiat-panda-coming-most-affordable-ev.165895/page-2', 'https://www.speakev.com/threads/new-fiat-panda-coming-most-affordable-ev.165895/page-3']


In [23]:
#### Persist the collected urls as one-column csv files
# (comma separator and header row are the to_csv defaults; the index is left out)

# post urls -> Fiat_post_url.csv
df_post_url = pd.DataFrame(data={"post_url": posting_urls})
df_post_url.to_csv("Fiat_post_url.csv", index=False)

# comment urls (one per page of each post) -> Fiat_comment_url.csv
df_comment_url = pd.DataFrame(data={"comment_url": comment_urls})
df_comment_url.to_csv("Fiat_comment_url.csv", index=False)

## 5 Scrape each post's information and information of comments under each post
### (Store posts information and comments separately)

### 5.1 Create lists to store posts & comments information

In [28]:
#### One accumulator list per post attribute (filled by the post scraping loop)
l_post_title, l_post_author, l_post_date = [], [], []
l_post_content, l_comment_num = [], []
l_post_like, l_view = [], []

In [37]:
#### One accumulator list per comment attribute (filled by the comment scraping loop)
l_comment_id, l_comment_post_id = [], []
l_comment_author, l_comment_date = [], []
l_comment_content, l_comment_like = [], []
l_reply_id = []

### 5.2 Load urls for posts and comments

In [26]:
#### Read back the url lists that were saved as .csv files
post_url, comment_url = (
    pd.read_csv(csv_path)
    for csv_path in ("Fiat_post_url.csv", "Fiat_comment_url.csv")
)
# number of post urls and comment urls, followed by the post url table
print(len(post_url), len(comment_url))
print(post_url)

81 92
                                             post_url
0   https://www.speakev.com/threads/fiat-doesnt-li...
1   https://www.speakev.com/threads/fiat-600e-500x...
2   https://www.speakev.com/threads/new-fiat-panda...
3   https://www.speakev.com/threads/fiat-all-elect...
4   https://www.speakev.com/threads/should-this-fo...
..                                                ...
76  https://www.speakev.com/threads/fiat-500e-conv...
77  https://www.speakev.com/threads/500e-test-driv...
78  https://www.speakev.com/threads/thinking-about...
79  https://www.speakev.com/threads/fiat-500e-2020...
80  https://www.speakev.com/threads/2021-fiat-500e...

[81 rows x 1 columns]


### 5.3 Scrape posts' information

In [30]:
#### Scrape post information via each post url
# Assigning requests.adapters.DEFAULT_RETRIES after import does not make
# requests.get retry, so retries are configured explicitly on a session:
# connection-level failures are retried up to 5 times by the mounted adapter.
session = requests.Session()
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
session.mount("https://", retry_adapter)
session.mount("http://", retry_adapter)

# Use for loop to get lists of required information
for i in post_url["post_url"]:
    try:
        # the timeout keeps one unresponsive page from hanging the whole run
        page = session.get(i, headers=headers, timeout=30)
    except requests.RequestException as err:
        # report the url that could not be fetched and move on to the next one
        print("skipped " + str(i) + ": " + str(err))
        continue
    time.sleep(3)

    soup = BeautifulSoup(page.content, "html.parser")

    # get the post information block
    thread = soup.find("div", class_ = "MessageCard__container js-originalPostContainer")

    # get the title; pages without the post block or without a title are skipped
    try:
        title = thread.find("h1", class_ = "MessageCard__thread-title").text
    except AttributeError:
        continue

    # All fields are gathered first and appended together at the end, so the
    # lists always keep the same length even when a field is missing.

    # get the author id ("None" is the placeholder the comment scraper also uses)
    try:
        author = thread.find("div", class_ = "avatar-badge-wrapper").find("a").get("data-user-id")
    except AttributeError:
        author = "None"

    # get the post date
    try:
        date = thread.find("a", class_ = "MessageCard__date-created").find("time").get("data-date-string")
    except AttributeError:
        date = "None"

    # get the post content
    try:
        content = thread.find("div", class_ = "bbWrapper").text
    except AttributeError:
        content = "None"

    # get the number of comments under each post ("0" when the counter is absent)
    # AttributeError: the icon is missing; TypeError: the icon has no sliceable sibling text
    try:
        comment_num = thread.find("i", class_ = "fa fa-comment").next_sibling[1:-9]
    except (AttributeError, TypeError):
        comment_num = "0"

    # get the post likes ("0" when the post doesn't have any likes)
    try:
        like = thread.find("a", class_ = "reactionsBar-link").text
    except AttributeError:
        like = "0"

    # get the post views ("0" when the counter is absent)
    try:
        view = thread.find("i", class_ = "fa fa-eye").next_sibling[1:-7]
    except (AttributeError, TypeError):
        view = "0"

    # record the post: exactly one entry per list
    l_post_title.append(title)
    l_post_author.append(author)
    l_post_date.append(date)
    l_post_content.append(content)
    l_comment_num.append(comment_num)
    l_post_like.append(like)
    l_view.append(view)
    time.sleep(5)


### 5.4 Store the post information to a dataframe and csv file

In [31]:
#### Store post information
# Store data into the dataframe
df_post = pd.DataFrame({"Post_Title":l_post_title, "Author":l_post_author, "Date":l_post_date,
                        "Post_Content":l_post_content, "Comment_Number":l_comment_num,
                        "Net_Likes":l_post_like, "Views":l_view})
# Store data into the csv file
df_post.to_csv("Fiat_post_info.csv", sep=",", index=True, header=True)

### 5.5 Scrape comments' information

In [32]:
#### Read the stored Fiat post information back and preview its first rows
post_info_path = "Fiat_post_info.csv"
df_post = pd.read_csv(post_info_path)
df_post.head(n=5)

Unnamed: 0.1,Unnamed: 0,Post_Title,Author,Date,Post_Content,Comment_Number,Net_Likes,Views
0,0,Fiat doesn't like transparency,112918,"Apr 4, 2023",I haven't seen my 500e yet although it stands ...,17.0,1,888
1,1,Fiat 600e (500X replacement) to be announced i...,118330,"May 21, 2023","The first spy shots are in, launch expected on...",5.0,0,603
2,2,FIAT 500e Steering Not as Light as Expected,74132,"Feb 11, 2023",Had a new 500e for two weeks now and was expec...,9.0,0,944
3,3,4 months so far for replacement body panels an...,79216,"Apr 16, 2023",Our Fiat 500e (red) sustained front body damag...,10.0,0,691
4,4,Boot space,101780,"May 19, 2023","Odd question, but this feels like the best pla...",5.0,0,341


In [38]:
#### Scrape comment information under each post via the comment url.
#### Assign the post id to each comment
# Assigning requests.adapters.DEFAULT_RETRIES after import does not make
# requests.get retry, so retries are configured explicitly on a session:
# connection-level failures are retried up to 5 times by the mounted adapter.
session = requests.Session()
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
session.mount("https://", retry_adapter)
session.mount("http://", retry_adapter)

# Use for loop to get lists of required information
for i in comment_url["comment_url"]:
    try:
        # the timeout keeps one unresponsive page from hanging the whole run
        page = session.get(i, headers=headers, timeout=30)
    except requests.RequestException as err:
        # report the url that could not be fetched and move on to the next one
        print("skipped " + str(i) + ": " + str(err))
        continue
    time.sleep(3)

    soup = BeautifulSoup(page.content, "html.parser")

    # get the post information block
    thread = soup.find("div", class_ = "MessageCard__container js-originalPostContainer")

    # get all the comment blocks within each page
    comment = soup.select("[class*='MessageCard js-messageCard']")

    #### Get the post's unique id and the real id that is stored in the csv file
    # (each post is also a comment on this website, so the comment id is taken as the unique id of each post)
    # Pages without the post block, its content card or its title are skipped.
    try:
        post_id = thread.select("[class*='MessageCard__content js-messageCard-content']")[0].get("data-post-id")
        title = thread.find("h1", class_ = "MessageCard__thread-title").text
    except (AttributeError, IndexError):
        continue

    # look up the post's row id (index column of the post csv) by its title;
    # -1 marks comments whose post is not in the post table
    try:
        real_id = df_post.loc[df_post["Post_Title"]==title, "Unnamed: 0"].values[0]
    except IndexError:
        real_id = -1

    #### Use an embedded loop to scrape all comments' information within each page
    # the first block is skipped, as in the original loop; it appears to be the
    # post itself (see the note above that each post is also a comment)
    for j in comment[1:]:
        # All fields are gathered first and appended together at the end, so the
        # lists always keep the same length even when a field is missing.

        # get the unique id of each comment (the first 5 characters of the html id are dropped)
        raw_id = j.get("id")
        comment_id = raw_id[5:] if raw_id else "None"

        # get the author id
        try:
            author = j.find("div", class_ = "MessageCard__avatar").find("a").get("data-user-id")
        except AttributeError:
            author = "None"

        # get the comment date
        try:
            date = j.find("a", class_ = "MessageCard__date-created").find("time").get("data-date-string")
        except AttributeError:
            date = "None"

        # get the comment content; a placeholder is stored instead of skipping the
        # comment, because skipping here left the other lists one entry longer
        try:
            content = j.find("div", class_ = "bbWrapper").text
        except AttributeError:
            content = "None"

        # get the comment likes ("0" when the comment doesn't have any likes)
        try:
            like = j.find("a", class_ = "reactionsBar-link").text
        except AttributeError:
            like = "0"

        # get the replied comment's unique comment id if the comment is a reply to other comments
        # AttributeError: no quote / no link in the quote; TypeError: the link has no selector attribute
        try:
            reply_id = j.find("blockquote").find("a").get("data-content-selector")[6:]
        except (AttributeError, TypeError):
            reply_id = "None"
        # a reply that points at the post itself is recorded as "None"
        # (on this website a comment can either reply to the post or be written under it without replying)
        if reply_id == post_id:
            reply_id = "None"

        # record the comment: exactly one entry per list
        l_comment_id.append(comment_id)
        l_comment_post_id.append(real_id)
        l_comment_author.append(author)
        l_comment_date.append(date)
        l_comment_content.append(content)
        l_comment_like.append(like)
        l_reply_id.append(reply_id)
    time.sleep(5)

### 5.6 Store the comment information to a dataframe and csv file

In [40]:
#### Store comment information
# Assemble the scraped lists column by column
comment_columns = {
    "Comment_id": l_comment_id,
    "Post_id": l_comment_post_id,
    "Author": l_comment_author,
    "Date": l_comment_date,
    "Comment_Content": l_comment_content,
    "Net_Likes": l_comment_like,
    "Reply_id": l_reply_id,
}
df_comment = pd.DataFrame(comment_columns)
# Write the csv file (comma separated, with header row and row index)
df_comment.to_csv("Fiat_comment_info.csv", index=True)