## Imports

In [868]:
import warnings
warnings.filterwarnings('ignore')
import time
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime
import random
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

# FACEBOOK SCRAPER FUNCTION AND RAW DATA FORMAT

*Disclaimer: Facebook changes its page markup and anti-scraping measures frequently, so the selectors used by the following class may be outdated.*

In [864]:
class FacebookScraper:
    """Scrape post metadata and post text from Facebook with Selenium.

    The scraper logs in with the supplied credentials, reads the posts
    shown on whatever timeline page the driver is currently on, and can
    then visit each post link to collect its text, hashtags and
    non-alphanumeric symbols.

    The instance can be used as a context manager so that the browser is
    always shut down::

        with FacebookScraper(user, password) as scraper:
            scraper.login()
            ...
    """

    # CSS selectors tried (in order) to locate post containers on a page.
    POST_SELECTORS = ('article._55wo._56bf._5rgl', 'article.bn.bo.bp')
    # Column layout of the frame returned by ``scrape_facebook_page``.
    POST_COLUMNS = ['date', 'reaction_count', 'top_react',
                    'comment_count', 'post_link']
    # Column layout of the frame returned by
    # ``scrape_full_content_for_all_posts``.
    CONTENT_COLUMNS = ['post_link', 'full_content', 'emojis', 'hashtags']

    def __init__(self, username, password):
        """Start a Chrome driver and remember the login credentials."""
        self.driver = self.initialize_driver()
        self.username = username
        self.password = password

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when the ``with`` block ends; never suppress errors."""
        self.close()

    def initialize_driver(self):
        """Create a headless Chrome driver with notifications disabled."""
        chrome_options = Options()
        chrome_options.add_argument("--disable-notifications")
        chrome_options.add_argument("--headless")
        return webdriver.Chrome(options=chrome_options)

    def login(self):
        """Open facebook.com and submit the stored credentials."""
        self.driver.get('https://facebook.com')
        wait = WebDriverWait(self.driver, 10)
        email_field = wait.until(EC.presence_of_element_located((By.ID, "email")))
        email_field.send_keys(self.username)
        self.driver.find_element(By.ID, "pass").send_keys(self.password, Keys.RETURN)
        time.sleep(5)  # Wait for the login process to complete

    def parse_post_date(self, post):
        """Return the stripped text of the post's ``<abbr>`` element, or None."""
        try:
            timestamp_element = post.find_element(By.CSS_SELECTOR, 'abbr')
        except NoSuchElementException:
            return None
        return timestamp_element.text.strip()

    def extract_reaction_type(self, post):
        """Return the reaction types listed in the post's reaction link.

        Looks at the ``aria-label`` of every ``<a>`` in the post. For a label
        containing 'cảm xúc', the text after ", bao gồm " is taken; a label
        without that separator yields None. The last matching link wins.
        """
        expression_types = None
        for a_tag in post.find_elements(By.TAG_NAME, 'a'):
            aria_label = a_tag.get_attribute('aria-label')
            if aria_label and 'cảm xúc' in aria_label:
                expression_types = aria_label.split(", bao gồm ")[1] if ", bao gồm " in aria_label else None
        return expression_types

    @staticmethod
    def _parse_leading_count(text):
        """Parse the leading integer of ``text`` ('.' is a thousands separator).

        Returns None when the text does not start with a digit or the first
        whitespace-separated token is not a plain integer.
        """
        cleaned = text.strip().replace('.', '')
        if not cleaned or not cleaned[0].isdigit():
            return None
        try:
            return int(cleaned.split()[0])
        except ValueError:
            return None

    def extract_reaction_count(self, post):
        """Return the reaction count of a post, or 0 when none is found.

        For each ``<img>`` in the post, the nearest enclosing ``<a>`` is
        inspected and the first one whose text starts with an integer gives
        the count.
        """
        for img_tag in post.find_elements(By.TAG_NAME, 'img'):
            # ``find_elements`` returns an empty list for images that are not
            # inside a link, instead of walking past the document root.
            anchors = img_tag.find_elements(By.XPATH, './ancestor::a[1]')
            if not anchors:
                continue
            count = self._parse_leading_count(anchors[0].text)
            if count is not None:
                return count
        return 0

    def extract_comment_count(self, post):
        """Return the comment count of a post, or 0 when it cannot be read.

        Uses the first ``<a>`` whose text contains 'bình luận' and parses its
        first token as an integer ('.' is removed as a thousands separator).
        """
        for a_tag in post.find_elements(By.TAG_NAME, 'a'):
            text = a_tag.text.strip()
            if 'bình luận' in text:
                try:
                    return int(text.split()[0].replace('.', ''))
                except ValueError:
                    return 0
        return 0

    def extract_post_link(self, post):
        """Return the first link in the post containing '/story.php?', or None."""
        for a_tag in post.find_elements(By.TAG_NAME, 'a'):
            href = a_tag.get_attribute('href')
            if href and '/story.php?' in href:
                return href
        return None

    def click_see_more_link(self):
        """Click the 'Xem tin khác' link to load the next batch of posts.

        Returns:
            True when the link was clicked, False when it was not found or
            could not be clicked within 10 seconds.
        """
        try:
            see_more_link = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[span[text()='Xem tin khác']]"))
            )
            see_more_link.click()
            time.sleep(2)
        except Exception as e:
            # Best effort: a missing link simply means there is nothing more
            # to load, so report it and let the caller decide what to do.
            print(f"No 'Xem tin khác' link found: {e}")
            return False
        return True

    def scrape_facebook_page(self, page_limit=10):
        """Collect post metadata from the page the driver is currently on.

        Args:
            page_limit: Maximum number of pages to read; after each page the
                'Xem tin khác' link is followed.

        Returns:
            A DataFrame with the columns in ``POST_COLUMNS``, one row per
            post element found. The columns are present even when no post
            was found.
        """
        data = []

        for _ in range(page_limit):
            for css_selector in self.POST_SELECTORS:
                for article in self.driver.find_elements(By.CSS_SELECTOR, css_selector):
                    data.append({
                        'date': self.parse_post_date(article),
                        'reaction_count': self.extract_reaction_count(article),
                        'top_react': self.extract_reaction_type(article),
                        'comment_count': self.extract_comment_count(article),
                        'post_link': self.extract_post_link(article),
                    })

            # Without a next page the driver stays where it is; stop instead
            # of scraping the same posts again on every remaining iteration.
            if not self.click_see_more_link():
                break

        return pd.DataFrame(data, columns=self.POST_COLUMNS)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def extract_full_post_content(self, post_link):
        """Open a post and return its text, symbols and hashtags.

        Returns:
            A tuple ``(full_content, emojis, hashtags)`` where
            ``full_content`` joins the text of every ``<p>`` on the page,
            ``emojis`` is every character that is neither alphanumeric, a
            space nor '#', and ``hashtags`` lists the words starting with
            '#'. On any error the tuple is ``(None, None, None)``.
        """
        try:
            self.driver.get(post_link)
            time.sleep(random.uniform(2, 5))

            paragraphs = self.driver.find_elements(By.TAG_NAME, 'p')
            full_content = ' '.join([p.text.strip() for p in paragraphs])

            hashtags = [word for word in full_content.split() if word.startswith('#')]
            emojis = ''.join(c for c in full_content if not c.isalnum() and c not in [' ', '#'])

            return full_content, emojis, hashtags
        except Exception as e:
            print(f"Error extracting full post content from {post_link}: {e}")
            return None, None, None

    def scrape_full_content_for_all_posts(self, df):
        """Fetch the full content of every post listed in ``df``.

        Args:
            df: DataFrame with a 'post_link' column.

        Returns:
            A DataFrame with the columns in ``CONTENT_COLUMNS`` and one row
            per row of ``df``. Rows without a usable link (None, NaN or an
            empty string) get None in the content columns.
        """
        fetched = {}  # post_link -> (full_content, emojis, hashtags)
        full_content_data = []
        for _, row in df.iterrows():
            post_link = row['post_link']
            if isinstance(post_link, str) and post_link:
                if post_link not in fetched:
                    fetched[post_link] = self.extract_full_post_content(post_link)
                    # Random pause between requests.
                    time.sleep(random.uniform(3, 7))
                full_content, emojis, hashtags = fetched[post_link]
            else:
                full_content = emojis = hashtags = None
            full_content_data.append({
                'post_link': post_link,
                'full_content': full_content,
                'emojis': emojis,
                'hashtags': hashtags
            })
        return pd.DataFrame(full_content_data, columns=self.CONTENT_COLUMNS)

The `FacebookScraper` class is designed to automate the process of scraping Facebook posts using Selenium WebDriver. It starts by initializing the WebDriver with Chrome options such as disabling notifications and running in headless mode, allowing for smooth and invisible scraping without pop-ups. The class stores user credentials for logging into Facebook and includes a `login` method that navigates to Facebook's login page, enters the user's credentials, and waits for the login process to complete.

Once logged in, the class uses various methods to extract information from Facebook posts. These include `parse_post_date` to retrieve the date of a post, `extract_reaction_type` to identify the types of reactions (e.g., "Thích", "Yêu thích"), and `extract_reaction_count` to calculate the number of reactions by traversing the DOM tree. Similarly, the `extract_comment_count` method extracts the number of comments, while `extract_post_link` retrieves the post's URL. The class also includes pagination handling through the `click_see_more_link` method, which clicks on the "Xem tin khác" ("See more posts") link to load additional content, with delays to ensure smooth operation.

The core method, `scrape_facebook_page`, orchestrates the entire scraping process by iterating through multiple pages of posts, extracting relevant details like reaction counts and post links, and compiling them into a DataFrame. The class also provides a `close` method to properly terminate the WebDriver session when scraping is complete. Additionally, for extracting full post content, including emojis and hashtags, the `extract_full_post_content` method navigates to each post's URL, retrieves paragraph text, identifies emojis and hashtags, and returns these along with the full content. 

Lastly, the `scrape_full_content_for_all_posts` method processes a DataFrame of post links, extracting detailed content for each post and handling delays between requests to avoid being blocked by Facebook. The class provides a comprehensive solution for scraping large amounts of Facebook post data efficiently, handling pagination, extracting detailed content, and ensuring a reliable scraping process with proper resource management.

However, the `emojis` feature is extracted as every character that is neither alphanumeric, a space, nor the hashtag sign (#), so it also contains punctuation marks and other non-emoji characters. As for the posting date, posts published in 2024 do not include the year in their displayed date.

Thus, the authors create functions for handling two specific challenges: cleaning emoji data and parsing inconsistent date formats.

In [865]:
# Matches a single emoji code point; compiled once because clean_emojis is
# applied to every row of the scraped data.
_EMOJI_PATTERN = re.compile(
    r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\u2728]'
)


def clean_emojis(emojis):
    """Keep only the emoji characters of ``emojis``.

    Args:
        emojis: Raw symbol string taken from a post. Anything that is not a
            non-empty string (None, NaN, ...) is treated as missing.

    Returns:
        The concatenated emoji characters in their original order, or None
        when the input is missing or contains no emoji.
    """
    # A NaN float is truthy, so test the type instead of the truth value.
    if not isinstance(emojis, str) or not emojis:
        return None
    cleaned_emojis = ''.join(_EMOJI_PATTERN.findall(emojis))
    return cleaned_emojis or None
# Function to parse dates and handle missing years
def parse_date(date_str, default_year=None):
    """Parse a Vietnamese Facebook post date into a ``pd.Timestamp``.

    Two textual layouts are understood:

    * ``"12 tháng 3 lúc 14:30"`` - no year; ``default_year`` is used.
    * ``"12 tháng 3, 2023 lúc 14:30"`` - explicit year.

    Args:
        date_str: The date text, or an already parsed ``datetime`` /
            ``pd.Timestamp``.
        default_year: Year assumed when the text carries none. Defaults to
            the current year at call time.

    Returns:
        A ``pd.Timestamp``, or ``pd.NaT`` when the value is missing, empty
        or in an unrecognised layout.
    """
    if isinstance(date_str, datetime):
        # Already parsed (covers pd.Timestamp, a datetime subclass).
        return pd.Timestamp(date_str)
    if not isinstance(date_str, str):
        return pd.NaT

    date_str = date_str.strip()
    if not date_str:
        return pd.NaT

    year = datetime.now().year if default_year is None else default_year
    candidates = (
        # Year omitted: append the assumed year before parsing.
        (f"{date_str} {year}", "%d tháng %m lúc %H:%M %Y"),
        # Year included in the text.
        (date_str, "%d tháng %m, %Y lúc %H:%M"),
    )
    for text, fmt in candidates:
        try:
            return pd.Timestamp(datetime.strptime(text, fmt))
        except ValueError:
            continue
    return pd.NaT

# Normalise the scraped columns of a post DataFrame
def format_post_data(df):
    """Clean the 'emojis' column and parse the 'date' column of ``df``.

    The frame is modified in place and also returned.
    """
    column_transforms = (
        ('emojis', clean_emojis),  # strip non-emoji symbols
        ('date', parse_date),      # text -> timestamp
    )
    for column, transform in column_transforms:
        df[column] = df[column].apply(transform)
    return df

In [None]:
if __name__ == "__main__":
    # Template: replace the credentials and every `your-page` placeholder
    # with the page you want to scrape.
    username = "your-facebook-email"
    password = "your-password"

    scraper = FacebookScraper(username, password)
    try:
        scraper.login()
        scraper.driver.get('https://mbasic.facebook.com/your-page?v=timeline')

        # Scrape initial post data
        your_page = scraper.scrape_facebook_page()

        # Scrape full content for each post
        full_content_df = scraper.scrape_full_content_for_all_posts(your_page)
    finally:
        # Always shut the browser down, even when scraping fails.
        scraper.close()

    # Keep one content row per link so the left join cannot multiply posts.
    full_content_df = full_content_df.drop_duplicates(subset='post_link')
    your_page = pd.merge(your_page, full_content_df, on='post_link', how='left')
    your_page.drop(columns='post_link', inplace=True)

    your_page_crawl_data = format_post_data(your_page)
    your_page_crawl_data.to_csv('your-page_crawl_data.csv', index=False)

This template cell automates the process of logging into Facebook and scraping posts from a target Facebook page (`your-page` is a placeholder; the sections below apply the same steps to Heineken Vietnam and other brands) using the `FacebookScraper` class. It begins by checking if the script is being run as the main program (`if __name__ == "__main__":`). If so, it starts by assigning the username and password for a Facebook account to the variables `username` and `password`.

A `FacebookScraper` instance is then created using the provided credentials, and the `login()` method is called to log into Facebook. After logging in, the WebDriver is directed to the basic mobile (mbasic) version of the target Facebook page (`https://mbasic.facebook.com/your-page?v=timeline`), which is a lightweight version of Facebook suitable for scraping.

The next step involves scraping initial post data from the page using the `scrape_facebook_page()` method, which gathers details like post date, reaction count, comment count, and top reactions. The resulting data is stored in a DataFrame named `your-page`.

After collecting the basic post data, the script proceeds to scrape the full content of each post (including emojis and hashtags) using the `scrape_full_content_for_all_posts()` method. The scraped full content is stored in another DataFrame (`full_content_df`). This DataFrame is then merged with the initial `your-page` DataFrame based on the post links (`post_link`), using a left join to combine both datasets. Once merged, the `post_link` column is dropped as it is no longer needed.

The script then closes the WebDriver session with the `scraper.close()` method to free up resources. Finally, the `format_post_data(your-page)` function is called to format the scraped data, and the final DataFrame (`your-page_crawl_data`) is saved to a CSV file named `your-page_crawl_data.csv`. The CSV file contains all the scraped data, including post details and full content, ready for further analysis.

## Heineken

In [None]:
if __name__ == "__main__":
    username = "your-facebook-email"
    password = "your-password"

    scraper = FacebookScraper(username, password)
    try:
        scraper.login()
        scraper.driver.get('https://mbasic.facebook.com/HEINEKENVietnam?v=timeline')

        # Scrape initial post data
        heineken = scraper.scrape_facebook_page()

        # Scrape full content for each post
        full_content_df = scraper.scrape_full_content_for_all_posts(heineken)
    finally:
        # Always shut the browser down, even when scraping fails.
        scraper.close()

    # Keep one content row per link so the left join cannot multiply posts.
    full_content_df = full_content_df.drop_duplicates(subset='post_link')
    heineken = pd.merge(heineken, full_content_df, on='post_link', how='left')
    heineken.drop(columns='post_link', inplace=True)

    heineken_crawl_data = format_post_data(heineken)
    heineken_crawl_data.to_csv('heineken_crawl_data.csv', index=False)

## 333 Beer

In [None]:
if __name__ == "__main__":
    username = "your-facebook-email"
    password = "your-password"

    scraper = FacebookScraper(username, password)
    try:
        scraper.login()
        scraper.driver.get('https://mbasic.facebook.com/bia333.sabeco?v=timeline')

        # Scrape initial post data
        beer333 = scraper.scrape_facebook_page()

        # Scrape full content for each post
        full_content_df = scraper.scrape_full_content_for_all_posts(beer333)
    finally:
        # Always shut the browser down, even when scraping fails.
        scraper.close()

    # Keep one content row per link so the left join cannot multiply posts.
    full_content_df = full_content_df.drop_duplicates(subset='post_link')
    beer333 = pd.merge(beer333, full_content_df, on='post_link', how='left')
    beer333.drop(columns='post_link', inplace=True)

    beer333_crawl_data = format_post_data(beer333)
    beer333_crawl_data.to_csv('beer333_crawl_data.csv', index=False)

## Bia Saigon

In [None]:
if __name__ == "__main__":
    username = "your-facebook-email"
    password = "your-password"

    scraper = FacebookScraper(username, password)
    try:
        scraper.login()
        # The timeline URL belongs here: scrape_facebook_page() reads the page
        # the driver is on, and its only argument is the page limit.
        scraper.driver.get('https://mbasic.facebook.com/BiaSaigon.official.page?v=timeline')

        # Scrape initial post data
        biasg = scraper.scrape_facebook_page()

        # Scrape full content for each post
        full_content_df = scraper.scrape_full_content_for_all_posts(biasg)
    finally:
        # Always shut the browser down, even when scraping fails.
        scraper.close()

    # Keep one content row per link so the left join cannot multiply posts.
    full_content_df = full_content_df.drop_duplicates(subset='post_link')
    biasg = pd.merge(biasg, full_content_df, on='post_link', how='left')
    biasg.drop(columns='post_link', inplace=True)

    biasg_crawl_data = format_post_data(biasg)
    biasg_crawl_data.to_csv('biasg_crawl_data.csv', index=False)

*Fill in the required information (Facebook credentials and the target page URL) before running the cells above.*