**You need to install some packages before running the code below**
If a package is missing, run `!pip3 install <package_name>`

In [None]:
# For example:
# !pip3 install selenium

**First we try BeautifulSoup to retrieve HTML content of website**

We found no comments in the returned HTML, because this website uses JavaScript to render them after the page has loaded.
Thus, we need to use Selenium to actually load the website through a browser.

In [3]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage you want to scrape
url = "https://www.viz.com/shonenjump/one-piece-chapter-1133/chapter/44787?action=read"  # Replace with your target URL

# Seconds to wait for the server. Without a timeout, requests.get() can block
# forever on an unresponsive host and hang the kernel.
REQUEST_TIMEOUT_SECONDS = 10

# Send an HTTP GET request to the URL
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page (static HTML only, no JavaScript is run)
    soup = BeautifulSoup(response.content, 'html.parser')
    print(soup)

    # Find all div tags with class "o_comment-content"
    comments = soup.find_all('div', class_='o_comment-content')

    # Make the "nothing found" outcome explicit instead of printing nothing
    if not comments:
        print("No 'o_comment-content' divs found in the static HTML.")

    # Extract and print the content of each div
    for i, comment in enumerate(comments, 1):
        print(f"Comment {i}:")
        print(comment.get_text(strip=True))
        print("-" * 50)
else:
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")


<!DOCTYPE html>

<!--[if IE 9 ]>    <html dir="ltr" lang="en-US" class="ie9"> <![endif]-->
<!--[if (gte IE 10)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html dir="ltr" lang="en-US"><!--<![endif]-->
<head>
<!-- product -->
<!-- Google Tag Manager -->
<script>
  (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
  new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
  j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
  'https://www.googletagmanager.com/gtm.js?id='+i+dl+ '&gtm_auth=ZarF2Qyfj6o5KCl8wozoZA&gtm_preview=env-2&gtm_cookies_win=x';f.parentNode.insertBefore(j,f);
  })(window,document,'script','dataLayer','GTM-NL4KN8G');
</script>
<!-- End Google Tag Manager -->
<!-- quick throwaway until we get old GA code fully removed/changed -->
<script>
  function ga() {}
</script>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="telephone=no" name="format-detection"/>
<meta content="Jo

**Since the static HTML contains no rendered comments, we need to drive a real browser with Selenium**

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Set up Chromium options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run Chromium in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# # Path to your Chromium or ChromeDriver executable
# chromedriver_path = r"./chromedriver"  # Replace with your chromedriver path

# # Initialize the WebDriver
# service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(options=chrome_options)

# URL of the webpage you want to scrape
url = "https://www.viz.com/shonenjump/one-piece-chapter-1131/chapter/44524?action=read"  # Replace with your target URL

try:
    # Open the webpage inside the try block so the browser is always closed,
    # even when loading the page fails.
    driver.get(url)

    # Wait (up to 10 seconds) until the JavaScript-rendered button is clickable
    # instead of looking it up immediately after the page load.
    wait = WebDriverWait(driver, 10)
    see_more_button = wait.until(
        EC.element_to_be_clickable((By.ID, "see_more_posts"))
    )

    # Scroll to the button
    driver.execute_script("arguments[0].scrollIntoView(true);", see_more_button)

    see_more_button.click()
    time.sleep(2)  # give the newly requested comments time to render

    # Locate all divs with class "o_comment-content"
    comment_divs = driver.find_elements(By.CLASS_NAME, "o_comment-content")

    # Extract and print the content of each div
    for i, comment_div in enumerate(comment_divs, 1):
        comment_html = comment_div.get_attribute("outerHTML")

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(comment_html, "html.parser")

        # Combine all <p> tag text within the comment
        paragraphs = soup.find_all("p")
        if paragraphs:
            full_text = " ".join(p.get_text(strip=True) for p in paragraphs)
        else:
            # If no <p> tags, use the text content of the div.
            # comment_div is a Selenium WebElement and has no get_text();
            # the parsed BeautifulSoup object must be used here.
            full_text = soup.get_text(strip=True)

        print(f"Comment {i}:")
        print(full_text)
        print("-" * 50)
finally:
    # Close the browser
    driver.quit()


**Perform the above for all links**

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
from bs4 import BeautifulSoup

# Set up Chromium options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run Chromium in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")


urls = [
    "https://www.viz.com/shonenjump/one-piece-chapter-1131/chapter/44524?action=read",
    "https://www.viz.com/shonenjump/one-piece-chapter-1132/chapter/44747?action=read",
    "https://www.viz.com/shonenjump/one-piece-chapter-1133/chapter/44787?action=read",
]


def _extract_comment_text(comment_html):
    """Return the readable text of one comment given its outer HTML.

    All <p> tags are joined with single spaces; when the comment contains no
    <p> tag, the text content of the whole element is returned instead.
    """
    soup = BeautifulSoup(comment_html, "html.parser")
    paragraphs = soup.find_all("p")
    if paragraphs:
        return " ".join(p.get_text(strip=True) for p in paragraphs)
    # Fall back to the parsed element's text. The Selenium WebElement itself
    # has no get_text(), so the BeautifulSoup object is used.
    return soup.get_text(strip=True)


# One dict per scraped comment: {"chapter": <str>, "comments": <str>}
data = []

for url in urls:
    # Extract chapter number from the URL
    chapter_number = url.split("chapter-")[1].split("/")[0]
    print(chapter_number)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Load the page inside the try block so the browser is always closed
        driver.get(url)

        # Wait for the button to be clickable (up to 10 seconds per attempt)
        wait = WebDriverWait(driver, 10)
        while True:
            try:
                see_more_button = wait.until(
                    EC.element_to_be_clickable((By.ID, "see_more_posts"))
                )

                # Scroll to the button
                driver.execute_script("arguments[0].scrollIntoView(true);", see_more_button)

                see_more_button.click()
                time.sleep(2)  # give the next batch of comments time to render
            except WebDriverException:
                # Timeout, missing element or non-interactable element: stop
                # clicking and scrape whatever is loaded.
                print(f"No 'Load More' button for chapter {chapter_number} or no more clicks needed.")
                break

        # Locate all divs with class "o_comment-content"
        comment_divs = driver.find_elements(By.CLASS_NAME, "o_comment-content")

        # Extract and print the content of each div
        for i, comment_div in enumerate(comment_divs, 1):
            full_text = _extract_comment_text(comment_div.get_attribute("outerHTML"))

            print(f"Comment {i}:")
            print(full_text)
            print("-" * 50)

            data.append({"chapter": chapter_number, "comments": full_text})
    finally:
        driver.quit()

df = pd.DataFrame(data)

1131
No 'Load More' button for chapter 1131 or no more clicks needed.
Comment 1:
Is Loki the one who fell in love with Bigmom's daughter?
--------------------------------------------------
Comment 2:
I'm already excited that Loki is the final boss (or at least I think he is) of the Elbaph. The guy looks like a misdirecting schemer like Don Flamingo.
--------------------------------------------------
Comment 3:
I shaved my neckbeard off this morning.. So I can relate
--------------------------------------------------
Comment 4:
if Kidd is there, he's most likely in the underworld too and could ally with Loki
--------------------------------------------------
Comment 5:
That ending is definitely a misdirect that Oda likes to use in the beginning of arcs to create misunderstandings or side quests for the Straw Hats to take. The better question here is... is Dr. Saul actually sick or is it something else?
--------------------------------------------------
Comment 6:
Part of me thinks Luffy

In [53]:
# Show the scraped comments frame (columns "chapter" and "comments", one row per comment)
print(df)

    chapter                                           comments
0      1131  Is Loki the one who fell in love with Bigmom's...
1      1131  I'm already excited that Loki is the final bos...
2      1131  I shaved my neckbeard off this morning.. So I ...
3      1131  if Kidd is there, he's most likely in the unde...
4      1131  That ending is definitely a misdirect that Oda...
..      ...                                                ...
133    1133  Actual tears. As awesome as the action sequenc...
134    1133  Us 30 somethings getting moments of  closure l...
135    1133  I knew I was gonna cry when this happened. I j...
136    1133                              They finally reunited
137    1133  I wont forgive Holden for attacking like that!...

[138 rows x 2 columns]


**Sentiment Analysis**

In [56]:
from textblob import TextBlob

def analyze_sentiment(comment):
    """Score one comment with TextBlob and label the sign of that score.

    Returns a two-element pandas Series: the value of
    ``TextBlob(comment).sentiment.polarity`` followed by "Positive" when the
    score is above zero, "Negative" when it is below zero, and "Neutral"
    otherwise.
    """
    polarity = TextBlob(comment).sentiment.polarity

    if polarity > 0:
        label = "Positive"
    elif polarity < 0:
        label = "Negative"
    else:
        label = "Neutral"

    return pd.Series([polarity, label])

# Score every comment, then attach the two resulting columns to the frame
sentiment_columns = ["sentiment_score", "sentiment_category"]
sentiment_results = df["comments"].apply(analyze_sentiment)
df[sentiment_columns] = sentiment_results

print(df)

    chapter                                           comments  \
0      1131  Is Loki the one who fell in love with Bigmom's...   
1      1131  I'm already excited that Loki is the final bos...   
2      1131  I shaved my neckbeard off this morning.. So I ...   
3      1131  if Kidd is there, he's most likely in the unde...   
4      1131  That ending is definitely a misdirect that Oda...   
..      ...                                                ...   
133    1133  Actual tears. As awesome as the action sequenc...   
134    1133  Us 30 somethings getting moments of  closure l...   
135    1133  I knew I was gonna cry when this happened. I j...   
136    1133                              They finally reunited   
137    1133  I wont forgive Holden for attacking like that!...   

     sentiment_score sentiment_category  
0           0.500000           Positive  
1           0.025000           Positive  
2           0.000000            Neutral  
3           0.250000           Positive

**Extract keywords**

In [60]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

# Download necessary NLTK resources
# "stopwords" provides the list read by stopwords.words("english") in extract_keywords.
# "punkt" / "punkt_tab" are tokenizer data, presumably required by word_tokenize
# (which one is needed may depend on the installed NLTK version — TODO confirm).
# The last call is left as a bare expression, so its return value is the cell output.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex.vuong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alex.vuong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alex.vuong\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [61]:
def extract_keywords(comment):
    """Return the three most frequent keywords of a comment.

    The comment is lower-cased and tokenized with ``word_tokenize``; tokens
    that are not purely alphabetic or that appear in NLTK's English stopword
    list are dropped. The remaining words are ranked by frequency.

    Args:
        comment: The comment text to analyze.

    Returns:
        A pandas Series of exactly three entries holding the most common
        words, padded with ``None`` when fewer than three keywords remain.
    """
    # Tokenize the comment
    tokens = word_tokenize(comment.lower())

    # Build the stopword lookup once per call. Calling stopwords.words()
    # inside the comprehension re-created the list for every token and
    # tested membership against a list instead of a set.
    english_stopwords = set(stopwords.words("english"))

    # Remove stopwords and non-alphabetic tokens
    filtered_tokens = [
        word for word in tokens
        if word.isalpha() and word not in english_stopwords
    ]

    # Get the top 3 keywords by frequency
    keywords = [word for word, _ in Counter(filtered_tokens).most_common(3)]

    # Ensure exactly 3 entries (fill with None if less than 3 keywords)
    keywords += [None] * (3 - len(keywords))
    return pd.Series(keywords)

# Extract the keywords of every comment and store them in three new columns
keyword_columns = ["kw1", "kw2", "kw3"]
keyword_results = df["comments"].apply(extract_keywords)
df[keyword_columns] = keyword_results

# Display the updated DataFrame
print(df)

    chapter                                           comments  \
0      1131  Is Loki the one who fell in love with Bigmom's...   
1      1131  I'm already excited that Loki is the final bos...   
2      1131  I shaved my neckbeard off this morning.. So I ...   
3      1131  if Kidd is there, he's most likely in the unde...   
4      1131  That ending is definitely a misdirect that Oda...   
..      ...                                                ...   
133    1133  Actual tears. As awesome as the action sequenc...   
134    1133  Us 30 somethings getting moments of  closure l...   
135    1133  I knew I was gonna cry when this happened. I j...   
136    1133                              They finally reunited   
137    1133  I wont forgive Holden for attacking like that!...   

     sentiment_score sentiment_category      kw1         kw2         kw3  
0           0.500000           Positive     loki         one        fell  
1           0.025000           Positive     like     alre