In [9]:
!pip install requests beautifulsoup4 pandas



In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

In [13]:
# Download the necessary resources for sentence tokenization.
# Older NLTK releases load the pickled 'punkt' models, while NLTK 3.8.2+
# makes sent_tokenize load 'punkt_tab' instead; fetching both keeps the
# notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Substrings used as a blocklist: the ScreenRant and CBR extractors drop any
# tokenized sentence that contains one of these phrases (plain `in` check, so
# matching is case-sensitive and must reproduce the page text exactly,
# including typographic apostrophes).
# NOTE(review): the entries look like site UI notices and teaser blurbs for
# unrelated articles captured at scrape time -- confirm they are still current
# before reusing this list on other pages.
exclude_phrases = [
    "Your changes have been saved",
    "Email Is sent",
    "Please verify your email address.",
    "You’ve reached your account maximum for followed topics.",
    'Baskin-Robbins Japan prepares a limited seasonal lineup of delicious Pokemon-themed treats for its sixth annual "Poké Summer" campaign.',
    "Dragon Ball is rich in prolific transformations, but the franchise's non-canon material also introduces some creative new forms for characters!",
    "Studio Ghibli's official store has a new release in the form of My Neighbor Totoro card holders, which mimic the look of traditional Japanese amulets.",
    "Goku and his son, Gohan, are two of Dragon Ball's greatest heroes, but there are some major differences between the two Saiyans.",
    "Plenty of anime feature notable corsairs who share similar values & virtues to the Straw Hats & would be welcome additions to One Piece lore.",
    "As the Straw Hats celebrate success at the peak of their adventure, it's the perfect time to reflect on the origins of their trademark monikers.",
    "Dragon Ball GT is a very controversial show.",
    "However, the Japanese and American versions have major differences.",
    "Anime films like The End of Evangelion and The Garden of Sinners are shining examples of anime's ability to challenge viewers' perspectives.",
    "A prototype by toy developer Flame Toys is set to modernize the obscure Transformers: Zone Autobot hero Dai Atlas through a new action figure.",
    "The latest Solo Leveling: Arise update introduces new summer-themed costumes and weapons for popular Hunters Sung Jin-woo, Cha Hae-in and others.",
    "Dragon Ball Z’s brand of intergalactic action, whimsical sci-fi world and lovable characters made it the premier shonen battle manga of a generation.",
    "Solo Leveling webtoon sequel series, Solo Leveling: Ragnarok, debuts with over 2,000 pages, depicting main character Sung Suho and his partner Beru.",
    "Dragon Ball DAIMA is set to rewrite the series' status quo, but there are certain beloved Dragon Ball Super characters who are unlikely to appear!",
    "Avatar: The Last Airbender may be geared toward younger audiences, but that does not make it an innocent and carefree viewing experience.",
    "Dragon Ball DAIMA is set to redefine Akira Toriyama's signature shonen series, but there are already some exciting theories on what lies ahead!",
    "Shonen Jump's Sakamoto Days anime gets a release update, with Netflix dropping the controversial episode strategy that many fans have long hated.",
    "One Piece has many fantastic sagas that include exciting story arcs like Alabasta and Enies Lobby."
]

# Function to extract data from screenrant
def extract_data_from_screenrant(url, timeout=30):
    """Scrape a ScreenRant article and return its body text as sentences.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings taken from every
        ``<div class="content-block-regular">`` on the page. Sentences that
        contain an entry of ``exclude_phrases`` or start with ``"RELATED:"``
        are skipped. An empty list is returned when the response status is
        not 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []

    # The text is read from the article's content-block divs (not bare <p> tags)
    content_blocks = soup.find_all('div', class_="content-block-regular")

    for block in content_blocks:
        text = block.get_text(strip=True, separator=' ')
        for sent in sent_tokenize(text):  # Split the block into sentences
            if sent.startswith("RELATED:"):
                continue
            if any(phrase in sent for phrase in exclude_phrases):
                continue
            raw_data.append(sent)

    return raw_data

# Function to extract data from absolute anime
def extract_data_from_absolute_anime(url, timeout=30):
    """Scrape an Absolute Anime character page and return its description as sentences.

    Args:
        url: Address of the character page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings taken from every div that carries both the
        ``aa_section_content`` and ``aa_section_description`` classes.
        Sentences starting with ``"RELATED:"`` are skipped. An empty list is
        returned when the response status is not 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []

    # A CSS selector matches on the individual class names. The previous
    # find_all(class_='aa_section_content  aa_section_description') compared
    # against a string with two spaces, which can never equal the
    # single-space-joined class list BeautifulSoup builds, so nothing matched.
    sections = soup.select('div.aa_section_content.aa_section_description')

    for section in sections:
        # Read the <p> tags inside the section; if the section has none, fall
        # back to the section itself so its text is not silently lost.
        paragraphs = section.find_all("p") or [section]
        for paragraph in paragraphs:
            text = paragraph.get_text(strip=True, separator=' ')
            for sent in sent_tokenize(text):  # Split paragraph into sentences
                if not sent.startswith("RELATED:"):
                    raw_data.append(sent)

    return raw_data

# Function to extract data from CBR
def extract_data_from_cbr(url, timeout=30):
    """Scrape a CBR article and return the text of its paragraphs as sentences.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings taken from every ``<p>`` tag on the page.
        Sentences that contain an entry of ``exclude_phrases`` or start with
        ``"RELATED:"`` or ``"KEEP READING:"`` are skipped. When more than four
        sentences remain, the last four are dropped. An empty list is returned
        when the response status is not 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []

    # Extracting data from 'p' tags
    p_elements = soup.find_all('p')

    # Prefixes that mark cross-promotion lines rather than article text
    skipped_prefixes = ("RELATED:", "KEEP READING:")

    for paragraph in p_elements:
        text = paragraph.get_text(strip=True, separator=' ')
        for sent in sent_tokenize(text):  # Split paragraph into sentences
            if sent.startswith(skipped_prefixes):
                continue
            if any(phrase in sent for phrase in exclude_phrases):
                continue
            raw_data.append(sent)

    # Exclude the last 4 rows (only when more than 4 were collected).
    # NOTE(review): presumably these are footer/boilerplate sentences -- confirm
    # the count still fits the current page layout.
    if len(raw_data) > 4:
        raw_data = raw_data[:-4]

    return raw_data

# Pages to scrape, one per supported site
screenrant_url = 'https://screenrant.com/naruto-facts-trivia-hinata/'
absolute_anime_url = 'https://www.absoluteanime.com/naruto/hinata#'
cbr_url = 'https://www.cbr.com/naruto-hyuga-hinata-mbti-personality/'

# Run each site-specific extractor on its page
all_data_screenrant = extract_data_from_screenrant(screenrant_url)
all_data_absolute_anime = extract_data_from_absolute_anime(absolute_anime_url)
all_data_cbr = extract_data_from_cbr(cbr_url)

# Merge the three sentence lists, keeping the ScreenRant -> Absolute Anime -> CBR order
all_data = [*all_data_screenrant, *all_data_absolute_anime, *all_data_cbr]

# One sentence per row in a single 'Sentence' column; printed in full without the column header
df_extracted = pd.DataFrame(all_data, columns=['Sentence'])
print(df_extracted.to_string(header=False))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0                                                                                                                                                                               When introduced in the Naruto manga and anime, Hinata Hyuga was a shy girl trying her best to master shinobi skills.
1                                                                                                                                                                                                    Her quiet attitude and desire to avoid conflict made her an unlikely ninja, but she persevered.
2                                                                                                                                                                                                               She’s a fixture in the franchise, and its spinoff, Boruto: Naruto Next Generations .
3                                                                                                                        

# Data Cleaning

In [14]:
# Count the missing values in each column of the extracted sentences
null_summary = df_extracted.isna().sum()

# Report the per-column counts
print("Summary of null values in each column:")
print(null_summary)

Summary of null values in each column:
Sentence    0
dtype: int64


In [15]:
# Boolean mask marking rows that repeat an earlier row across all columns
# (the first occurrence of each row is not marked)
duplicates = df_extracted.duplicated()

# Show only the rows marked as repeats
print("Duplicate Rows:")
print(df_extracted.loc[duplicates])

Duplicate Rows:
Empty DataFrame
Columns: [Sentence]
Index: []


In [16]:
# Save the extracted sentences to a CSV file: one sentence per row,
# without the row index or the column header
df_extracted.to_csv('hinata_RAG.csv', index=False, header=False)

# Write the same rows to a .txt file. The index and header are disabled here
# as well so the text file holds only the sentences, matching the CSV; before,
# it also contained a header line and a leading row-number column.
df_extracted.to_csv('hinata_RAG.txt', sep='\t', index=False, header=False)