In [1]:
!pip install requests beautifulsoup4 pandas



In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

In [17]:
# Download the necessary resources for sentence tokenization.
# Older NLTK releases load the 'punkt' models, while recent releases load
# 'punkt_tab' instead, so fetch both; a package that is already present is
# reported as up-to-date and not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to extract data from a single page
def extract_data_from_page(url, timeout=10):
    """Fetch a web page and return the sentences found in its <p> tags.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        A list of sentence strings in document order. Sentences containing one
        of the site's account/notification boilerplate phrases, or starting
        with "RELATED:", are skipped. An empty list is returned when the
        request fails (network error, timeout) or the response status is
        not 200.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures the same way as a non-200 response.
        return []
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # UI boilerplate that appears inside <p> tags but is not article text.
    exclude_phrases = (
        "Your changes have been saved",
        "Email Is sent",
        "Please verify your email address.",
        "You’ve reached your account maximum for followed topics.",
    )

    raw_data = []
    for paragraph in soup.find_all('p'):
        # Each text fragment is stripped and the fragments are joined with a
        # single space, so text split by inline tags gets a space at the seam.
        text = paragraph.get_text(strip=True, separator=' ')
        for sent in sent_tokenize(text):  # Split paragraph into sentences
            if sent.startswith("RELATED:"):
                continue
            if any(phrase in sent for phrase in exclude_phrases):
                continue
            raw_data.append(sent)

    return raw_data

# Scrape the article and collect its sentences
base_url = 'https://www.cbr.com/naruto-hinata-hyuga-unexplored-potential/'
all_data = extract_data_from_page(base_url)

# Put the sentences in a single-column frame and print every row,
# keeping the row index but omitting the column header
df_extracted = pd.DataFrame({'Sentence': all_data}, columns=['Sentence'])
rendered_rows = df_extracted.to_string(header=False)
print(rendered_rows)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0                                                                                                                                         One of Naruto 's many strengths is its large cast of  interesting side characters.
1                                                                          Kishimoto gives the Konoha 11, the then-latest generation of Konoha ninjas, varied backstories and personalities that add to Naruto 's rich lore.
2        One such character is Hinata Hyuga, former heiress of the prestigious Hyuga clan who was deemed too soft to be a clan leader, and she later weds Naruto, her childhood love, and becomes powerful in her own right.
3                             With such an intriguing background, Hinata's character had enormous potential but, like many of Naruto 's female characters , Kishimoto severely missed the mark in developing Hinata's story.
4                                                                                                                   

In [18]:
# Count missing values per column of the extracted frame
null_summary = df_extracted.isna().sum()

# Report the per-column counts
print("Summary of null values in each column:")
print(null_summary)

Summary of null values in each column:
Sentence    0
dtype: int64


In [19]:
# Boolean mask marking rows that repeat an earlier row (all columns compared)
duplicates = df_extracted.duplicated()

# Show only the repeated rows
print("Duplicate Rows:")
print(df_extracted.loc[duplicates])

Duplicate Rows:
Empty DataFrame
Columns: [Sentence]
Index: []


In [22]:
# Write the sentences to CSV, one per row, with no index column and no header row
csv_path = 'hinata_news_article.csv'
df_extracted.to_csv(csv_path, header=False, index=False)

# Write a second, tab-separated copy with a .txt extension. This call keeps
# pandas' defaults, so the file includes the row index and the 'Sentence' header.
txt_path = 'hinata_news_article.txt'
df_extracted.to_csv(txt_path, sep='\t')