In [None]:
!pip install requests beautifulsoup4 pandas



In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

# Download the Punkt data that sent_tokenize needs.
# NLTK >= 3.8.2 loads the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so fetch both to keep sentence tokenization working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def extract_data_from_deviant_art(url, timeout=10):
    """Download a page and return the text of its section paragraphs as sentences.

    Only <p> elements nested inside <section class="_2Pl4_"> elements are read.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can wait indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings in document order, or an empty list when the
        response status is not 200.

    Network errors and timeouts raised by requests.get propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): "_2Pl4_" looks like a generated CSS class name and may stop
    # matching if the site's markup changes — confirm it still selects the text.
    raw_data = []
    for section in soup.find_all('section', class_="_2Pl4_"):
        for paragraph in section.find_all('p'):
            # Join nested text nodes with a space so words do not run together
            text = paragraph.get_text(strip=True, separator=' ')
            raw_data.extend(sent_tokenize(text))

    return raw_data

def extract_data_from_weebly(url, timeout=10):
    """Download a page and return the text of its 'paragraph' divs as sentences.

    Only <div class="paragraph"> elements are read.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can wait indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings in document order, or an empty list when the
        response status is not 200.

    Network errors and timeouts raised by requests.get propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []
    for block in soup.find_all('div', class_='paragraph'):
        # Join nested text nodes with a space so words do not run together
        text = block.get_text(strip=True, separator=' ')
        raw_data.extend(sent_tokenize(text))

    return raw_data

def extract_data_from_p(url, timeout=10):
    """Download a page and return the text of every <p> element as sentences.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can wait indefinitely on an unresponsive host.

    Returns:
        A list of sentence strings in document order, or an empty list when the
        response status is not 200.

    Network errors and timeouts raised by requests.get propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []
    for paragraph in soup.find_all('p'):
        # Join nested text nodes with a space so words do not run together
        text = paragraph.get_text(strip=True, separator=' ')
        raw_data.extend(sent_tokenize(text))

    return raw_data

# Source pages, one per site-specific extractor
base_url_deviant = 'https://www.deviantart.com/blusilvrpaladin/art/Character-Analysis-Hinata-Hyuuga-363323793'
base_url_p = 'https://wiki.sportskeeda.com/naruto/who-is-hinata-huyga'
base_url_weebly = 'https://hinatas.weebly.com/personality.html'

# Scrape each page into a list of sentences
all_data_deviant = extract_data_from_deviant_art(base_url_deviant)
all_data_p = extract_data_from_p(base_url_p)
all_data_weebly = extract_data_from_weebly(base_url_weebly)

# Concatenate the three lists, keeping the order in which they were scraped
all_data = [*all_data_deviant, *all_data_p, *all_data_weebly]

# One sentence per row in a single 'Sentence' column
df_extracted = pd.DataFrame(all_data, columns=['Sentence'])

# Remove the rows at positions 153 through 158.
# NOTE(review): these positions are hard-coded and depend on what the pages
# returned when they were scraped — confirm they still point at the intended rows.
df_extracted = df_extracted.drop(index=df_extracted.index[153:159])

# Print every remaining row in full, without the column header
print(df_extracted.to_string(header=False))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0                                                                                                                                                                                                                                                                                                                                                                    Hinata Hyuga is a Naruto anime series character who belongs to the mightiest Hyuga clan of Hidden Leaf.
1                                                                                                                                                                                                                                                                                                                                                                                                               She is a fan favorite female character in the Naruto series.
2                                                                             

# Data Cleaning

In [None]:
# Count the missing values in each column
null_summary = df_extracted.isna().sum()

# Report the per-column counts under a heading line
print("Summary of null values in each column:", null_summary, sep="\n")

Summary of null values in each column:
Sentence    0
dtype: int64


In [None]:
# Boolean mask over all columns: True for each row that repeats an earlier row
duplicates = df_extracted.duplicated()

# Show only the flagged rows under a heading line
print("Duplicate Rows:", df_extracted.loc[duplicates], sep="\n")

Duplicate Rows:
    Sentence
100       A.


In [None]:
# Drop repeated rows, then renumber the index from 0 and discard the old labels
df_extracted = df_extracted.drop_duplicates().reset_index(drop=True)

# Display the de-duplicated frame
df_extracted

Unnamed: 0,Sentence
0,Hinata Hyuga is a Naruto anime series characte...
1,She is a fan favorite female character in the ...
2,Hinata Hyuga is a Kunoichi of Hidden Leaf vill...
3,"Still, because she is not meant to be the lead..."
4,She was very nervous and had no motivation to ...
...,...
118,"Hinata is also the closest to her sensei , who..."
119,She knows Hinata's personal struggles and unli...
120,After Hinata was knocked down for the final ti...
121,"Like Kiba, Kurenai is aware of her affection t..."


In [None]:
# Write the cleaned sentences as CSV, with no index column and no header row
df_extracted.to_csv('hinata_personality.csv', header=False, index=False)

# Write the same frame again as tab-separated text (this re-serialises the
# DataFrame; it does not read the CSV back).
# NOTE(review): unlike the CSV written just before, this file includes the index
# column and the header row — confirm that difference is intended.
df_extracted.to_csv('hinata_personality.txt', sep='\t', index=True, header=True)