<a href="https://colab.research.google.com/github/Webb2209/Animation/blob/main/WebScrapping001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the necessary libraries
!pip install requests beautifulsoup4 pandas

# Import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time



In [None]:
# The base URL for the first page
url = 'http://books.toscrape.com/catalogue/page-1.html'

try:
    # Send a GET request to the URL. An explicit timeout is passed because
    # requests waits indefinitely by default, which would hang the notebook
    # on a stalled connection.
    response = requests.get(url, timeout=10)

    # Raise an HTTPError (a RequestException subclass) for 4xx/5xx responses
    response.raise_for_status()
    print("Successfully fetched the webpage.")

except requests.exceptions.RequestException as e:
    print(f"Error fetching the page: {e}")
    # Re-raise so execution stops here: the following cells read `response`
    # and would otherwise fail later with a less helpful error (or silently
    # reuse a stale `response` left over from an earlier run).
    raise

Successfully fetched the webpage.


In [None]:
# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')
print("Successfully parsed the HTML content.")

# Preview only the beginning of the prettified HTML: it is enough to see the
# document structure without flooding the cell output with the whole page.
print(soup.prettify()[:1500])

Successfully parsed the HTML content.
<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:30" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="../static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="../static/osca

In [None]:
# Find the first book's container
first_book = soup.find('article', class_='product_pod')
if first_book is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("No <article class='product_pod'> element found on the page.")

# Find and print the title (stored in the link's `title` attribute)
title = first_book.h3.a['title']
print(f"Title: {title}")

# Find and print the price.
# The pound sign can arrive either as a plain '£' or as the mis-decoded pair
# 'Â£' (UTF-8 bytes read as Latin-1), depending on how the response text was
# decoded, so strip any leading run of those characters before converting.
price_str = first_book.find('p', class_='price_color').text
price = float(price_str.strip().lstrip('Â£'))
print(f"Price: {price}")

# Find and print the star rating class.
# The second entry of the element's class list is the rating word, e.g. 'Three'.
rating_class = first_book.find('p', class_='star-rating')['class'][1]
print(f"Rating Class: {rating_class}")

# Find and print the stock status.
# A CSS selector matches both classes regardless of their order in the
# attribute, unlike an exact match on the string 'instock availability'.
stock_status = first_book.select_one('p.instock.availability').text.strip()
print(f"Stock Status: {stock_status}")

Title: A Light in the Attic
Price: 51.77
Rating Class: Three
Stock Status: In stock


In [None]:
# Create an empty list to store all the scraped book data
all_books_data = []

# Find all book containers on the page
all_books = soup.find_all('article', class_='product_pod')

# Loop through each book container and extract the details
for book in all_books:
    try:
        title = book.h3.a['title']

        # Strip the currency prefix whether it was decoded as '£' or as the
        # mis-decoded pair 'Â£' (UTF-8 bytes read as Latin-1).
        price_str = book.find('p', class_='price_color').text
        price = float(price_str.strip().lstrip('Â£'))

        # We'll create a helper function for this in the next step
        # For now, let's just get the class name
        rating_class = book.find('p', class_='star-rating')['class'][1]

        # Match both classes via a CSS selector so class order does not matter
        stock_status = book.select_one('p.instock.availability').text.strip()

        # Append the data as a dictionary to our list
        all_books_data.append({
            'title': title,
            'price': price,
            'rating_class': rating_class,
            'stock': stock_status
        })
    except (AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
        # AttributeError/TypeError: an expected tag was not found (None);
        # KeyError/IndexError: an attribute or class entry is missing;
        # ValueError: the price text could not be converted to a float.
        print(f"Skipping a book due to missing data: {e}")

# Show a small sample rather than dumping every record into the output
print(all_books_data[:3])
print(f"Books collected: {len(all_books_data)}")

[{'title': 'A Light in the Attic', 'price': 51.77, 'rating_class': 'Three', 'stock': 'In stock'}, {'title': 'Tipping the Velvet', 'price': 53.74, 'rating_class': 'One', 'stock': 'In stock'}, {'title': 'Soumission', 'price': 50.1, 'rating_class': 'One', 'stock': 'In stock'}, {'title': 'Sharp Objects', 'price': 47.82, 'rating_class': 'Four', 'stock': 'In stock'}, {'title': 'Sapiens: A Brief History of Humankind', 'price': 54.23, 'rating_class': 'Five', 'stock': 'In stock'}, {'title': 'The Requiem Red', 'price': 22.65, 'rating_class': 'One', 'stock': 'In stock'}, {'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'price': 33.34, 'rating_class': 'Four', 'stock': 'In stock'}, {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'price': 17.93, 'rating_class': 'Three', 'stock': 'In stock'}, {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'price': 22.6, 'rating_class': 'Fou

In [None]:
# Helper function to convert star ratings
def get_star_rating(rating_class):
    """Translate a rating word ('One' .. 'Five') into its integer value.

    Args:
        rating_class: The rating word taken from the element's CSS classes.

    Returns:
        The matching integer from 1 to 5, or 0 when the word is not one of
        the five known rating names.
    """
    star_words = ('One', 'Two', 'Three', 'Four', 'Five')
    stars_by_word = {word: stars for stars, word in enumerate(star_words, start=1)}
    try:
        return stars_by_word[rating_class]
    except KeyError:
        # Unknown rating word: fall back to 0
        return 0

# Turn the scraped records into a DataFrame, add a numeric 'rating' column
# derived from 'rating_class', then drop the now-redundant class-name column.
df = (
    pd.DataFrame(all_books_data)
    .assign(rating=lambda books: books['rating_class'].apply(get_star_rating))
    .drop(columns=['rating_class'])
)

# Preview the first 5 rows and report how many books were collected
preview = df.head()
print(preview)
print(f"\nTotal books on this page: {len(df)}")

                                   title  price     stock  rating
0                   A Light in the Attic  51.77  In stock       3
1                     Tipping the Velvet  53.74  In stock       1
2                             Soumission  50.10  In stock       1
3                          Sharp Objects  47.82  In stock       4
4  Sapiens: A Brief History of Humankind  54.23  In stock       5

Total books on this page: 20


In [None]:
# Define the base URL without the page number
base_url = 'http://books.toscrape.com/'
all_scraped_data = []
page_number = 1
skipped_books = 0  # books whose markup could not be parsed

while True:
    url = f'{base_url}catalogue/page-{page_number}.html'
    print(f"Scraping page {page_number}...")

    try:
        # A failed request (including a 404 for a non-existent page) ends the loop.
        # The timeout prevents a stalled connection from hanging the crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"No more pages found or an error occurred: {e}")
        break

    # Find and scrape all books on the current page
    books = soup.find_all('article', class_='product_pod')
    for book in books:
        try:
            title = book.h3.a['title']

            # Strip the currency prefix whether it was decoded as '£' or as
            # the mis-decoded pair 'Â£' (UTF-8 bytes read as Latin-1).
            price_str = book.find('p', class_='price_color').text
            price = float(price_str.strip().lstrip('Â£'))

            rating_class = book.find('p', class_='star-rating')['class'][1]
            rating = get_star_rating(rating_class)

            # CSS selector matches both classes regardless of their order
            stock_status = book.select_one('p.instock.availability').text.strip()

            all_scraped_data.append({
                'title': title, 'price': price, 'rating': rating, 'stock': stock_status
            })
        except (AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
            # Keep going, but report the problem instead of hiding it so that
            # missing rows in the final DataFrame can be explained.
            skipped_books += 1
            print(f"Skipping a book on page {page_number} due to missing data: {e}")

    # Check for the 'next' button to determine if there are more pages
    next_button = soup.find('li', class_='next')
    if not next_button:
        print("Finished scraping all pages.")
        break

    page_number += 1
    time.sleep(1) # Be respectful

# Create the final DataFrame
final_df = pd.DataFrame(all_scraped_data)

# Display the final results
print("\n--- Final Scraped DataFrame ---")
print(final_df.head())
print(f"\nTotal books scraped: {len(final_df)}")
if skipped_books:
    print(f"Books skipped because of parsing problems: {skipped_books}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Finished 

In [None]:
# Assuming 'final_df' is your DataFrame from the previous step
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download necessary NLTK data.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer models used by word_tokenize from that package; on older
# releases the extra request only logs a message and is otherwise harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Function to clean text
_TEXT_TOOLS = {}  # lazily filled cache shared by all preprocess_text calls


def _get_text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        # Build both objects before storing either, so a failure cannot leave
        # the cache half-filled.
        tools = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _TEXT_TOOLS.update(tools)
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z and whitespace is
    removed, the result is tokenized with NLTK, English stop words and tokens
    of two characters or fewer are dropped, and the remaining tokens are
    lemmatized.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # The stop-word list and lemmatizer do not depend on `text`, so they are
    # created once and reused instead of being rebuilt for every row.
    stop_words, lemmatizer = _get_text_tools()

    # Convert to lowercase, then remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text.lower())
    # Tokenize the text (split into words)
    tokens = nltk.word_tokenize(text)
    # Remove stop words and very short tokens, and lemmatize the rest
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(clean_tokens)

# Store a cleaned version of every title in a new 'clean_title' column
final_df['clean_title'] = final_df['title'].map(preprocess_text)

# Show the original and cleaned titles side by side for the first rows
title_preview = final_df[['title', 'clean_title']].head()
print(title_preview)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                   title                      clean_title
0                   A Light in the Attic                      light attic
1                     Tipping the Velvet                   tipping velvet
2                             Soumission                       soumission
3                          Sharp Objects                     sharp object
4  Sapiens: A Brief History of Humankind  sapiens brief history humankind


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer; max_features caps the vocabulary at 1000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix
clean_titles = final_df['clean_title']
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_titles)

# Report the matrix dimensions (number of documents, number of features)
matrix_shape = tfidf_matrix.shape
print("TF-IDF Matrix Shape:", matrix_shape)

TF-IDF Matrix Shape: (1000, 1000)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the model should look for
num_topics = 5

# Vocabulary learned by the TF-IDF vectorizer, one word per matrix column
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create the LDA model with a fixed seed and fit it to the TF-IDF matrix
# (fit() returns the fitted estimator, so both steps chain into one statement).
# NOTE(review): LDA is commonly fit on raw term counts rather than TF-IDF
# weights - confirm that TF-IDF input is intended here.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(tfidf_matrix)

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print each topic's heading followed by its highest-weighted words.

    Args:
        model: Fitted model exposing `components_`, one row of word weights
            per topic.
        feature_names: Sequence mapping a column index to its word.
        no_top_words: How many of the highest-weighted words to print.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"Topic #{topic_number}:")
        # argsort is ascending, so reverse it to put the largest weights first
        strongest_first = weights.argsort()[::-1]
        top_words = [feature_names[idx] for idx in strongest_first[:no_top_words]]
        print(" ".join(top_words))

# Print the ten highest-weighted words of every fitted topic
display_topics(lda, feature_names, no_top_words=10)

Topic #1:
one book fruit basket vol without midnight note home new
Topic #2:
art love song harry potter dark girl fire city game
Topic #3:
life saga poem edition volume reason collected stay girl summer
Topic #4:
god world story project black thing kitchen lost love twilight
Topic #5:
girl secret time little great family light last red love


In [None]:
# Per-document topic distribution from the fitted LDA model
topic_probabilities = lda.transform(tfidf_matrix)

# Label each book with the index of its highest-probability topic
final_df['most_likely_topic'] = topic_probabilities.argmax(axis=1)

# Group once by topic and reuse the grouping for both summaries
books_by_topic = final_df.groupby('most_likely_topic')

# Example: mean price per topic
avg_price_by_topic = books_by_topic['price'].mean()
print("\nAverage Price by Topic:", avg_price_by_topic, sep="\n")

# Example: mean rating per topic
avg_rating_by_topic = books_by_topic['rating'].mean()
print("\nAverage Rating by Topic:", avg_rating_by_topic, sep="\n")


Average Price by Topic:
most_likely_topic
0    35.087454
1    36.516667
2    33.785236
3    34.553667
4    35.428166
Name: price, dtype: float64

Average Rating by Topic:
most_likely_topic
0    2.822878
1    3.031746
2    2.926702
3    2.827778
4    3.059172
Name: rating, dtype: float64


In [None]:
# Descriptive label for each topic number (the list position is the topic number)
topic_names = dict(enumerate([
    'Fiction & Fantasy',  # Example based on your potential top words
    'Historical & War',
    'Young Adult & Mystery',
    'Science & Non-Fiction',
    'Love & Romance',
]))

In [None]:
import pandas as pd

# Compute the per-topic averages directly from the scraped data instead of
# hard-coding numbers copied from an earlier output, so the summary always
# reflects the current contents of `final_df`.
topic_groups = final_df.groupby('most_likely_topic')
avg_price_by_topic = topic_groups['price'].mean()
avg_rating_by_topic = topic_groups['rating'].mean()

# Create a new DataFrame from the average price and rating.
# rename_axis(None) drops the 'most_likely_topic' index label so the printed
# table has a plain, unnamed index.
summary_df = pd.DataFrame({
    'Average Price': avg_price_by_topic,
    'Average Rating': avg_rating_by_topic
}).rename_axis(None)

# Add the descriptive topic names by mapping the index (topic number -> name)
summary_df['Topic Name'] = summary_df.index.map(topic_names)

# Reorder the columns for a cleaner display
summary_df = summary_df[['Topic Name', 'Average Price', 'Average Rating']]

# Display the final DataFrame
print(summary_df)

              Topic Name  Average Price  Average Rating
0      Fiction & Fantasy      35.087454        2.822878
1       Historical & War      36.516667        3.031746
2  Young Adult & Mystery      33.785236        2.926702
3  Science & Non-Fiction      34.553667        2.827778
4         Love & Romance      35.428166        3.059172
