<a href="https://colab.research.google.com/github/PierceWind/Data-Scraping/blob/main/activity2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install beautifulsoup4
!pip install requests
!pip install pandas



In [31]:
import csv
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pages to scrape, visited in this order
urls = [
    'https://www.cbr.com/best-quotes-from-netflix-wednesday/',
    'https://screenrant.com/wednesday-addams-family-best-quotes-movies-shows/',
]

In [21]:
# Empty frame declaring the Title / Quote / URL schema of the scraped table.
# The middle column is 'Quote' (singular) so it agrees with the column name
# the rest of the notebook builds and reads.
Dataframed = pd.DataFrame(columns=['Title', 'Quote', 'URL'])

In [34]:
# Parallel accumulators for the scrape: position i of each list holds the
# heading text, the quote text and the source URL of the same row
h2_texts, quotes, url_list = [], [], []


# Function to scrape h2 elements and quotes from a given URL
def scrape_data_from_url(url, retries=3):
    """Scrape headings and pull-quotes from one page and record them.

    Downloads ``url``, collects the text of every ``<h2>`` element and of every
    ``<p>`` found inside ``<section>`` elements whose class is
    ``emaki-custom-block emaki-custom-pullquote``, then pairs headings and
    quotes purely by position (first heading with first quote, and so on).
    Each pair is appended to the module-level lists ``h2_texts``, ``quotes``
    and ``url_list``. Whichever side runs out first is padded with an empty
    string, so the three lists always grow by the same number of entries.

    Args:
        url: Address of the page to scrape.
        retries: Maximum number of download attempts before giving up.

    Returns:
        None. Results are delivered through the module-level lists. If every
        attempt fails, nothing is appended and a message is printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(retries):
        try:
            # Fetch the page; the timeout keeps a stalled server from hanging the cell
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Turn HTTP error statuses into exceptions
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Pause only when another attempt will actually follow
            if attempt + 1 < retries:
                time.sleep(2)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): get_text(strip=True) joins nested text nodes with no
        # separator, which may be why titles come out like "20Wednesday ..." —
        # confirm against the page markup before changing.
        headings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        quote_sections = soup.find_all('section', class_='emaki-custom-block emaki-custom-pullquote')
        quote_texts = [
            p.get_text(strip=True)
            for section in quote_sections
            for p in section.find_all('p')
        ]

        # Pair by position, padding the shorter side with '' so that a heading
        # without a quote (or a quote without a heading) still yields a row
        for position in range(max(len(headings), len(quote_texts))):
            h2_texts.append(headings[position] if position < len(headings) else '')
            quotes.append(quote_texts[position] if position < len(quote_texts) else '')
            url_list.append(url)

        print(f"Scraping completed for: {url}")
        return  # Success: no further attempts needed

    # Every attempt failed (or retries was 0): say so instead of returning silently
    print(f"Giving up on {url} after {retries} attempt(s)")

# Run the scraper over every configured page; rows accumulate in the shared lists
for url in urls:
    scrape_data_from_url(url)

# Assemble the three parallel lists into a single table
df = pd.DataFrame(dict(Title=h2_texts, Quote=quotes, URL=url_list))

# Write the table to disk without the index, quoting every non-numeric field
df.to_csv('source_combined.csv', quoting=csv.QUOTE_NONNUMERIC, index=False)

print("Data saved to source_combined.csv")

Scraping completed for: https://www.cbr.com/best-quotes-from-netflix-wednesday/
Scraping completed for: https://screenrant.com/wednesday-addams-family-best-quotes-movies-shows/
Data saved to source_combined.csv


In [35]:
# Show the scraped table as this cell's output
df

Unnamed: 0,Title,Quote,URL
0,20Wednesday Expresses Her Distaste For Colorfu...,It looks like a rainbow vomited on your side.,https://www.cbr.com/best-quotes-from-netflix-w...
1,19Wednesday Is a Better Friend Than She Appear...,"If he breaks your heart, I'll nail-gun his.",https://www.cbr.com/best-quotes-from-netflix-w...
2,18Wednesday Loves Revenge More Than Anything Else,"I don't bury hatchets, I sharpen them.",https://www.cbr.com/best-quotes-from-netflix-w...
3,17Wednesday Proves Her Humble Nature,Every day is all about me. This one just comes...,https://www.cbr.com/best-quotes-from-netflix-w...
4,16Wednesday Is an Existentialist,"Sartre said, ""Hell is other people."" He was my...",https://www.cbr.com/best-quotes-from-netflix-w...
5,15Wednesday Proves She is a Child of Woe,They haven't built one strong enough to hold me.,https://www.cbr.com/best-quotes-from-netflix-w...
6,14Wednesday Comments on the Secret Society's P...,It's amateurs like you who give kidnapping a b...,https://www.cbr.com/best-quotes-from-netflix-w...
7,13Wednesday's Cynicism Spreads Even Towards So...,I find social media to be a soul-sucking void ...,https://www.cbr.com/best-quotes-from-netflix-w...
8,12Wednesday Shows Her Animosity Towards Religion,Why am I sharing this apocalypse with a pilgrim?,https://www.cbr.com/best-quotes-from-netflix-w...
9,11Wednesday Only Believes in the Power of Revenge,"I don't believe in heaven or hell, but I do be...",https://www.cbr.com/best-quotes-from-netflix-w...


In [36]:
# Preview the first rows of the scraped table
df.head()


Unnamed: 0,Title,Quote,URL
0,20Wednesday Expresses Her Distaste For Colorfu...,It looks like a rainbow vomited on your side.,https://www.cbr.com/best-quotes-from-netflix-w...
1,19Wednesday Is a Better Friend Than She Appear...,"If he breaks your heart, I'll nail-gun his.",https://www.cbr.com/best-quotes-from-netflix-w...
2,18Wednesday Loves Revenge More Than Anything Else,"I don't bury hatchets, I sharpen them.",https://www.cbr.com/best-quotes-from-netflix-w...
3,17Wednesday Proves Her Humble Nature,Every day is all about me. This one just comes...,https://www.cbr.com/best-quotes-from-netflix-w...
4,16Wednesday Is an Existentialist,"Sartre said, ""Hell is other people."" He was my...",https://www.cbr.com/best-quotes-from-netflix-w...


In [39]:
# Write the scraped table to a second CSV file, omitting the index column
# NOTE(review): "DND_Spells" does not describe quote data — confirm the intended filename
df.to_csv("DND_Spells.csv", index = False)

In [42]:
# Empty question/answer table; the loops below append one row at a time
Instructions = pd.DataFrame(columns=['Instruction', 'Output'])

In [43]:
# One question/answer pair per scraped row: ask for the quote by its heading
for title, quote in zip(df['Title'], df['Quote']):
    next_label = len(Instructions.index)  # append after the current last row
    Instructions.loc[next_label] = ["What is the famous quote of " + title + "?", quote]

In [45]:
# Preview the first generated instruction/output pairs
Instructions.head()

Unnamed: 0,Instruction,Output
0,What is the famous quote of 20Wednesday Expres...,It looks like a rainbow vomited on your side.
1,What is the famous quote of 19Wednesday Is a B...,"If he breaks your heart, I'll nail-gun his."
2,What is the famous quote of 18Wednesday Loves ...,"I don't bury hatchets, I sharpen them."
3,What is the famous quote of 17Wednesday Proves...,Every day is all about me. This one just comes...
4,What is the famous quote of 16Wednesday Is an ...,"Sartre said, ""Hell is other people."" He was my..."


In [57]:
# One question/answer pair per scraped row: ask which page a quote came from
for quote, source_url in zip(df['Quote'], df['URL']):
    question = "From what source does this quote cited - " + quote + "?"
    next_label = len(Instructions.index)  # append after the current last row
    Instructions.loc[next_label] = [question, source_url]

In [55]:
# Adds the same question once per scraped row; the answer is the row's quote,
# or a fallback sentence built from the title when the quote is empty.
# NOTE(review): the fallback fires on an empty *quote* yet talks about a missing
# *title*, and every row repeats one identical question — confirm the intent.
for title, quote in zip(df['Title'], df['Quote']):
    answer = quote if quote != "" else "The quote has no title which is the " + title
    next_label = len(Instructions.index)  # append after the current last row
    Instructions.loc[next_label] = ["Is there a quote that has no title?", answer]

In [51]:
# Export the instruction/output pairs, one JSON object per row
Instructions.to_json('DND_Instructions.json', orient="records")

In [52]:
# Empty fact table; the loops below append one (fact, source URL) row at a time
Knowledge = pd.DataFrame(columns=['Fact', 'URL'])

In [53]:
# One fact per scraped row linking the heading to its quote
for index, row in df.iterrows():
    # " is " needs its trailing space, otherwise the quote is fused onto "is"
    fact = "The famous iconic quote that explains " + row['Title'] + " is " + row['Quote'] + "."
    Knowledge.loc[len(Knowledge.index)] = [fact, row['URL']]

In [54]:
# One fact per scraped row stating which page the quote was taken from
for quote, source_url in zip(df['Quote'], df['URL']):
    sentence = "The quote, " + quote + " is from " + source_url + "."
    next_label = len(Knowledge.index)  # append after the current last row
    Knowledge.loc[next_label] = [sentence, source_url]

In [58]:
# One fact per scraped quote: either note that the quote has no heading, or
# tie the heading to the quote it introduces
for index, row in df.iterrows():
    title, quote = row['Title'], row['Quote']
    if quote == "":
        # A heading without a quote has nothing to state a fact about
        continue
    if title == "":
        # An untitled quote is a row whose Title is empty; the sentence names the quote
        fact = "The Quote that says  " + quote + " is untitled."
    else:
        # Space after "that" keeps the heading from being fused onto it
        fact = "It is believed that " + title + " which is why she iconically says: " + quote
    Knowledge.loc[len(Knowledge.index)] = [fact, row['URL']]

In [59]:
# Export the fact table, one JSON object per row
Knowledge.to_json('DND_Knowledge.json', orient="records")