# Acquire Data through Web Scraping

In [2]:
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

### 1) 
Codeup Blog Articles

Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article.   
The shape of each dictionary should look like this:  
{   
    'title': 'the title of the article',  
    'content': 'the full text content of the article'  
}  


In [3]:
# Define the URL to the webpage you want to scrape
url = 'https://codeup.edu/blog'
# Define the User-Agent header to be used in the HTTP request
headers = {'User-Agent': 'Codeup Data Science'}

In [4]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [5]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# Find all the HTML elements with the 'h2' tag in the parsed HTML content
links = soup.find_all('h2')

In [7]:
# Iterate through each 'h2' element in the 'links' list
for article in links:
    # Find the first 'a' (anchor) element within each 'h2' element and print it
    print(article.find('a'))

<a href="https://codeup.edu/featured/apida-heritage-month/">Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa</a>
<a href="https://codeup.edu/featured/women-in-tech-panelist-spotlight/">Women in tech: Panelist Spotlight – Magdalena Rahn</a>
<a href="https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/">Women in tech: Panelist Spotlight – Rachel Robbins-Mayhill</a>
<a href="https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/">Women in Tech: Panelist Spotlight – Sarah Mellor</a>
<a href="https://codeup.edu/events/women-in-tech-madeleine/">Women in Tech: Panelist Spotlight – Madeleine Capper</a>
<a href="https://codeup.edu/codeup-news/panelist-spotlight-4/">Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia</a>
None


In [8]:
# Initialize an empty list to store the extracted links
new_links = []

# Loop through each 'article' element in the 'links' list
for article in links:
    # Check if the 'article' contains an 'a' (anchor) element
    if article.find("a"):
        # Print the 'href' attribute of the 'a' element (the link)
        print(article.find("a").get("href"))
        # Append the link to the 'new_links' list
        new_links.append(article.find("a").get("href"))

new_links      

https://codeup.edu/featured/apida-heritage-month/
https://codeup.edu/featured/women-in-tech-panelist-spotlight/
https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/
https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/
https://codeup.edu/events/women-in-tech-madeleine/
https://codeup.edu/codeup-news/panelist-spotlight-4/


['https://codeup.edu/featured/apida-heritage-month/',
 'https://codeup.edu/featured/women-in-tech-panelist-spotlight/',
 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/',
 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/',
 'https://codeup.edu/events/women-in-tech-madeleine/',
 'https://codeup.edu/codeup-news/panelist-spotlight-4/']

## 1st Blog Post

In [9]:
# Get the first URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[0]

In [10]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [11]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [12]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find("h1").get_text()
title

'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa'

In [13]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select(".entry-content")[0].find_all("p")

In [14]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())

In [15]:
# Join the cleaned content in the 'clean_content' list into a single string
content = ''.join(clean_content)

In [16]:
# Create a dictionary 'heritage_blog' with 'title' and 'content' as keys
heritage_blog = {
    'title': title,      # Store the title in the 'title' key
    'content': content   # Store the content in the 'content' key
}

In [17]:
heritage_blog

{'title': 'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa',
 'content': 'May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.In an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.Arbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.At Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American individu

## 2nd Blog Post

In [18]:
# Get the second URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[1]

In [19]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [20]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [21]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find('h1').get_text()
title

'Women in tech: Panelist Spotlight – Magdalena Rahn'

In [22]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select('.entry-content')[0].find_all('p')

In [23]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())

In [24]:
# Join the cleaned content in the 'clean_content' list into a single string
content = ''.join(clean_content)

In [25]:
# Create a dictionary 'WIT_MR' with 'title' and 'content' as keys
WIT_MR = {
    'title': title,      # Store the title in the 'title' key
    'content': content   # Store the content in the 'content' key
}

In [26]:
WIT_MR

{'title': 'Women in tech: Panelist Spotlight – Magdalena Rahn',
 'content': 'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!Meet Magdalena!Magdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.We asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data scien

## 3rd Blog post

In [27]:
# Get the third URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[2]

In [28]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [29]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [30]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find('h1').get_text()

In [31]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select('.entry-content')[0].find_all('p')

In [32]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())

In [33]:
content = ''.join(clean_content)

In [34]:
# Create a dictionary 'WIT_RRM' with 'title' and 'content' as keys
WIT_RRM = {
    'title': title,      # Store the title in the 'title' key
    'content': content   # Store the content in the 'content' key
}

In [35]:
WIT_RRM

{'title': 'Women in tech: Panelist Spotlight – Rachel Robbins-Mayhill',
 'content': 'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry! Meet Rachel!\nRachel Robbins-Mayhill is a Decision Science Analyst I in San Antonio, Texas. Rachel has had a varied career that includes counseling, teaching, training, community development, and military operations. Her focus has always been on assessing needs, identifying solutions, and educating individuals and groups on aligning needs and solutions in different contexts. Rachel’s passion for data science stems from her belief that data is a powerful tool for communicating patterns that can lead to hope and growth in the future.In June 2022, Rachel graduated from Codeup’s Innis cohort, where she honed her skills in data science. Sho

## 4th Blog Post

In [36]:
# Get the fourth URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[3]

In [37]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [38]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [39]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find('h1').get_text()
title

'Women in Tech: Panelist Spotlight – Sarah Mellor'

In [40]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select('.entry-content')[0].find_all('p')

In [41]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())

In [42]:
content = ''.join(clean_content)

In [43]:
WIT_SM = {'title' : title,
          'content' : content
    
}

In [44]:
WIT_SM

{'title': 'Women in Tech: Panelist Spotlight – Sarah Mellor',
 'content': 'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!Meet Sarah!Sarah Mellor currently works as the Director of People Operations. She joined Codeup four and a half years ago as an Admissions Manager. She went on to build out and lead the Marketing and Admissions team, while picking up People Ops tasks and projects here and there until moving over to lead the People Ops team two years ago. Prior to Codeup, she worked at education-focused non-profits in Washington, DC and Boulder, Colorado. She graduated from Wake Forest University.We asked Sarah how Codeup has impacted her career, and her response was “I have absolutely loved having the privilege to grow alongside Codeup. In my time here across mul

## 5th Blog Post

In [45]:
# Get the fourth URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[4]

In [46]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
response

<Response [200]>

In [47]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [48]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find('h1').get_text()
title

'Women in Tech: Panelist Spotlight – Madeleine Capper'

In [49]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select('.entry-content')[0].find_all('p')

In [50]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())

In [51]:
content = ''.join(clean_content)

In [52]:
WIT_MC = { 'title' : title,
          'content' : content
    
}

In [53]:
WIT_MC

{'title': 'Women in Tech: Panelist Spotlight – Madeleine Capper',
 'content': 'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!Meet Madeleine!Madeleine Capper is a Data Scientist in San Antonio, Texas. A long-standing San Antonio resident, she studied mathematics at the University of Texas San Antonio and has worked as a Data Scientist for Booz Allen Hamilton. Madeleine currently teaches Data Science at Codeup, where she works daily with burgeoning data professionals to help them actualize their career aspirations through technical education.Madeleine attended Codeup as a student in early 2019 as a pupil in the very first Codeup Data Science cohort. The program proved immediately effective and she was the first student to obtain a data career out of the program. Afte

In [54]:
title[0:13]

'Women in Tech'

### function

In [55]:
def get_new_links(url, headers, sc1, sc2, sc3):
    # Send an HTTP GET request to the specified URL with the defined User-Agent header
    response = requests.get(url, headers=headers)
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all the HTML elements with the 'h2' tag in the parsed HTML content
    links = soup.find_all(sc1)
    # Initialize an empty list to store the extracted links
    new_links = []

    # Loop through each 'article' element in the 'links' list
    for article in links:
        # Check if the 'article' contains an 'a' (anchor) element
        if article.find(sc2):
            # Print the 'href' attribute of the 'a' element (the link)
            print(article.find(sc2).get(sc3))
            # Append the link to the 'new_links' list
            new_links.append(article.find(sc2).get(sc3))

    return new_links

In [56]:
url = 'https://codeup.edu/blog'
headers = {'User-Agent': 'Codeup Data Science'}
new_links = get_new_links(url, headers, 'h2', 'a', 'href')
print(new_links)

https://codeup.edu/featured/apida-heritage-month/
https://codeup.edu/featured/women-in-tech-panelist-spotlight/
https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/
https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/
https://codeup.edu/events/women-in-tech-madeleine/
https://codeup.edu/codeup-news/panelist-spotlight-4/
['https://codeup.edu/featured/apida-heritage-month/', 'https://codeup.edu/featured/women-in-tech-panelist-spotlight/', 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/', 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/', 'https://codeup.edu/events/women-in-tech-madeleine/', 'https://codeup.edu/codeup-news/panelist-spotlight-4/']


In [57]:
def get_article_data(new_links, headers):
    article_list = []
    
    for url in new_links:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.find('h1')
        if title and title.get_text() != "Example Domain":
            title_text = title.get_text()
        else:
            title_text = "Title Not Found"

        # Extract the content
        content = soup.select('.entry-content')
        if content:
            content_text = ''.join([p.get_text() for p in content[0].find_all('p')])
        else:
            content_text = "Content Not Found"

        article_data = {
            'title': title_text,
            'content': content_text
        }

        article_list.append(article_data)
    
    return article_list

In [58]:
get_article_data(new_links, headers)

[{'title': 'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa',
  'content': 'May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.In an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.Arbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.At Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American indivi

### 2)
News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

Business  
Sports  
Technology  
Entertainment    
The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:
{   
    'title': 'The article title',   
    'content': 'The article content',  
    'category': 'business' # for example  
}  


In [59]:
# Define the URL to the webpage you want to scrape
url = 'https://inshorts.com/'
# Define the User-Agent header to be used in the HTTP request
headers = {'User-Agent': 'Codeup Data Science'}

In [60]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)

In [61]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [62]:
# Find all the HTML elements with the 'h2' tag in the parsed HTML content
links = soup.find_all('div')

In [63]:
# Iterate through each 'h2' element in the 'links' list
for article in links:
    # Find the first 'a' (anchor) element within each 'h2' element and print it
    print(article.find('a'))

<a href="https://blog.inshorts.com/" target="_blank">Blog</a>
<a href="https://blog.inshorts.com/" target="_blank">Blog</a>
<a href="https://blog.inshorts.com/" target="_blank">Blog</a>
<a href="https://blog.inshorts.com/" target="_blank">Blog</a>
None
<a href="https://blog.inshorts.com/" target="_blank">Blog</a>
<a href="/en/read" target="_blank">Read Now</a>
<a href="https://itunes.apple.com/us/app/news-in-shorts/id892146527" target="_blank"><div class="ios_container"></div></a>
None
None
<a href="https://itunes.apple.com/us/app/news-in-shorts/id892146527" target="_blank"><div class="ios_container"></div></a>
None
None
None
None
<a href="https://itunes.apple.com/us/app/news-in-shorts/id892146527" target="_blank"><div class="ios_container"></div></a>
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Non

In [64]:
# Initialize an empty list to store the extracted links
new_links = []

# Loop through each 'article' element in the 'links' list
for article in links:
    # Check if the 'article' contains an 'a' (anchor) element
    if article.find("a"):
        # Print the 'href' attribute of the 'a' element (the link)
        print(article.find("a").get("href"))
        # Append the link to the 'new_links' list
        new_links.append(article.find("a").get("href"))

https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
/en/read
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
/tnc
/tnc
https://facebook.com/inshortsapp


In [65]:
new_links

['https://blog.inshorts.com/',
 'https://blog.inshorts.com/',
 'https://blog.inshorts.com/',
 'https://blog.inshorts.com/',
 'https://blog.inshorts.com/',
 '/en/read',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 'https://itunes.apple.com/us/app/news-in-shorts/id892146527',
 '/tnc',
 '/tnc',
 'https://facebook.com/inshortsapp']

### Business

In [66]:
# Get the fourth URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[0]

In [67]:
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)

In [68]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [69]:
# Find all the HTML elements with the 'h1' tag in the parsed HTML content and assign it to title
title = soup.find('h2').get_text()

In [70]:
# Select the 'entry-content' class within the parsed HTML content and retrieve the first element ([0])
# Within the selected content, find all 'p' (paragraph) elements and store them in the 'content' variable
content = soup.select('.entry-content')[4].find_all('p')

In [71]:
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each 'p' (paragraph) element in the 'content' list
for p in content:
    # Extract the text content from the 'p' element and append it to the 'clean_content' list
    clean_content.append(p.get_text())
content = ''.join(clean_content)

In [72]:
content

'5th October 2021: With festive season being just around the corner, a recent consumer poll by Public app with over 4 lakh respondents has revealed interesting social and economical consumer insights this year which has also been compared to a similar poll that was conducted in 2020. The poll highlights that while people are considerably…More '

In [73]:
title

'86% Indians feel voting should be made compulsory: Public App Survey￼'

## Entertainment

In [74]:
# Get the fourth URL from the 'new_links' list and assign it to the variable 'url'
url = new_links[0]  # Change the index to the correct one
# Send an HTTP GET request to the specified URL with the defined User-Agent header
response = requests.get(url, headers=headers)
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Find the HTML element with the title and extract the text
title = soup.find('h2').get_text()
content_elements = soup.select('.entry-content')  # Update this line to match the HTML structure
# Initialize an empty list to store cleaned content
clean_content = []

# Loop through each content element
for element in content_elements:
    # Extract the text content from the element and append it to the 'clean_content' list
    clean_content.append(element.get_text())
content = ''.join(clean_content)

In [75]:
title

'86% Indians feel voting should be made compulsory: Public App Survey￼'

In [76]:
content

'\nOn the occasion of the 12th National Voters’ Day, Public App, India’s largest location-based social network, conducted a survey to understand how seriously Indians take their voting rights and on what factors they evaluate for which candidate to choose. This pan-India poll was conducted with a sizable data pool of over 4 lakh people. As…More \n\nAs 2021 draws to a close end, users of location-based social network Public weighed in their views on the age-old tradition of making New Year’s resolutions. A New Year’s resolution is a common practice in which a person resolves to continue good habits, change an undesired trait or behavior, accomplish a personal goal, or otherwise…More \n\nWhile addressing the nation on November 19, Prime Minister Narendra Modi announced that the three contentious farm laws -The Farmers’ Produce Trade and Commerce (Promotion and Facilitation) Bill, 2020, The Farmers (Empowerment and Protection) Agreement of Price Assurance and Farm Services Bill, 2020 & Th

### Technology

In [77]:
def get_articles_data(new_links, headers):
    article_list = []
    
    for url in new_links:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.find('h2')
        if title and title.get_text() != "Example Domain":
            title_text = title.get_text()
        else:
            title_text = "Title Not Found"

        # Extract the content
        content = soup.select('.entry-content')
        if content:
            content_text = ''.join([p.get_text() for p in content[0].find_all('p')])
        else:
            content_text = "Content Not Found"

        article_data = {
            'title': title_text,
            'content': content_text
        }

        article_list.append(article_data)
    
    return article_list

In [78]:
def get_news_article(new_links):
    # Get the fourth URL from the 'new_links' list and assign it to the variable 'url'
    url = new_links[0]  # Change the index to the correct one
    # Send an HTTP GET request to the specified URL with the defined User-Agent header
    response = requests.get(url, headers=headers)
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the HTML element with the title and extract the text
    title = soup.find('h2').get_text()  # Update this line to match the HTML structure
    #<header class="entry-header default-max-width">
    		#<h2 class="entry-title"><a href="https://blog.inshorts.com/2022/01/27/around-86-indians-feel-voting-should-be-made-compulsory-public-poll/" rel="bookmark">86% Indians feel voting should be made compulsory: Public App Survey￼</a></h2>	</header>
    # Find the HTML element(s) containing the article content and extract the text
    content_elements = soup.select('.entry-content')  # Update this line to match the HTML structure
    # Initialize an empty list to store cleaned content
    clean_content = []
    
    # Loop through each content element
    for element in content_elements:
        # Extract the text content from the element and append it to the 'clean_content' list
        clean_content.append(element.get_text())
    content = ''.join(clean_content)
    news_df = {'title': title,
               'content': content
        
    }
    return news_df

In [79]:
title

'86% Indians feel voting should be made compulsory: Public App Survey￼'

In [80]:
content

'\nOn the occasion of the 12th National Voters’ Day, Public App, India’s largest location-based social network, conducted a survey to understand how seriously Indians take their voting rights and on what factors they evaluate for which candidate to choose. This pan-India poll was conducted with a sizable data pool of over 4 lakh people. As…More \n\nAs 2021 draws to a close end, users of location-based social network Public weighed in their views on the age-old tradition of making New Year’s resolutions. A New Year’s resolution is a common practice in which a person resolves to continue good habits, change an undesired trait or behavior, accomplish a personal goal, or otherwise…More \n\nWhile addressing the nation on November 19, Prime Minister Narendra Modi announced that the three contentious farm laws -The Farmers’ Produce Trade and Commerce (Promotion and Facilitation) Bill, 2020, The Farmers (Empowerment and Protection) Agreement of Price Assurance and Farm Services Bill, 2020 & Th

In [81]:
def get_new_links(url, headers, sc1, sc2, sc3):
    # Send an HTTP GET request to the specified URL with the defined User-Agent header
    response = requests.get(url, headers=headers)
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all the HTML elements with the 'h2' tag in the parsed HTML content
    links = soup.find_all(sc1)
    # Initialize an empty list to store the extracted links
    new_links = []

    # Loop through each 'article' element in the 'links' list
    for article in links:
        # Check if the 'article' contains an 'a' (anchor) element
        if article.find(sc2):
            # Print the 'href' attribute of the 'a' element (the link)
            print(article.find(sc2).get(sc3))
            # Append the link to the 'new_links' list
            new_links.append(article.find(sc2).get(sc3))

    return new_links

In [82]:
# Define the URL to the webpage you want to scrape
url = 'https://inshorts.com/'
# Define the User-Agent header to be used in the HTTP request
headers = {'User-Agent': 'Codeup Data Science'}

In [83]:
links = get_new_links(url, headers, 'div', 'a', 'href')

https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
/en/read
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
/tnc
/tnc
https://facebook.com/inshortsapp


In [84]:
get_news_article(links)

{'title': '86% Indians feel voting should be made compulsory: Public App Survey￼',
 'content': '\nOn the occasion of the 12th National Voters’ Day, Public App, India’s largest location-based social network, conducted a survey to understand how seriously Indians take their voting rights and on what factors they evaluate for which candidate to choose. This pan-India poll was conducted with a sizable data pool of over 4 lakh people. As…More \n\nAs 2021 draws to a close end, users of location-based social network Public weighed in their views on the age-old tradition of making New Year’s resolutions. A New Year’s resolution is a common practice in which a person resolves to continue good habits, change an undesired trait or behavior, accomplish a personal goal, or otherwise…More \n\nWhile addressing the nation on November 19, Prime Minister Narendra Modi announced that the three contentious farm laws -The Farmers’ Produce Trade and Commerce (Promotion and Facilitation) Bill, 2020, The Farm