<a href="https://colab.research.google.com/github/Sylvesterchuks/PagesNotFound/blob/main/Signal_blogPost_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A Scraping Project

## In this project we are going to scrape a popular blog site for information about the posts on the blog.
  * Create a dataset of blog posts from a popular blog, e.g. <https://m.signalvnoise.com/search/>.
  * The dataset can contain information like the blog title, published date, tags, author, link to blog post, etc.

In [1]:
# import the needed libraries

import requests
from bs4 import BeautifulSoup
import time
import datetime 

### Retrieve information from the web

In [2]:
# Archive index page; the later cells read the per-month links from it.
urls = 'https://m.signalvnoise.com/search/'

# requests applies no timeout unless one is given, so without this argument an
# unresponsive server would leave the cell hanging indefinitely.
resp = requests.get(urls, timeout=30)
# Fail fast on a 4xx/5xx status instead of parsing an error page.
resp.raise_for_status()

### Using BeautifulSoup to parse the HTML

In [3]:
# Parse the raw response bytes into a searchable tree using the lxml parser.
soups = BeautifulSoup(resp.content, features='lxml')

In [4]:
# Locate the <ul class="archives"> element that holds the archive entries.
blog_list = soups.find('ul', attrs={'class': 'archives'})

In [5]:
# One URL per archive entry: the href of the first <a> inside each <li>.
blog_list_links = [
    entry.find('a')['href']
    for entry in blog_list.find_all('li')
]

In [6]:
# Each archive <li> text is expected to end with a count in parentheses,
# e.g. "(26)". Take everything after the last "(" instead of a fixed
# three-character slice, so counts with three or more digits (e.g. "(123)")
# are parsed in full rather than truncated to their last two digits.
# NOTE(review): despite the name, these values look like posts per archive
# entry (they are later divided by 10 to get a page count) - confirm.
blog_page_num = [
    int(entry.text.rsplit('(', 1)[-1].strip().rstrip(')'))
    for entry in blog_list.find_all('li')
]

In [7]:
# Display every parsed count (one integer per archive entry) to sanity-check the extraction.
blog_page_num

[1,
 1,
 3,
 5,
 4,
 2,
 6,
 6,
 1,
 4,
 7,
 2,
 8,
 8,
 10,
 9,
 9,
 9,
 17,
 11,
 9,
 11,
 8,
 9,
 26,
 9,
 10,
 9,
 8,
 7,
 8,
 8,
 5,
 10,
 10,
 13,
 25,
 20,
 20,
 17,
 20,
 21,
 24,
 19,
 35,
 16,
 28,
 29,
 25,
 22,
 19,
 15,
 28,
 25,
 21,
 27,
 30,
 16,
 28,
 16,
 22,
 20,
 9,
 7,
 18,
 1,
 1]

In [8]:
from math import ceil

In [9]:
# Preview the first ten archive URLs.
blog_list_links[:10]

['https://m.signalvnoise.com/2021/02/',
 'https://m.signalvnoise.com/2021/01/',
 'https://m.signalvnoise.com/2020/12/',
 'https://m.signalvnoise.com/2020/10/',
 'https://m.signalvnoise.com/2020/09/',
 'https://m.signalvnoise.com/2020/08/',
 'https://m.signalvnoise.com/2020/07/',
 'https://m.signalvnoise.com/2020/06/',
 'https://m.signalvnoise.com/2020/05/',
 'https://m.signalvnoise.com/2020/04/']

In [10]:
# Preview the first five parsed counts.
blog_page_num[:5]

[1, 1, 3, 5, 4]

In [11]:
import pandas as pd

### Combining the steps above and using a for loop to iterate through each archive link from the blog website

In [12]:
# Scrape every archive page and collect one entry per post into five parallel
# lists (title, author, date_posted, summary, post_link).

POSTS_PER_PAGE = 10   # assumed page size: a month with N posts spans ceil(N / 10) pages
REQUEST_TIMEOUT = 30  # seconds; requests never times out unless told to

# BeautifulSoup compares a multi-word class_ string against the exact attribute
# value, so the order of the class names below matters.
ARTICLE_CLASS = 'entry-summary grid__item grid__item--third'
DATE_CLASS = 'entry-date published updated'


def _text_or(node, default):
  """Return ``node.text``, or ``default`` when the element was not found."""
  return node.text if node is not None else default


def _parse_post(article):
  """Extract one post's fields from an <article> summary tag.

  Returns a ``(title, author, date, summary, link)`` tuple of strings. Any
  element that is missing yields a "No ..." placeholder instead of raising,
  so a single malformed post cannot abort the scrape or leave the five
  result lists with different lengths.
  """
  heading = article.find('h2')
  meta = article.find('div')
  byline = meta.find('span', class_='byline') if meta is not None else None
  stamp = meta.find('time', class_=DATE_CLASS) if meta is not None else None
  blurb = article.find('p')
  anchor = heading.a if heading is not None else None

  # The summary is the blurb text up to (not including) the first period.
  first_sentence = blurb.text.split('.')[0] if blurb is not None else 'No description'
  has_link = anchor is not None and anchor.has_attr('href')
  return (
    _text_or(heading, 'No title'),
    _text_or(byline, 'No author'),
    _text_or(stamp, 'No date'),
    first_sentence,
    anchor['href'] if has_link else 'No link',
  )


def _archive_page_urls(archive_url, post_count):
  """Return the list of page URLs to fetch for one archive entry.

  Entries with fewer than POSTS_PER_PAGE posts are fetched from the archive
  URL itself; larger ones are fetched from ``page/1/`` .. ``page/k/``.
  """
  if post_count < POSTS_PER_PAGE:
    return [archive_url]
  page_total = ceil(post_count / POSTS_PER_PAGE)
  return [f'{archive_url}page/{page}/' for page in range(1, page_total + 1)]


title = []
author = []
date_posted = []
summary = []
post_link = []

for post_count, archive_url in zip(blog_page_num, blog_list_links):
  for page_url in _archive_page_urls(archive_url, post_count):
    page_resp = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    page_resp.raise_for_status()

    # Use a dedicated name so the index-page soup from the earlier cell is
    # not overwritten; re-running that cell's dependants stays correct.
    page_soup = BeautifulSoup(page_resp.content, 'lxml')
    for article in page_soup.find_all('article', class_=ARTICLE_CLASS):
      post_title, post_author, post_date, post_summary, post_url = _parse_post(article)
      title.append(post_title)
      author.append(post_author)
      date_posted.append(post_date)
      summary.append(post_summary)
      post_link.append(post_url)

    

### Assign the lists to a dictionary

In [13]:
# Pair each output column heading with its list of scraped values, in column order.
post_dict = dict(zip(
    ('Date Posted', 'Title', 'Author', 'Description', 'Post link'),
    (date_posted, title, author, summary, post_link),
))

### Export the data to CSV using pandas

In [14]:
# Build a table from the column dictionary, then write it to disk as UTF-8
# without the DataFrame's row index.
signal_noise = pd.DataFrame(data=post_dict)
signal_noise.to_csv(
    'Blog_posts.csv',
    index=False,
    encoding='utf-8',
)
