# Web Scraping: The Mirror

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

### Obtain list of news from the coverpage

URL definition:

In [None]:
# url definition: front page of The Mirror, from which the list of stories is scraped
url = "https://www.mirror.co.uk"

List of news:

In [None]:
# Request the cover page. requests applies no timeout by default, so an
# explicit one keeps this cell from hanging forever on a stalled connection.
r1 = requests.get(url, timeout=30)
print(r1.status_code)

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification: every story teaser is an <article> element whose class
# attribute is 'story story--news'
coverpage_news = soup1.find_all('article', class_='story story--news')
# Last expression of the cell: number of teasers found
len(coverpage_news)

200


262

Now we have a list in which every element is a news article:

In [None]:
# Inspect the raw markup of one teaser (index 1) to see which tags hold the link and the headline
coverpage_news[1]

<article class="story story--news" data-priority="B" data-section="news-world-news" data-style="news"><a href="https://www.mirror.co.uk/news/world-news/boy-6-who-survived-italy-24767107"><amp-img alt="Orphaned boy who was only survivor of cable car tragedy centre of bitter custody battle" class="image image--wide cover " content="https://i2-prod.mirror.co.uk/incoming/article24767222.ece/ALTERNATES/n615/2_PAY-Italy-cable-car-crash-Boy-2-killed-in-horror-crash-alongside-parents-pictured-as-brother-5-figh.jpg" height="90" layout="responsive" src="https://i2-prod.mirror.co.uk/incoming/article24767222.ece/ALTERNATES/n615/2_PAY-Italy-cable-car-crash-Boy-2-killed-in-horror-crash-alongside-parents-pictured-as-brother-5-figh.jpg" srcset="https://i2-prod.mirror.co.uk/incoming/article24767222.ece/ALTERNATES/r250/2_PAY-Italy-cable-car-crash-Boy-2-killed-in-horror-crash-alongside-parents-pictured-as-brother-5-figh.jpg 250w, https://i2-prod.mirror.co.uk/incoming/article24767222.ece/ALTERNATES/r500/2

In [None]:
# Try the extraction steps on a single teaser before looping over all of them
n=1
# Browser-like User-Agent sent with every article request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# The first <a> of the teaser carries the article URL, its <h2> the headline
link = coverpage_news[n].find('a')['href']
title = coverpage_news[n].find('h2').get_text()
# requests applies no timeout by default; set one so a stalled connection
# cannot block the notebook indefinitely
article = requests.get(link, headers=headers, timeout=30)
print(article)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')


<Response [200]>


In [None]:
# Headline text taken from the teaser's <h2>
title

'Boy, 6, who survived Italy cable car crash that killed parents at centre of custody battle'

In [None]:
# Collect every <p> element of the downloaded article page
body = soup_article.find_all('p')

In [None]:
# Show the raw <p> elements (markup included)
body

[<p class="lead-content__sub-title" itemprop="description">Eitan Biran was the only survivor of the crash in the Alps earlier this year. His younger brother, mother, father and great-grandparents were five of the 13 that died</p>,
 <p>A six-year-old boy who was orphaned in a terrifying cable car tragedy is now at the centre of a bitter custody battle.</p>,
 <p>Eitan Biran was the only survivor in the Alps in May 23, after the cable car they were travelling up a mountain in dropped 1,000ft away from the station.</p>,
 <p>The crash killed his mother Tal, 26, father Amit, 30, two-year-old brother Tom and great-grandparents Itshak and Barbara Cohen, 82 and 70.</p>,
 <p>The family were looking over Lake Maggiore when they fell. Eight others, not related to Eitan, died after the tragedy.</p>,
 <p>Matteo Gasparini, provincial head of Italy's Alpine rescue service, said at the time there appeared to have been two major problems - the breakage of the cable and the failure of the emergency brake

In [None]:
# Number of <p> elements found in the article page
len(body)

22

In [None]:
# Same query as `body`; `x` is the name the scraping loop further down reuses for each article's paragraphs
x = soup_article.find_all('p')

In [None]:
# Number of <p> elements in `x`
len(x)

22

In [None]:
# Plain text of the first paragraph, without the surrounding <p> markup
x[0].get_text()

'Eitan Biran was the only survivor of the crash in the Alps earlier this year. His younger brother, mother, father and great-grandparents were five of the 13 that died'

In [None]:
#list_links

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [None]:
# Number of cover-page stories requested from the scraping loop below (one HTTP request per story)
number_of_articles = 200

In [None]:
# Empty lists for content, links and titles.
# The three lists are appended to together at the end of each iteration, so
# position i in every list always refers to the same article.
news_contents = []
list_links = []
list_titles = []

# Slicing caps the loop at the number of teasers actually present on the cover
# page, so asking for more articles than exist cannot raise IndexError.
for story in coverpage_news[:number_of_articles]:
    # Getting the link and the title; skip teasers that lack either one
    # instead of crashing on a None result
    anchor = story.find('a')
    heading = story.find('h2')
    if anchor is None or heading is None or not anchor.get('href'):
        continue
    link = anchor['href']
    title = heading.get_text()

    # Reading the content (it is divided in paragraphs)
    try:
        # requests applies no timeout by default; without one a single stalled
        # connection would block the whole scrape
        article = requests.get(link, headers=headers, timeout=30)
        article.raise_for_status()
    except requests.RequestException as error:
        # One unreachable article should not throw away everything scraped so far
        print(f"Skipping {link}: {error}")
        continue
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # Unifying the paragraphs. The join is done once per article and also
    # when the page has no <p> at all (giving an empty string), so text from
    # the previous article can never leak into this one.
    final_article = " ".join(p.get_text() for p in soup_article.find_all('p'))

    list_links.append(link)
    list_titles.append(title)
    news_contents.append(final_article)

Let's put them into:
* a dataset which will be the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [None]:
# df_show_info: title, link and full text of every scraped article
show_info_columns = {
    'Article Title': list_titles,
    'Article Link': list_links,
    'Article Content': news_contents,
}
df_show_info = pd.DataFrame(show_info_columns)

# df_features: only the article text, i.e. the input of the models
df_features = pd.DataFrame({'Article Content': show_info_columns['Article Content']})

In [None]:
# Preview the model-input frame (one row per scraped article)
df_features

Unnamed: 0,Article Content
0,Detective Constable David Louden and his three...
1,Eitan Biran was the only survivor of the crash...
2,"Maxine Davison, Lee Martyn, Sophie Martyn, Ste..."
3,Tracy Beaker actress Dani Harmer is expecting ...
4,"Priya Gopaldas, Brett Staniland, Liberty Poole..."
...,...
195,Sun seekers are finally getting hotter tempera...
196,Mel and her gondala operating lover Riccardo S...
197,Three-year-old Sophie Martyn was pushing a toy...
198,The structure was erected on a green near the ...


In [None]:
# Save titles, links and article text before the keyword column is added.
# NOTE(review): 'Datsets.csv' looks like a misspelling of 'Datasets.csv' — confirm nothing reads this file before renaming it.
df_show_info.to_csv('Datsets.csv')
import nltk
# Tokenizer data fetched for nltk.tokenize.word_tokenize, which get_keywords calls
nltk.download('punkt')
# Despite the name, these are the terms get_keywords looks for and keeps, not words to discard
stop_words=['covid','coronavirus']
def get_keywords(row):
    """Return the tracked keywords mentioned in one article.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the article text under the key 'Article Content'.

    Returns
    -------
    str
        Every occurrence of a term from the module-level `stop_words` list,
        in order of appearance and joined by commas. Empty string when the
        article mentions none of them.
    """
    some_text = row['Article Content']
    # Match case-insensitively: the tracked terms are stored in lower case, so
    # the text has to be lower-cased *before* it is tokenized. Otherwise
    # "Covid" or "Coronavirus" at the start of a sentence is never found.
    lowered = some_text.lower()
    tokens = nltk.tokenize.word_tokenize(lowered)
    keywords = [keyword for keyword in tokens if keyword.isalpha() and keyword in stop_words]
    keywords_string = ','.join(keywords)
    return keywords_string


# Run get_keywords on every row (axis=1) and store the result in a new 'Keywords' column
df_show_info['Keywords'] = df_show_info.apply(get_keywords,axis=1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Save the frame again, now including the 'Keywords' column
df_show_info.to_csv('Datasets_with_keywords.csv')