# Import and config

In [563]:
import re
import statistics
import string
import time
import webbrowser
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
import textstat
from bs4 import BeautifulSoup
from IPython.display import HTML, Image
from jinja2 import Template
from newspaper import Article
from newspaper.article import ArticleException
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
SIA = SentimentIntensityAnalyzer()
nlp = spacy.load('fr_core_news_sm')

Our main browser is Brave so here is its config.

In [357]:
# Hard-coded location of the Brave executable (a Windows path); adjust it
# when running the notebook on another machine.
BRAVE_BINARY = r'C:/Program Files/BraveSoftware/Brave-Browser/Application/brave.exe'

# Command-line switches handed to the browser. Every entry carries the
# explicit "--" prefix: a token without a switch prefix is read by Chromium
# as a positional argument instead of a switch.
BROWSER_ARGUMENTS = [
    '--tor',
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--enable-automation',
    '--window-size=1920,1080',
    '--disable-extensions',
    '--dns-prefetch-disable',
    '--disable-gpu',
]

options = Options()
options.binary_location = BRAVE_BINARY
for argument in BROWSER_ARGUMENTS:
    options.add_argument(argument)

# Download (or reuse) the chromedriver build matching Brave, then start the session.
driver_path = ChromeDriverManager(chrome_type=ChromeType.BRAVE).install()
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(options=options, service=service)

# Twitter trends

In [358]:
# Load the Twitter trends page; the trailing expression displays the URL
# the browser actually ended up on.
driver.get('https://twitter.com/i/trends')
driver.current_url

'https://twitter.com/i/trends'

In [359]:
# Collect the elements matching this exact combination of CSS classes and
# keep the text of the first ten as the list of trends.
trend_elements = driver.find_elements(By.CLASS_NAME, 'css-901oao.r-18jsvk2.r-1qd0xha.r-a023e6.r-b88u0q.r-rjixqe.r-1bymd8e.r-bcqeeo.r-qvutc0')
trends = [element.text for element in trend_elements[:10]]
trends

['Snapchat',
 '#greve19janvier',
 'Hobi',
 'Bein 2',
 '#PSGRiyadhSeasonTeam',
 'Murray',
 'Mister V',
 'Stranger Things',
 'Espagne',
 'Caesar']

# Google News

In [360]:
# Load the French edition of Google News; the trailing expression shows
# where the browser landed (a fresh session may be redirected to a
# consent.google.com page first).
driver.get('https://news.google.com/home?hl=fr&gl=FR&ceid=FR:fr')
driver.current_url

'https://consent.google.com/m?continue=https://news.google.com/home?hl%3Dfr%26gl%3DFR%26ceid%3DFR:fr&gl=FR&m=0&pc=n&hl=fr&src=1'

The first time we land on Google News, we need to accept the cookies.

In [361]:
# Class list carried by the buttons looked up on the consent page.
CONSENT_BUTTON_CLASSES = 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 Nc7WLe'

consent_buttons = driver.find_elements(By.CLASS_NAME, CONSENT_BUTTON_CLASSES.replace(' ', '.'))
# The consent page is only shown on the first visit. When it is absent no
# button matches, so skip the submission instead of failing with an
# IndexError (e.g. when this cell is re-run in the same session).
if len(consent_buttons) > 1:
    agree_box = consent_buttons[1]  # the second matching button is the one submitted to accept
    agree_box.submit()
# Session-wide timeout for element lookups (this call does not pause execution).
driver.implicitly_wait(5)
driver.current_url

'https://news.google.com/home?hl=fr&gl=FR&ceid=FR:fr'

Now we are able to retrieve, for each trend, the title of every article, its newspaper and its link.

Saving the result pages for all the trends takes about 30 to 40 seconds in total.

In [362]:
GOOGLE_NEWS_HOME = 'https://news.google.com/home?hl=fr&gl=FR&ceid=FR:fr'
# Exact class attribute of the <article> tags kept from a result page.
ARTICLE_CLASSES = 'MQsxIb xTewfe R7GTQ keNKEd j7vNaf Cc0Z5d EjqUne'

start = time.time()
# implicitly_wait configures a session-wide element-lookup timeout; it is
# not a pause, so it is set once here rather than on every iteration.
driver.implicitly_wait(5)
pages_html = []
for t in trends:
    # Start from the home page and type the trend into the search box.
    driver.get(GOOGLE_NEWS_HOME)
    search_box = driver.find_element(By.CLASS_NAME, 'Ax4B8.ZAGvjd')
    search_box.send_keys(t)
    search_box.send_keys(Keys.RETURN)
    print(driver.current_url)
    time.sleep(1)  # leave the result page a moment to render before reading it
    page_source = driver.page_source
    html = BeautifulSoup(page_source, 'html.parser')
    # Keep at most the first ten articles of the result page.
    articles = html.find_all('article', {'class': ARTICLE_CLASSES}, limit=10)
    pages_html.append(articles)
end = time.time()
computation_time = end - start
# divmod + int() so the message reads "0 minutes and 37.4 seconds"
# instead of "0.0 minutes and 37.36444449424744 seconds".
minutes, seconds = divmod(computation_time, 60)
print(f'{int(minutes)} minutes and {seconds:.1f} seconds')

https://news.google.com/search?q=Snapchat&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=%23greve19janvier&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Hobi&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Bein%202&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=%23PSGRiyadhSeasonTeam&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Murray&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Mister%20V&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Stranger%20Things&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Espagne&hl=fr&gl=FR&ceid=FR%3Afr
https://news.google.com/search?q=Caesar&hl=fr&gl=FR&ceid=FR%3Afr
0.0 minutes and 37.36444449424744 seconds


In [363]:
len(pages_html)

10

## Articles ranking

In [70]:
Image(url='https://readable.com/wp-content/uploads/2019/04/lix2.png')

In [73]:
Image(url='https://readable.com/wp-content/uploads/2019/04/rix2.png')

In [71]:
Image(url='https://readable.com/wp-content/uploads/2017/01/FLESCHREADINGEASE.png')

In [69]:
Image(url='https://readable.com/wp-content/uploads/2017/01/fleschkincaid.png')

In [74]:
Image(url='https://readable.com/wp-content/uploads/2017/03/gunning3.png')

In [364]:
# Threshold table used to rank a text: one row per rank (1 to 12) and one
# column per readability index. FLESCH_READING_EASE is the only column
# whose values decrease as the rank grows.
grade_levels = range(6, 18)  # 6..17, shared by the two grade-level indices
features_dict = {
    'LIX': [10, 15, 20, 24, 28, 32, 36, 40, 44, 48, 52, 55],
    'RIX': [0.2, 0.5, 0.8, 1.3, 1.8, 2.4, 3.0, 3.7, 4.5, 5.3, 6.2, 7.2],
    'FLESCH_READING_EASE': [90, 80, 70, 60, 60, 50, 50, 30, 30, 30, 30, 0],
    'FLESCH_KINCAID_GRADE_LEVEL': grade_levels,
    'GUNNING_FOG': grade_levels,
}
read_scales = pd.DataFrame(features_dict, index=pd.RangeIndex(start=1, stop=13))
read_scales

Unnamed: 0,LIX,RIX,FLESCH_READING_EASE,FLESCH_KINCAID_GRADE_LEVEL,GUNNING_FOG
1,10,0.2,90,6,6
2,15,0.5,80,7,7
3,20,0.8,70,8,8
4,24,1.3,60,9,9
5,28,1.8,60,10,10
6,32,2.4,50,11,11
7,36,3.0,50,12,12
8,40,3.7,30,13,13
9,44,4.5,30,14,14
10,48,5.3,30,15,15


In [365]:
def _scale_rank(value, scale):
    """Return the rank (row label of ``scale``) that ``value`` falls into.

    ``scale`` is one column of ``read_scales``. The rank is the first row
    whose threshold equals the largest threshold that does not exceed
    ``value``. A value below every threshold is given the row holding the
    smallest threshold: rank 1 on the increasing scales, rank 12 on the
    decreasing FLESCH_READING_EASE scale.
    """
    reached = scale[scale <= value]
    if reached.empty:
        return scale.idxmin()
    # A value equal to or above the largest threshold lands here as well,
    # so it resolves to that threshold's row instead of an empty selection.
    return scale[scale == reached.max()].index[0]


def Readability(text):
    """Return the mean readability rank of ``text`` over five indices.

    The text is scored with textstat (LIX, RIX, Flesch Reading Ease,
    Flesch-Kincaid grade and Gunning Fog); each score is converted to a
    rank between 1 and 12 using the matching column of ``read_scales``,
    and the five ranks are averaged.

    Parameters:
        text: the article text to evaluate.

    Returns:
        float: the arithmetic mean of the five ranks.
    """
    # NOTE(review): textstat scores with its default language unless
    # textstat.set_lang(...) is called, and no such call is visible in this
    # notebook -- confirm whether 'fr' should be set for French articles.
    measured = {
        'LIX': textstat.lix(text),
        'RIX': textstat.rix(text),
        'FLESCH_READING_EASE': textstat.flesch_reading_ease(text),
        'FLESCH_KINCAID_GRADE_LEVEL': textstat.flesch_kincaid_grade(text),
        'GUNNING_FOG': textstat.gunning_fog(text),
    }
    ranks = [_scale_rank(value, read_scales[column]) for column, value in measured.items()]
    return statistics.fmean(ranks)

BE CAREFUL: the next cell takes about 8 MINUTES to run, which is why we provide an HTML file as an example.

In [366]:
# Score given to an article that could not be downloaded or parsed; it is
# large enough to lose any comparison against a real rank (1 to 12).
FAILED_SCORE = 999

start = time.time()
# Session-wide element-lookup timeout: a setting, not a pause, so it is
# applied once instead of once per article.
driver.implicitly_wait(10)
prefix = "https://news.google.com"
links = []
scores = []
for p, page in enumerate(pages_html):
    l = []
    s = []
    # Denominators follow the real sizes: a page can hold fewer than ten articles.
    print(f'tendance {p+1}/{len(pages_html)}')
    for i, a in enumerate(page):
        try:
            # The href is relative; drop its first character and prepend the site root.
            link = prefix + a.find('a', {'class':'VDXfz'}).get('href')[1:]
            l.append(link)
            # Open the Google News link in the browser, then read the URL it landed on.
            driver.get(link)
            time.sleep(1)
            url = driver.current_url
            article = Article(url)
            article.download()
            article.parse()
            # Only the parsed text is needed to score readability, so the
            # article.nlp() pass is not run here.
            text = article.text
            score = Readability(text)
            s.append(score)
        except ArticleException:
            # Keep links and scores aligned: the link was recorded above.
            s.append(FAILED_SCORE)
            continue
        print(f'   article {i+1}/{len(page)}')
    links.append(l)
    scores.append(s)
end = time.time()
computation_time = end - start
minutes, seconds = divmod(computation_time, 60)
print(f'{int(minutes)} minutes and {seconds:.1f} seconds')

tendance 1/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
   article 6/10
   article 7/10
   article 8/10
   article 9/10
   article 10/10
tendance 2/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
   article 6/10
   article 7/10
   article 8/10
   article 9/10
   article 10/10
tendance 3/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
   article 6/10
   article 7/10
   article 8/10
   article 9/10
   article 10/10
tendance 4/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
   article 6/10
   article 8/10
   article 9/10
   article 10/10
tendance 5/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
tendance 6/10
   article 1/10
   article 2/10
   article 3/10
   article 4/10
   article 5/10
   article 6/10
   article 7/10
   article 8/10
   article 9/10
   article 10/10
tendance 7/10
   article 1/10
   article 2/10
 

In [367]:
scores

[[10.4, 7.0, 6.0, 9.0, 5.8, 6.0, 4.6, 5.0, 11.2, 9.0],
 [6.2, 4.0, 8.4, 9.0, 7.8, 8.8, 7.8, 5.2, 6.0, 7.2],
 [3.4, 4.4, 7.2, 6.8, 7.6, 10.0, 9.8, 9.6, 8.8, 9.2],
 [6.4, 12.0, 6.6, 3.8, 11.2, 6.8, 999, 3.2, 6.4, 9.8],
 [12.0, 3.2, 3.2, 5.8, 5.0],
 [3.4, 5.8, 5.8, 4.8, 5.2, 7.8, 5.8, 6.0, 9.4, 5.6],
 [5.8, 4.4, 999, 8.4, 4.8, 7.2, 6.8, 7.6, 4.8, 5.2],
 [6.8, 10.0, 7.0, 2.6, 4.0, 7.8, 6.4, 2.6, 4.0, 8.2],
 [7.6, 10.6, 8.2, 8.8, 9.2, 6.6, 9.4, 9.0, 9.6, 9.4],
 [7.8, 8.6, 7.6, 6.2, 4.6, 6.2, 8.2, 11.0, 7.0, 8.2]]

In [368]:
# For every trend, position of the article with the lowest score.
best_articles_index = [np.argmin(trend_scores) for trend_scores in scores]
best_articles_index

[6, 1, 0, 7, 1, 0, 1, 3, 5, 4]

# Scraping article information

In [418]:
# Session-wide element-lookup timeout (a setting, not a pause): set it once.
driver.implicitly_wait(10)
best_articles = []
# enumerate gives the true loop position; pages_html.index(page) searches by
# equality and returns the first match, which is wrong for two equal pages.
for position, (page, link, best) in enumerate(zip(pages_html, links, best_articles_index)):
    print(position)
    # Open the selected link in the browser and read the URL it landed on.
    driver.get(link[best])
    time.sleep(1)
    url = driver.current_url
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()  # needed here: fills a.summary
    title = a.title
    # The newspaper name is read from the alt text of the logo in the
    # Google News card; fall back to an empty name when the logo is missing.
    source_block = page[best].find('div', {'class': 'wsLqz RD0gLb'})
    source_logo = source_block.find('img') if source_block is not None else None
    newspaper = source_logo.get('alt', '') if source_logo is not None else ''
    summary = a.summary
    pic = a.top_image.replace(' ', '%20')  # encode spaces so the image URL stays usable
    article = {
        'title': title,
        'newspaper': newspaper,
        'summary': summary,
        'url': url,
        'picture': pic,
    }
    best_articles.append(article)

0
1
2
3
4
5
6
7
8
9


In [429]:
# Preview the first selected article: one "key: value" line per field.
for k, v in best_articles[0].items():
    print(f'{k}: {v}')

title: "Elle pleure comme si elle avait perdu un proche" : Poupette Kenza effondrée après avoir perdu son compte Snapchat, les internautes se moquent d'elle
newspaper: Purebreak
summary: Poupette Kenza est l'une des influenceuses avec la plus grande fan base.
"Après une nuit blanche et une journée à vomir, on va mettre à l'honneur mes haters une dernière fois aujourd'hui avant que je m'en aille (...).
Je veux une vie de paix et tranquille, sachez aujourd'hui que vous m'avez vaincue et que je mets fin à 'poupette'", avait-elle affirmé.
"Je ne sais pas ce qui s'est passé mes poupettes, mais là, c'est fini.
On m'a dit que je ne pourrai plus faire de compte.
url: https://www.purebreak.com/news/-elle-pleure-comme-si-elle-avait-perdu-un-proche-poupette-kenza-effondree-apres-avoir-perdu-son-compte-snapchat-les-internautes-se-moquent-d-elle/241524
picture: https://static1.purebreak.com/articles/4/24/15/24/@/805854-poupette-temoigne-sur-son-harcelement-da-opengraph_1200-4.jpg


In [558]:
def getScore(text):
    """Return the 'compound' entry of the sentiment analyser's scores for ``text``."""
    polarity = SIA.polarity_scores(text)
    return polarity.get('compound')

In [559]:
def getOpinion(score):
    """Translate a sentiment score into a French opinion label.

    Scores strictly above 0.25 are favourable (strongly so above 0.5),
    scores strictly below -0.25 are critical (strongly so below -0.5),
    and everything in between is neutral.
    """
    if score > 0.250:
        return "Opinion : Favorable" if score > 0.5 else "Opinion : Plutôt favorable"
    if score < -0.250:
        return "Opinion : Critique" if score < -0.5 else "Opinion : Plutôt Critique"
    return "Opinion : Neutre"

In [560]:
def getSentiment(text):
    """Return the opinion label for ``text``.

    French stop words are dropped, each remaining word is stemmed, and the
    result is scored with ``getScore`` and labelled with ``getOpinion``.

    Parameters:
        text: the text to analyse; it is converted with ``str()`` first.

    Returns:
        str: one of the labels produced by ``getOpinion``.
    """
    stop_words = set(stopwords.words('french'))
    stemmer = FrenchStemmer()
    # Compare in lower case so capitalised stop words ("Le", "Les") are dropped too.
    kept_words = [word for word in str(text).split() if word.lower() not in stop_words]
    # Stem word by word: splitting on "," would hand whole clauses to the stemmer.
    stemmed = " ".join(stemmer.stem(word) for word in kept_words)
    return getOpinion(getScore(stemmed))

# HTML structure

In [564]:
# One card per trend: picture on the side, then the trend, the opinion
# label, the linked title and the summary.
# autoescape=True because titles, summaries and URLs come from scraped
# pages: characters such as <, & or " must not break the markup or the
# attribute values.
template = Template("""
<div style="display: flex; margin-bottom: 20px;">
    <img src="{{ item.picture }}"
    style="float: right; margin-right: 20px; max-width: 200px; max-height: 200px;"
    alt="{{ item.newspaper}}">
    <div style="flex-direction: column;">
        <h3>{{ trend }}</h3>
        <p>{{ sentiment }}</p>
        <a href="{{ item.url }}">
            <h4>{{ item.title }}</h4>
        </a>
        <p>{{ item.summary }}</p>
    </div>
</div>
""", autoescape=True)

# Render one card per (article, trend) pair and concatenate them once.
rendered_cards = []
for item, trend in zip(best_articles, trends):
    rendered_cards.append(
        template.render(trend=trend, item=item, sentiment=getSentiment(item['summary']))
    )
html_content = "".join(rendered_cards)

In [565]:
# Full page wrapped around the rendered cards. The doctype keeps browsers
# in standards mode and the charset is declared before any text content.
# html_content is already HTML, so it is marked safe; the other variables
# are escaped by autoescape (e.g. the "&" of the byline).
page = """<!DOCTYPE html>
<html lang="fr">
    <head>
        <meta charset="utf-8">
        <title>{{ title }}</title>
        <style>
            .center {
                text-align: center;
                margin-bottom: 50px;
            }
        </style>
    </head>
    <body>
        <div class="center">
            <h1>{{ title }}</h1>
            <h2>{{ byline }}</h2>
            <h3>{{ date }}</h3>
        </div>
        {{ html_content | safe }}
    </body>
</html>
"""
template = Template(page, autoescape=True)
now = datetime.now()
html = template.render(title='Mon Grand Quotidien', byline='by Théo & Benjamin', html_content=html_content, date=now.strftime('%Y-%m-%d'))

In [566]:
HTML(html)

# Render

The final output is an HTML file; run the next cell to save it and open it directly in your browser.

In [567]:
# Write the page next to the notebook, named after the rendering date.
filename = "MonGrandQuotidien_" + now.strftime('%Y-%m-%d') + ".html"
with open(filename, 'w', encoding='utf-8') as file:
    file.write(html)
# webbrowser.open expects a URL: passing a bare relative filename is not
# portable, so hand it the absolute file:// URI of the file just written.
webbrowser.open(Path(filename).resolve().as_uri())

True

In [569]:
#!pip freeze > requirements.txt