### This notebook scrapes the Hedgeye insights page and collects the top trending articles' information into an in-memory CSV buffer, which is then loaded into a pandas DataFrame

In [1]:
# For dataframes manipulation
import pandas as pd

# For web scraping
import requests
from bs4 import BeautifulSoup

# For CSV writer
from io import StringIO
from csv import writer

1/ Constants declaration

In [2]:
# Base URL of the website; the page suffix and the article links are appended to it
CONST_URL = 'https://app.hedgeye.com'

# Path suffix of the page that lists the trending articles
CONST_homepageSuffix = '/insights'

# HTML class of the <div> wrapping each trending article's link
CONST_TrendingArticlesClass = 'trending-insight'
# Maximum number of trending articles to scrape
CONST_nbArticlesToScrape = 6

# HTML classes of the <div> elements holding each needed piece of information on an article page
CONST_class_articleName = 'headline-link'
CONST_class_authorPhoto = 'headshot'
CONST_class_authorName = 'full-name'
CONST_class_authorTwitter = 'twitter-handle'

# Values of the `itemprop` attribute used to locate elements on an article page
CONST_itemProp_datePublished = 'datePublished'
# NOTE(review): not referenced by any of the cells visible here
CONST_itemProp_contentBody = 'articleBody'

# Column names of the results dataframe, in the same order as the fields written for each article
columns = ['URL', 'ARTICLE_NAME', 'DATE_PUBLISHED', 'AUTHOR_NAME', 'AUTHOR_PHOTO_LINK', 'AUTHOR_TWITTER']

2/ Main

In [3]:
# In-memory text buffer that accumulates the scraped rows in CSV form
output = StringIO()
# CSV writer bound to that buffer; the scraping loop writes one row per article through it
csv_writer = writer(output)

In [4]:
# Fetch the page that lists the trending articles.
# The timeout keeps the notebook from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page as if
# it were the article listing.
res = requests.get(CONST_URL + CONST_homepageSuffix, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

# Get the list of trending articles from the main page
trendingArticles = soup.find_all("div", {"class": CONST_TrendingArticlesClass})

In [5]:
def _find_in(parent, name, attrs=None):
    """Return the first `name` tag matching `attrs` inside `parent`, or None if `parent` is None or nothing matches."""
    if parent is None:
        return None
    return parent.find(name, attrs or {})


def _text_of(tag):
    """Return the text content of `tag`, or None when the tag is missing."""
    return tag.getText() if tag is not None else None


def _attr_of(tag, attr):
    """Return attribute `attr` of `tag`, or None when the tag or the attribute is missing."""
    return tag.get(attr) if tag is not None else None


# Start from an empty buffer so that re-running this cell does not append duplicate rows
output.seek(0)
output.truncate(0)

# Number of rows written so far; the loop stops at 'CONST_nbArticlesToScrape' articles maximum
cptArticles = 0

# Loop through each trending article
for article in trendingArticles:
    # Checked before scraping so that a limit of 0 scrapes nothing
    if cptArticles >= CONST_nbArticlesToScrape:
        break

    # Get the article URL; an entry without a usable link cannot be scraped, so skip it
    articleUrl = _attr_of(article.find('a'), 'href')
    if articleUrl is None:
        continue

    # Query the URL (the timeout avoids hanging forever on an unresponsive server).
    # The HTTP status is deliberately not checked: a failed article page simply yields empty fields.
    resCurrentArticle = requests.get(CONST_URL + articleUrl, timeout=30)
    soupCurrentArticle = BeautifulSoup(resCurrentArticle.content, 'lxml')

    # Retrieve each variable; any element missing from the page results in None
    # rather than an AttributeError on the nested lookup
    articleName = _text_of(_find_in(
        soupCurrentArticle.find("div", {"class": CONST_class_articleName}),
        'h1', {"itemprop": 'name'},
    ))

    datePublished = _attr_of(
        soupCurrentArticle.find("time", {"itemprop": CONST_itemProp_datePublished}),
        'datetime',
    )

    authorPhotoLink = _attr_of(
        _find_in(soupCurrentArticle.find("div", {"class": CONST_class_authorPhoto}), 'img'),
        'src',
    )

    authorName = _text_of(soupCurrentArticle.find("div", {"class": CONST_class_authorName}))

    authorTwitter = _text_of(
        _find_in(soupCurrentArticle.find("div", {"class": CONST_class_authorTwitter}), 'a')
    )

    # Write the elements into a CSV writer flow (same field order as `columns`)
    csv_writer.writerow((articleUrl, articleName, datePublished, authorName, authorPhotoLink, authorTwitter))
    cptArticles += 1

In [6]:
# Rewind the StringIO buffer: after the writes, its position is at the end of the data
output.seek(0)

# Retrieve the results from the CSV buffer and put them into a Pandas dataframe
if output.getvalue():
    df_results = pd.read_csv(output, header=None, names=columns)
else:
    # Nothing was scraped: build the empty frame directly rather than asking
    # read_csv to parse an empty buffer
    df_results = pd.DataFrame(columns=columns)

# Format results: remove newline characters from the article names.
# Non-string values (e.g. NaN for a missing name) are left untouched, so this also works
# when no article name could be retrieved at all.
df_results[columns[1]] = df_results[columns[1]].map(
    lambda name: name.replace('\n', '') if isinstance(name, str) else name
)

df_results.head()

Unnamed: 0,URL,ARTICLE_NAME,DATE_PUBLISHED,AUTHOR_NAME,AUTHOR_PHOTO_LINK,AUTHOR_TWITTER
0,/insights/76052-webcast-replay-one-on-one-with...,WEBCAST REPLAY: One-On-One With Renowned Short...,2019-06-20T08:27:21-04:00,,,
1,/insights/76037-mccullough-i-want-you-to-focus...,McCullough: I Want You To Focus On 'Full Cycle...,2019-06-19T12:07:18-04:00,,,
2,/insights/76020-mccullough-draghi-goes-dovish-...,McCullough: Draghi Goes Dovish → It's Called '...,2019-06-18T12:54:17-04:00,,,
3,/insights/75992-quad-4-signs-are-everywhere,Quad 4 Signs Are Everywhere,2019-06-17T12:34:29-04:00,,,
4,/insights/75963-long-gold-you-re-getting-paid,Long Gold? You're Getting Paid,2019-06-14T11:48:47-04:00,,,
