In [1]:
import csv
import os
import re

import newspaper
import nltk
import pandas as pd
from newspaper import Article
from newspaper import Config
from newspaper import fulltext

In [3]:
def extract_urls_from_file(index_file):
    """Read the index file and return the webpage URLs it lists.

    The file content is treated as a comma-separated list. Single quotes are
    removed, surrounding whitespace is stripped from every entry, and empty
    entries (e.g. produced by a trailing comma or an empty file) are dropped.

    Args:
        index_file (str): Path to the text file listing the webpage URLs.

    Returns:
        list: URLs of Webpages, in the order they appear in the file.
    """
    # Read from the path that was passed in rather than a notebook-level
    # global, and let the context manager close the handle even on error.
    with open(index_file, "r") as webpage_file:
        contents = webpage_file.read()

    entries = (item.strip() for item in contents.replace("'", "").split(","))
    return [entry for entry in entries if entry]

In [53]:
# Local file locations, used when running the notebook in the testing environment.
source_path = "../data/Webpages/"  # not referenced by the cells visible in this notebook
save_path = "../data/Output/"  # folder the output CSV is written to
index_file = '../webpages/webpages_urls.txt'  # comma-separated list of webpage URLs

# EC2 file directories: uncomment to run on the EC2 instance instead.
# NOTE(review): the last setting is named `index_file_path` here, while the local
# setting above is `index_file`; rename it to `index_file` when enabling this block.
# source_path = "/home/ec2-user/s3-drive/Stocks/"
# save_path = "/home/ec2-user/s3-drive/Output/"
# index_file_path = "/home/ec2-user/s3-drive/CompanyNames/top_companies.txt"

In [4]:
# Use the `index_file` setting from the configuration cell; `index_file_path`
# only appears in the commented-out EC2 settings, so it is undefined on a fresh kernel.
web_urls = extract_urls_from_file(index_file)

In [55]:


def get_individual_articles_urls(url_list):
    """Generates a list of individual articles' URLs that will be processed.

    Every webpage in ``url_list`` is crawled with ``newspaper.build`` and each
    discovered article whose URL contains the substring "shell" is collected.

    Args:
        url_list (list): Extracted webpages URLs

    Returns:
        list: URLs of individual publications, grouped by webpage in input
            order. Each discovered article contributes at most one entry per
            webpage.
    """
    HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

    # Browser-like headers and a request timeout for the crawler.
    config = Config()
    config.headers = HEADERS
    config.request_timeout = 10

    article_urls = []
    for webpage in url_list:
        # `memoize_articles` is left at the library default, as in the original
        # call; NOTE(review): confirm whether cached runs should be disabled.
        news_portal = newspaper.build(webpage, config=config)

        # Walk the portal's articles exactly once. The previous extra
        # `range(len(url_list))` loop re-appended every match once per input
        # webpage, so the result contained N-fold duplicates.
        article_urls.extend(
            article.url for article in news_portal.articles if "shell" in article.url
        )

    return article_urls

In [49]:
def process_article(shell_links):
    """Scrape and transform selected URLs linked to Shell Petroleum for use within the
    data processing component of the formed data pipeline.

    Each URL is downloaded, parsed and run through newspaper's NLP step. Only
    articles whose extracted text contains the substring "oil" are kept. A URL
    that fails at any step is reported and skipped so that one bad page does
    not abort the whole run.

    Args:
        shell_links (list[str]): A list of URLs for publications with Shell info

    Returns:
        list[list]: One row per kept article, laid out as
            ``[title, authors, url, text, summary, keywords, publish_date]``.
            ``authors`` and ``keywords`` are comma-joined strings and ``text``
            has HTML tags removed and line breaks replaced by spaces.
    """
    output_data = []
    if not shell_links:
        return output_data

    # `Article.nlp()` relies on NLTK's 'punkt' tokenizer data. Fetch it once
    # through NLTK; the previous `article.download('punkt')` call handed the
    # string 'punkt' to the article as if it were page HTML.
    nltk.download('punkt', quiet=True)

    # Compiled once instead of on every iteration.
    tag_re = re.compile(r'<[^>]+>')

    for url in shell_links:
        try:
            article = Article(url)  # pass the url through the Article object
            article.download()  # download the published article of the url
            article.parse()  # prepare the article for nlp extractions
            article.nlp()  # populate the summary and keywords

            # Extract the main text from the HTML that was already downloaded,
            # rather than issuing a second HTTP request for the same page.
            text = fulltext(article.html)

            # Filter articles that contain 'oil' in the main text
            if 'oil' not in text:
                continue

            # Remove HTML tags and collapse the text onto a single line
            clean_text = tag_re.sub('', text)
            new_text = ' '.join(clean_text.splitlines())

            # Flatten the author and keyword lists into comma-separated strings
            authors = ','.join(map(str, article.authors))
            keywords = ','.join(map(str, article.keywords))

            output_data.append(
                [article.title, authors, url, new_text, article.summary, keywords, article.publish_date]
            )
        except Exception as exc:
            # Best-effort scraping: report the failure instead of hiding it,
            # then carry on with the remaining URLs.
            print(f"Skipping {url}: {exc!r}")

    return output_data

In [51]:
def save_table(dataframe, output_path, file_name, header):
    """Saves an input pandas dataframe as a CSV file according to input parameters.

    Args:
        dataframe (pandas.dataframe): Input dataframe.
        output_path (str): Folder in which the resulting `.csv` file should be
            saved, with or without a trailing path separator.
        file_name (str): The name of the output `.csv` file, without extension.
        header (boolean): Whether to include column headings in the output file.
    """
    print(f"Path = {output_path}, file = {file_name}")

    # os.path.join only inserts a separator when `output_path` does not already
    # end with one, so "out" and "out/" both resolve to the same file instead of
    # "out" silently producing "outname.csv".
    csv_path = os.path.join(output_path, file_name + ".csv")
    dataframe.to_csv(csv_path, index=False, header=header)

In [59]:
def convert_to_dataframe(extracted_contents, save_folder):
    """Converts the list of lists to a dataframe and saves into a location.

    The dataframe is written through ``save_table`` as ``shell_news_data.csv``
    without a header row.

    Args:
        extracted_contents (list[lists[str]]): the processed publications list;
            each inner list holds the seven fields named in ``column_names``
            below. May be empty, in which case an empty table is saved.
        save_folder (str): location for storage
    """
    column_names = ["title","authors","url","text","summary","keywords","publish_date"]

    # Naming the columns in the constructor also works for an empty result;
    # assigning `.columns` afterwards raises a length-mismatch error when no
    # articles were extracted.
    shell_news = pd.DataFrame(extracted_contents, columns=column_names)

    # Save to the folder the caller asked for, not the notebook-level `save_path`.
    save_table(shell_news, save_folder, 'shell_news_data', header=False)

In [60]:
if __name__ == "__main__":

    # Read the webpage URLs listed in the configured index file.
    web_urls = extract_urls_from_file(index_file)

    # Crawl each webpage and collect the URLs of the articles to process.
    individual_articles_urls = get_individual_articles_urls(web_urls)
    
    # Download and parse each article into a row of extracted fields.
    data_processing = process_article(individual_articles_urls)

    # Convert the rows into a dataframe and save it as a CSV file.
    convert_to_dataframe(data_processing, save_path)

Path = ../data/Output/, file = shell_news_data
