# Getting Data
This notebook showcases how to download data available on the Internet. We cover most of the formats in which data is typically available, and practice getting data through example Python code and command-line utilities.

TOPIC1: Getting data from a Web URL: text, HTML, XML, PDF.

TOPIC2: Crawling/Scraping data from the Web (entire websites).

TOPIC3: Getting data via APIs (JSON format).

## TOPIC1: Getting data from a Web URL: text, HTML, XML, PDF.

In [1]:
import sys

In [2]:
# Report which interpreter (and hence which virtual environment) and which
# Python version this Jupyter Notebook is running on.
for runtime_detail in (sys.executable, sys.version_info):
    print(runtime_detail)
#print(sys.path)

# If Jupyter Notebook does not point to the required virtual environment,
# remove the venv and re-create it, for example with:
#conda create --name comp47350py37 python=3.7 jupyter
# and then use 'pip install' to re-install the required packages.

/Users/wenqingzhao/opt/anaconda3/envs/comp47350py38/bin/python
sys.version_info(major=3, minor=8, micro=16, releaselevel='final', serial=0)


In [3]:
#Import all required packages
#If you don't have these packages, install using: pip install <package-name>

#Import package 'requests' for URL scraping
import requests
# Import package for reading csv files 
import pandas as pd
#import package 'beautifulsoup' to extract the content of HTML fields 
#pip install bs4
from bs4 import BeautifulSoup

#pip install newspaper3k
import newspaper

#import package 'feedparser'
#Feedparser is a library to parse RSS/XML feeds, these are files with a specific XML structure
import feedparser
#import package 'json' to parse json objects
import json
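#Import the necessary methods from the "twitter" library (Python Twitter Tools: pip install twitter)
#NOTE: the unrelated 'python-twitter' package also installs a module named 'twitter' but does not
#provide these names; if it is installed instead, this import fails with an ImportError.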
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

#Look at the package structure to understand how to use it
#print(dir(requests))

#Look at individual functions
#help(requests.get)

#As an alternative can use '?', same as help() but opens a new window
#?requests.get

ImportError: cannot import name 'Twitter' from 'twitter' (/Users/wenqingzhao/opt/anaconda3/envs/comp47350py38/lib/python3.8/site-packages/twitter/__init__.py)

In [None]:
# Get a text file.
# Get the book "Alice's Adventures in Wonderland" from Project Gutenberg, in text format.

# The URL of the file to be downloaded
url = 'https://www.gutenberg.org/files/11/11-0.txt'
# requests.get() returns a Response object. The timeout stops the cell from
# hanging forever if the server never answers.
requests_object = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. 404) instead of treating the error page as the book
requests_object.raise_for_status()

#print(requests_object.content)

# Get the decoded text content of the downloaded file
text_page = requests_object.text
#print(text_page)

# Look at the first 500 characters of the book
print(text_page[:500])

In [None]:
# Load the csv file into a pandas data frame
df = pd.read_csv('MotorInsuranceFraudClaimABTFull.csv')

# Report the dimensions of the data frame as a (rows, columns) tuple
frame_shape = df.shape
print("number of rows and columns:", frame_shape)

# Display the first 10 rows of the data frame
# (row indexing starts at 0)
df.head(n=10)

# Display the last 10 rows of the data frame instead
#df.tail(n=10)

In [None]:
# Get an HTML file.
# Get a news article from the Irish Times website.

# The URL of the file to be downloaded
url = "https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"

# Download the page. The timeout avoids hanging forever on an unresponsive
# server, and raise_for_status() stops us from saving an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Get the content from the downloaded html file
html_page = response.text
# Look at the format of the html file
print(html_page[:1000])

# Write the content to a file. The with-block closes the file even if the
# write fails, and the explicit UTF-8 encoding keeps non-ASCII characters
# from raising UnicodeEncodeError on platforms with a narrower default encoding.
with open("it-news-covid-pandemic-could-end-this-year.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_page)

In [None]:
# The newspaper3k library can download a news article and parse it for us.
# https://buildmedia.readthedocs.org/media/pdf/newspaper/latest/newspaper.pdf
# newspaper cannot parse every kind of html file; for more complex file structures we still need 'beautifulsoup'
from newspaper import Article

url = "https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"
article = Article(url)
article.download()

#print(article.html)

article.parse()

# Print every extracted field next to its label
extracted_fields = (
    ("Authors:", article.authors),
    ("Date:", article.publish_date),
    ("Title:", article.title),
    ("Text:", article.text),
    ("URL:", article.url),
)
for label, value in extracted_fields:
    print(label, value)



In [None]:
#Use package 'beautifulsoup' to extract the content of HTML fields 
#Need to know the HTML structure and the tags containing the information we need
#To look at the HTML file open it in a text editor, look for the tags that contain headline, subheadline, article body 
#If you don't have beautifulsoup4 installed, run in shell: conda install beautifulsoup4

# Method to parse the structure of an html page using package beautifulsoup.
# The code looks for specific tags in the html structure and extracts the content
def getArticleDetailsByUrl(url):
    """Download an article page and extract its main fields with BeautifulSoup.

    The body extraction is tailored to the Irish Times page layout: it collects
    the <p class="no_name"> paragraphs inside the <article> tag, and only when
    the page text mentions "The Irish Times". For any other page the body is
    returned empty.

    Args:
        url: URL of the HTML page to download.

    Returns:
        A list [headline, subheadline, first_sentence, doc_body, source].
        headline is the text of the page <title> and subheadline the content
        of the <meta name="description"> tag; either is None when the page
        lacks that tag. doc_body is '' when no article paragraphs are found,
        first_sentence is doc_body up to its first '.', and source is
        "IrishTimes" when the URL contains "irishtimes", "Other" otherwise.
    """
    # The timeout keeps an unresponsive server from blocking the caller forever
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, "html.parser")
    #soup.prettify()

    # Not every page has a <title>, a <head> or a description <meta> tag.
    # BeautifulSoup returns None for a missing tag, so guard each lookup
    # instead of crashing with AttributeError.
    headline = soup.title.string if soup.title is not None else None

    description_tag = None
    if soup.head is not None:
        description_tag = soup.head.find("meta", attrs={"name": "description"})
    subheadline = description_tag.get('content') if description_tag is not None else None

    # Collect the article paragraphs, one per line
    doc_body = ''
    if soup.article is not None and "The Irish Times" in soup.text:
        for body_p_tag in soup.article.find_all("p", attrs={"class": "no_name"}):
            doc_body += body_p_tag.get_text() + '\n'

    # The source depends only on the URL
    source = "IrishTimes" if "irishtimes" in url else "Other"

    first_sentence = doc_body.split(".")[0]

    return [headline, subheadline, first_sentence, doc_body, source]

# Entry point: run our parsing method getArticleDetailsByUrl(url) on one specific html page
# and show every extracted field.
if __name__ == '__main__':
    article_url = "https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"
    #print(getArticleDetailsByUrl(article_url))

    print("\nField by field:\n")
    article_fields = getArticleDetailsByUrl(article_url)
    # The parser returns exactly five fields, in this order
    headline, subheadline, first_sentence, doc_body, source = article_fields
    print("Headline:\n", headline, "\n")
    print("Subheadline:\n", subheadline, "\n")
    print("First sentence:\n", first_sentence, "\n")
    print("Article body:\n", doc_body)

In [None]:
#Downloading and working with an XML file
#Get the whole RSS feed for the Irish Times news articles
#This is an XML file listing the URLs of individual news articles published online
#Need to know the structure of the XML to be able to extract text from specific tags

#Parse the XML file to retrieve the URLs for individual news articles.
#Parse each article's HTML page

def scrapeRSSFeed(rss_feed):
    """Print the headline of every article listed in an RSS feed.

    The feed is parsed with feedparser; the 'link' of each entry is passed to
    getArticleDetailsByUrl() and the headline it returns is printed.

    Args:
        rss_feed: the feed to read, handed unchanged to feedparser.parse().
    """
    parsed_feed = feedparser.parse(rss_feed)
    #print(parsed_feed)
    #print(parsed_feed['entries'], "\n")

    for entry in parsed_feed['entries']:
        # Each entry links to one article page; only its headline is shown here
        article_details = getArticleDetailsByUrl(entry['link'])
        headline, _subheadline, _first_sentence, _doc_body, _source = article_details
        print("\nArticle:", headline, "\n")

# With the function above you have your very own RSS feed reader in a few lines of code.
if __name__ == '__main__':

    # Address of the RSS feed (an XML file)
    url = 'https://www.irishtimes.com/rss/irish-times-top-10-stories-1.4019566'
    feed_response = requests.get(url)
    xml_page = feed_response.text

    # Peek at the structure of the XML file.
    # For a proper look, open the XML file with a text editor.
    print(xml_page[:1000])

    # Parse the feed and print the headline of each listed article
    scrapeRSSFeed(url)

In [None]:
# Get a PDF file, save it to disk.

# Give the url of the PDF file
url = 'http://www.greenteapress.com/thinkpython/thinkpython.pdf'
# Download the pdf file into request_object; the timeout avoids hanging forever
request_object = requests.get(url, timeout=60)
# Stop here on an HTTP error; otherwise an HTML error page would be saved
# under the name "thinkpython.pdf" and confuse the PDF tools used afterwards.
request_object.raise_for_status()

# PDF is a binary format. Use request_object.content (bytes) instead of request_object.text.
# Look at the content of the file; it looks all gibberish since it is a binary format.
# To make sense of the content, we need tools that can read the pdf format and extract plain text
# (for example the pdftotext package).
print(request_object.content[:500])

# Write the binary content to disk in a file named "thinkpython.pdf"
with open("thinkpython.pdf", "wb") as pdffile:
    pdffile.write(request_object.content)

#Check that it downloaded the file to the current directory.
#%ls

In [None]:
import pdftotext

# Make sure you have downloaded the "thinkpython.pdf" file in your current folder
# http://www.greenteapress.com/thinkpython/thinkpython.pdf

# Open the PDF in binary mode and let pdftotext load it
with open('thinkpython.pdf', 'rb') as pdf_handle:
    pdf = pdftotext.PDF(pdf_handle)

# What kind of object is this?
#print(type(pdf))

# What are the methods and variables of this object?
#print(dir(pdf))

# Get more detail about how to use this object
# print(help(pdf))

# Number of pages in the document
page_total = len(pdf)
print("pages:", page_total)

# Iterate over all the pages
#for page in pdf:
#    print("\n=====newpage:=====\n", page)

# Pages are accessed by index; show the first two
first_page = pdf[0]
print("Page 0:\n", first_page)
second_page = pdf[1]
print("Page 1:\n", second_page)

# Concatenate the text of all pages into one string, pages separated by a blank line
page_separator = "\n\n"
string_pdf = page_separator.join(pdf)

# Show the first 500 characters of that string
preview = string_pdf[:500]
print("\n\nThe first 500 symbols in the string:\n", preview)

## Topic2: Crawling data from the Web.

As an alternative to using the Python package *requests*, you can use the command line *wget* utility to download an HTML page from a given URL or to download an entire website. If you don't have *wget* on your computer, first install it for your platform.

The *wget* tool is great for crawling entire websites or parts of them. It recursively follows URLs up to a given depth.
The example below downloads a part of the website locally, into a folder named *en.wikipedia.org*. The parameter -r turns on recursive download, and the parameter -l tells wget to what depth it should follow URLs from the original URL. The parameter --no-parent tells wget not to ascend to the parent directory, so nothing outside the given path is downloaded. See http://linuxreviews.org/quicktips/wget/ for more details.

In [None]:
#Crawl the website to depth 1. To stop downloading interrupt the kernel from the menu above.
! wget https://en.wikipedia.org/wiki/Main_Page -r -l 1 --no-parent

In [None]:
#Need to stop crawling after a short while, otherwise it may fill your hard disk or you will get banned by the website
! wget https://www.irishtimes.com -l 1 --no-parent

For a pure Python crawler we can use the Python *wget* package or the *scrapy* package (early versions of scrapy only worked with Python 2.7; current versions support Python 3).

## Topic3: Getting data via APIs.
### JSON format: 
JavaScript Object Notation - a text format used widely for web-based resource sharing. Many packages and APIs return data in JSON.

Create a file named *example.json* using the Python code below to write a given string to a file.

In [None]:
# A small JSON document, kept as a multi-line string
json_string = """
{
    "glossary": {
        "title": "example glossary",
		"GlossDiv": {
            "title": "S",
			"GlossList": {
                "GlossEntry": {
                    "ID": "SGML",
					"SortAs": "SGML",
					"GlossTerm": "Standard Generalized Markup Language",
					"Acronym": "SGML",
					"Abbrev": "ISO 8879:1986",
					"GlossDef": {
                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
						"GlossSeeAlso": ["GML", "XML"]
                    },
					"GlossSee": "markup"
                }
            }
        }
    }
}"""

# Save the string, unchanged, as example.json in the current directory
with open("example.json", "w") as json_out:
    json_out.write(json_string)

In [None]:
# Run shell command "cat" to look at the file
# The sign ! tells Jupyter Notebook that the following command is a shell/terminal command.
!cat example.json

In [None]:
# Parse the file into Python objects; the with-block closes the file handle
# as soon as parsing is done (a bare open() would leave it open).
with open('example.json') as json_file:
    json_data = json.load(json_file)
# json_data looks like a nested Python dictionary
print(json_data)

In [None]:
# Drill down into the nested json object one level at a time
glossary = json_data['glossary']
print(glossary['title'])
gloss_div = glossary['GlossDiv']
print(gloss_div['title'])
print(gloss_div['GlossList']['GlossEntry']['ID'])

In the example below we use a URL called an API endpoint and the *requests* package to get a json file, just as we did above when getting data from a URL.


In [None]:
# The API endpoint that returns the dataset in JSON format
url = 'https://data.colorado.gov/resource/4ykn-tg5h.json'
# The timeout avoids hanging forever; raise_for_status() makes an HTTP error
# stop the cell instead of saving the error response as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()

json_dataset = response.text
print(len(json_dataset))
# Look at the first 500 characters of the json list
print(json_dataset[:500])

# Save the raw JSON text; the encoding is given explicitly so that non-ASCII
# characters are written the same way on every platform.
with open("data_colorado_gov.json", "w", encoding="utf-8") as file:
    file.write(json_dataset)


## Twitter API

You must have a Twitter account and Twitter OAuth credentials available from https://apps.twitter.com/. 
For now you can use the credentials below, but Twitter may reject too many connections on the same credentials.
It is important to create and use your own authentication credentials. The credentials below will be reset after this lab.
Create a new application (using your own Twitter credentials) and then generate access tokens. See this tutorial for more details:
http://socialmedia-class.org/twittertutorial.html

In [None]:
# Using the Twitter Search API to get public tweets from the past
# Initiate the connection to the Twitter API
# The Twitter API returns data in JSON format
import os

# Variables that contain the user credentials to access the Twitter API
# ACCESS_TOKEN = 'YOUR ACCESS TOKEN"'
# ACCESS_SECRET = 'YOUR ACCESS TOKEN SECRET'
# CONSUMER_KEY = 'YOUR API KEY'
# CONSUMER_SECRET = 'ENTER YOUR API SECRET'
# NOTE: credentials should not live in source code. Set the TWITTER_*
# environment variables to use your own; the shared lab values below are only
# a fallback so that the cell keeps working as before.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '2839893905-pBXUzdrHCNXyjfPuBpSwxNbH1zyEpRaa2sXK0Jd')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', 'eNtB7YTAfsMhPIQtKji8aQT7zQFpFfDPR2lQ89WKfgI1U')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'ZqPrfLpc0znZlz3kW2a22VmUa')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'BHD19T0DmUV2XVvEhUAgvpXMx0nGfxevAtr53NbCd9jQjPyTqn')

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter = Twitter(auth=oauth)

# Search for the latest 100 tweets about "#biden"
iterator = twitter.search.tweets(q='#biden', result_type='recent', lang='en', count=100)
#print(json.dumps(iterator, indent=4))

# Save one tweet per line. The with-block closes (and therefore flushes) the
# file, so all tweets are on disk before another cell reads the file back.
with open("twitter_search_100tweets_hashtag_biden.json", "w") as file:
    for tweet in iterator['statuses']:
        # tweet is a json object
        print(tweet)
        # only print the text of the tweet out of the json object
        print("\n\"", tweet['text'], "\"\n")
        file.write(json.dumps(tweet) + "\n")

Assuming the previous tweets were saved in a file *twitter_search_100tweets_hashtag_biden.json*, read the file and look at the format of the tweets.

In [None]:
# We use the file saved in the previous step as an example of working with json.
# The file holds one JSON-encoded tweet per line.
with open('twitter_search_100tweets_hashtag_biden.json', 'r') as f:
    tweets_file = f.readlines()
#print(tweets_file)

for line in tweets_file:
    #print(line)
    try:
        # Read in one line of the file, convert it into a json object
        tweet = json.loads(line.strip())
    except json.JSONDecodeError:
        # The line is not in JSON format (e.g. a blank or truncated line)
        print("JSON error!!!")
        continue
    #print(tweet)

    # Only messages that contain a 'text' field are tweets
    if not isinstance(tweet, dict) or 'text' not in tweet:
        continue

    # Fields available in a tweet include, for example:
    #   tweet['id']                   - the tweet's id
    #   tweet['created_at']           - when the tweet was posted
    #   tweet['text']                 - content of the tweet
    #   tweet['user']['id']           - id of the user who posted the tweet
    #   tweet['user']['name']         - name of the user, e.g. "Wei Xu"
    #   tweet['user']['screen_name']  - name of the user account, e.g. "cocoweixu"
    try:
        entities = tweet['entities']
        date = tweet['created_at']
        # named tweet_id so that the built-in id() is not shadowed
        tweet_id = tweet['id']
        text = tweet['text']
        nfollowers = tweet['user']['followers_count']
        nfriends = tweet['user']['friends_count']
        hashtags = [hashtag['text'] for hashtag in entities['hashtags']]
        users = [user_mention['screen_name'] for user_mention in entities['user_mentions']]
        urls = [url['expanded_url'] for url in entities['urls']]

        # 'media' is only present when the tweet has attachments
        media_urls = []
        if 'media' in entities:
            media_urls = [media['media_url'] for media in entities['media']]
    except (KeyError, TypeError) as error:
        # Valid JSON, but not shaped like a tweet: report what is wrong
        # instead of mislabelling it as a JSON error, then move on.
        print("Malformed tweet, skipping:", repr(error))
        continue

    print([date, tweet_id, text, hashtags, users, urls, media_urls, nfollowers, nfriends])

In [None]:
# Using the Twitter Streaming API to stream tweets in real-time
# Gather all tweets containing a given keyword
# You can also gather all tweets of a given user, check the Twitter Streaming API details.
import os

# Variables that contain the user credentials to access the Twitter API
# ACCESS_TOKEN = 'YOUR ACCESS TOKEN"'
# ACCESS_SECRET = 'YOUR ACCESS TOKEN SECRET'
# CONSUMER_KEY = 'YOUR API KEY'
# CONSUMER_SECRET = 'ENTER YOUR API SECRET'
# NOTE: credentials should not live in source code. Set the TWITTER_*
# environment variables to use your own; the shared lab values below are only
# a fallback so that the cell keeps working as before.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '2839893905-pBXUzdrHCNXyjfPuBpSwxNbH1zyEpRaa2sXK0Jd')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', 'eNtB7YTAfsMhPIQtKji8aQT7zQFpFfDPR2lQ89WKfgI1U')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'ZqPrfLpc0znZlz3kW2a22VmUa')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'BHD19T0DmUV2XVvEhUAgvpXMx0nGfxevAtr53NbCd9jQjPyTqn')

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Initiate the connection to the Twitter Streaming API
twitter_stream = TwitterStream(auth=oauth)

# Get a sample of the public data published on Twitter in real-time
#iterator = twitter_stream.statuses.sample()
# Get a sample of tweets in English, containing "#biden"
iterator = twitter_stream.statuses.filter(track="#biden", language="en")

# Print each tweet in the stream to the screen.
# Here we set it to stop after getting 10 tweets.
# You don't have to set it to stop, but can continue running
# the Twitter API to collect data for days or even longer.
# Please read the API's T&C.
tweet_count = 10

# The with-block closes (and therefore flushes) the output file even when the
# stream raises or the kernel is interrupted, so collected tweets are not lost.
with open("twitter_stream_10tweets_hashtag_biden.json", "w") as file:
    for tweet in iterator:
        tweet_count -= 1
        # Twitter Python Tool wraps the data returned by Twitter
        # as a TwitterDictResponse object.
        # We convert it back to the JSON format to print/store
        #print(json.dumps(tweet))
        file.write(json.dumps(tweet) + "\n")

        # The command below does pretty printing for JSON data, try it out
        print(json.dumps(tweet, indent=4))

        if tweet_count <= 0:
            break