# Getting Data
This notebook showcases how to download data available on the Internet. We cover most formats the data is typically available in, and learn/practice via example Python code or utilities for getting data. 

TOPIC1: Getting data from a Web URL: text, HTML, XML, PDF.

TOPIC2: Crawling/Scraping data from the Web (entire websites).

TOPIC3: Getting data via APIs (JSON format).

## TOPIC1: Getting data from a Web URL: text, HTML, PDF.

In [1]:
import sys

In [3]:
#To check which Python version and virtual environment this Jupyter Notebook uses
print(sys.executable)
print(sys.version_info)
#print(sys.path)

#If you find that Jupyter Notebook does not point to the required virtual environment
#remove the venv and re-create the virtual environment using
#conda create --name comp47350py37 python=3.7 jupyter
#Use 'pip install' to re-install required packages

/opt/anaconda3/envs/comp47350py38/bin/python
sys.version_info(major=3, minor=8, micro=2, releaselevel='final', serial=0)


In [4]:
#Import all required packages
#If you don't have these packages, install using: pip install <package-name>

#Import package 'requests'for URL scrapping
import requests
# Import package for reading csv files 
import pandas as pd
#import package 'beautifulsoup' to extract the content of HTML fields 
#pip install bs4
from bs4 import BeautifulSoup

#pip install newspaper3k
import newspaper

#import package 'feedparser'
#Feedparser is a library to parse RSS/XML feeds, these are files with a specific XML structure
import feedparser
#import package 'json' to parse json objects
import json
#Import the necessary methods from the "twitter" library
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

#Look at the package structure to understand how to use it
#print(dir(requests))

#Look at individual functions
#help(requests.get)

#As an alternative can use '?', same as help() but opens a new window
#?requests.get

In [5]:
#Get a text file.
#Get book "Alice's Adventures in Wonderland" from Project Gutenberg, in text format

#Give the URL for the file to be downloaded
url='https://www.gutenberg.org/files/11/11-0.txt'
#Look at the object returned by requests.get()
requests_object = requests.get(url)

#print(requests_object.content)

#Get the content from the downloaded text file
text_page = requests_object.text
#print(text_page)

#Look at the first 500 characters of the book
print(text_page[:500])

ï»¿The Project Gutenberg eBook of Aliceâs Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you


In [6]:
# Reading from a csv file, into a data frame
df = pd.read_csv('MotorInsuranceFraudClaimABTFull.csv')

# Check how many rows and columns this dataframe has
print("number of rows and columns:", df.shape)

# Show first 10 rows of data frame
# The rows are indexed starting from 0
df.head(10)

# Show last 10 rows of data frame
# The rows are indexed starting from 0
#df.tail(10)

number of rows and columns: (500, 14)


Unnamed: 0,ID,Insurance Type,Income of Policy Holder,Marital Status,Num Claimants,Injury Type,Overnight Hospital Stay,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag
0,1,CI,0,,2,Soft Tissue,No,1625,3250,2,2.0,1.0,0,1
1,2,CI,0,,2,Back,Yes,15028,60112,1,0.0,0.0,15028,0
2,3,CI,54613,Married,1,Broken Limb,No,-99999,0,0,0.0,0.0,572,0
3,4,CI,0,,3,Serious,Yes,270200,0,0,0.0,0.0,270200,0
4,5,CI,0,,4,Soft Tissue,No,8869,0,0,0.0,0.0,0,1
5,6,CI,0,,1,Broken Limb,Yes,17480,0,0,0.0,0.0,17480,0
6,7,CI,52567,Single,3,Broken Limb,No,3017,18102,2,1.0,0.5,0,1
7,8,CI,0,,2,Back,Yes,7463,0,0,0.0,0.0,7463,0
8,9,CI,0,,1,Soft Tissue,No,2067,0,0,,0.0,2067,0
9,10,CI,42300,Married,4,Back,No,2260,0,0,0.0,0.0,2260,0


In [7]:
#Get an HTML file.
#Get news article from IrishTimes website.

#Give the URL for the file to be downloaded
url = "https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"

#Get the content from the downloaded html file
html_page = requests.get(url).text
#Look at the format of the html file
print(html_page[:1000])

#write the content to a file
file = open("it-news-covid-pandemic-could-end-this-year.html", "w") 
file.write(html_page)
file.close()

<!DOCTYPE html><html lang="en"><head><script data-integration="inlineScripts">
    (function() {
      var _sf_async_config = window._sf_async_config = (window._sf_async_config || {});
      _sf_async_config.uid = 31036;
      _sf_async_config.domain = "irishtimes.com";
      _sf_async_config.useCanonical = true;
      _sf_async_config.useCanonicalDomain = true;
      _sf_async_config.sections = "world";
      
      _sf_async_config.flickerControl = false;
      var _cbq = window._cbq = (window._cbq || []);
      const OT = document.cookie.split('; ').find(row => row.startsWith('blaize_jwt'));
      if (OT)
      {
        _cbq.push(['_acct', 'paid']);
      } else {
        _cbq.push(['_acct', 'anon']);
      }
    })();
  ;
    var _comscore = _comscore || []; _comscore.push({ c1: "2", c2: "8946263" });
  ;(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'

In [8]:
# we can download a newsarticle and parse it using the newspaper3k library
# https://buildmedia.readthedocs.org/media/pdf/newspaper/latest/newspaper.pdf
# newspaper cannot parse all types of html files, for more complex file structure we still need 'beautifulsoup'
from newspaper import Article

url ="https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"
article = Article(url)
article.download()

#print(article.html)

article.parse()
print("Authors:", article.authors)
print("Date:", article.publish_date)
print("Title:", article.title)
print("Text:", article.text)
print("URL:", article.url)


Authors: ['Mon Jan -']
Date: None
Title: Covid pandemic could end this year if vaccine targets met, says WHO
Text: WHO director general Dr. Tedros Adhanom Ghebreyesus has said that it is possible for the acute phase of the Covid-19 pandemic to end this year, but only if strategies and tools such as testing, and vaccines are used in a comprehensive way.

The head of the World Health Organisation (WHO) has said the acute phase of the pandemic could end this year, if some key targets are met. Tedros Adhanom Ghebreyesus, the WHO’s director-general, also warned that conditions remain ideal for more coronavirus variants to emerge and that it is dangerous to assume Omicron is the last one or that “we are in the endgame”.

Dr Tedros said on Monday that “we can end Covid-19 as a global health emergency, and we can do it this year”, by reaching goals such as the WHO’s target to vaccinate 70 per cent of the population of each country by the middle of this year, with a focus on people who are at t

In [9]:
#Use package 'beautifulsoup' to extract the content of HTML fields 
#Need to know the HTML structure and the tags containing the information we need
#To look at the HTML file open it in a text editor, look for the tags that contain headline, subheadline, article body 
#If you don't have beautifulsoup4 installed, run in shell: conda install beautifulsoup4

# Method to parse the structure of an html page using package beautifulsoup.
# The code looks for specific tags in the html structure and extracts the content
def getArticleDetailsByUrl(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text,"html.parser")
    #soup.prettify()
    
    headline = soup.title.string
    subheadline = soup.head.find("meta",attrs={"name":"description"}).get('content')

    doc_body = ''
    if "The Irish Times" in soup.text:
        for body_p_tag in soup.article.find_all("p", attrs={"class": "no_name"}):
            doc_body += body_p_tag.get_text() + '\n'

    source = "Other"
    try:
        if "irishtimes" in url:
            source = "IrishTimes"
            body_p_tag = soup.article.find("div", attrs={"class": "last_updated"}).find("p")
    except:
        pass

    first_sentence = doc_body.split(".")[0]

    return [headline, subheadline, first_sentence, doc_body, source]

# Main code that calls our parsing method getArticleDetailsByUrl(url) for specific html pages.
if __name__ == '__main__':
    article_url = "https://www.irishtimes.com/news/world/covid-pandemic-could-end-this-year-if-vaccine-targets-met-says-who-1.4784483"
    #print(getArticleDetailsByUrl(article_url))
    
    print("\nField by field:\n")
    [headline, subheadline, first_sentence, doc_body, source] = getArticleDetailsByUrl(article_url)
    print("Headline:\n", headline, "\n")
    print("Subheadline:\n", subheadline, "\n")
    print("First sentence:\n", first_sentence, "\n")
    print("Article body:\n", doc_body)


Field by field:

Headline:
 Covid pandemic could end this year if vaccine targets met, says WHO – The Irish Times 

Subheadline:
 WHO chief warns against talk of ‘endgame’ as conditions ideal for new variant 

First sentence:
  

Article body:
 


In [10]:
#Downloading and working with an XML file
#Get the whole RSS feed for the Irish Times news articles
#This is an XML file listing the URLs of individual news articles published online
#Need to know the structure of the XML to be able to extract text from specific tags

#Parse the XML file to retrieve the URLs for individual news articles.
#Parse each article's HTML page

def scrapeRSSFeed(rss_feed):
    d = feedparser.parse(rss_feed)
    #print(d)
    #print(d['entries'], "\n")
        
    for item in d['entries']:
        #Extract an article URL
        article_url = item['link']
        [headline, subheadline, first_sentence, doc_body, source] = getArticleDetailsByUrl(article_url)
        print("\nArticle:", headline, "\n")

#Here you have your very own RSS feed reader in a few lines of code.
if __name__ == '__main__':

    #The URL of the XML file
    url='https://www.irishtimes.com/rss/irish-times-top-10-stories-1.4019566'
    xml_page = requests.get(url).text
    
    #Look at the structure of the XML file
    #To have a proper look, open the XML file with a text editor
    print(xml_page[:1000])

    # Call the method that parses a given XML file
    scrapeRSSFeed(url)




Article: Enoch Burke to be fined €700 daily if he does not stay away from Co Westmeath school – The Irish Times 


Article: Sinn Féin made two more errors in 2020 election expenses statement – The Irish Times 


Article: My daughter thinks someone in our home is trying to kill her. I called Camhs. No availability – The Irish Times 


Article: Enoch Burke has ridden roughshod over the rights of students he was employed to teach – The Irish Times 


Article: ‘Better late than never’: Germany supports Scholz battle tank decision – The Irish Times 


Article: The real reasons why women get paid less than men – The Irish Times 


Article: Sonia O’Sullivan: More people are starting to take notice of Dublin sprinter Rhasidat Adeleke – The Irish Times 


Article: Security at Dublin’s Mansion House under review after Lord Mayor’s concern during anti-refugee protests – The Irish Times 


Article: Céad míle slán: An Irish emigrant’s tale of one hundred thousand goodbyes – The Irish Times 


Ar

In [77]:
#Get a PDF file, save it to disk.

# Give url of the PDF file
url='http://www.greenteapress.com/thinkpython/thinkpython.pdf'
# Download the pdf file into request_object
request_object = requests.get(url)

#PDF is a binary format. Use request.content instead of request.text
#Write binary content on your machine's disk in a file named 'thinkpython.pdf'
with open("thinkpython.pdf", "wb") as pdffile:
    # Look at the conent of the file; it looks all gibberish since it is a binary format.
    # To make sense of the content, we need tools that can read pdf format and extract it to plain text.
    # See next cell for pdftotext tool.
    print(request_object.content[:500])
    
    #Print the content of the request_object to a file named "thinkpython.pdf"
    pdffile.write(request_object.content)

#Check that it downloaded the file to the current directory.
#%ls

b'%PDF-1.5\n%\xd0\xd4\xc5\xd8\n2 0 obj <<\n/Type /ObjStm\n/N 100\n/First 804\n/Length 1113      \n/Filter /FlateDecode\n>>\nstream\nx\xda\x9dV\xdbn\x9cH\x10}\x9f\xaf\xa8\xc7d\xb5\x8a\xe9\x0b\xdd\xb0\x8a\x12E\x9b8\xca\xc3*Vl%\xcf\x1d\xe8\x19\xa30\x80\x1a\xb0=\xfb\xf5{\x8a\x8b\xed\xec\xa5\x07\xed\x83M\x0f\xd49UuNu\x83\xa0\x84RR\x82\x0ciA9YA"\xa1\xdc\x90P$dFB\x930\x92\x84%\x91c\x99\x91\x14)\xfeHj\xbb\x93\x92\xa4\xc1\x1d\xe0\x93\x04KR\xe0\x919)\xa3p\x87T\x86\x8b\x02-\x9ek\xd2\xd2\x92\xb2\xa4S<\xcfH\xdb\x84\xf3\xa5\x89\xdciI)\xb2\xe8\x94R\x8d\x8b\xa14\xc3%\'#\xb0L\xc8(<Pd\x8c\xc5s26\xa7\xd4\xa2J`3\xb2\xa00\x82\xacU;\xd4h\xf3\x8cLJ\x19R\x1bC\x19\x12\x99\x9c\xb2\x1c\xcf\xd1\x10\xaa\xb3\x8a\xf2\tDy\x96\x00\x84F\x05P\xe81Q\x922n\xdc\xe8]\x86f\x13\x0b\x9a\x94\x84H2\xca \x85\x90\t\x88pE!y\x82\xab\x01\x07\xf4\x11\xb9%VE&9a)\xa4\xc2\x15|2e2\xc1\x02\xaa\x9dH\xc0\xc8\x02\x89\x04\x94J1?\xcbk\xf8\x0eHU.9\x13\x94\xe6_,\xbbF[\x82\x85\xd7)\x17\x01b\x9d\xe1\x1f\xc4\x17)\x07B~\x91j\xb9\x130@\xa4h\x19\t\xb1\

In [28]:
import pdftotext

# Make sure you have downloaded the "thinkpython.pdf" file in your current folder
# http://www.greenteapress.com/thinkpython/thinkpython.pdf

# Load your PDF
with open('thinkpython.pdf', 'rb') as f:
    pdf = pdftotext.PDF(f)
    
# What kind of object is this?
#print(type(pdf))

# What are the methods and variables of this object.
#print(dir(pdf))

# Get more detail about how to use this object
# print(help(pdf))

# How many pages?
print("pages:", len(pdf))

# Iterate over all the pages
#for page in pdf:
#    print("\n=====newpage:=====\n", page)

# Read some individual pages
print("Page 0:\n", pdf[0])
print("Page 1:\n", pdf[1])

# Read all the text into one string
string_pdf = "\n\n".join(pdf)

# Print the first 500 characters in the string
print("\n\nThe first 500 symbols in the string:\n", string_pdf[:500])

pages: 240
Page 0:
                    Think Python
How to Think Like a Computer Scientist



                             Version 2.0.17

Page 1:
 


The first 500 symbols in the string:
                    Think Python
How to Think Like a Computer Scientist



                             Version 2.0.17




                   Think Python
How to Think Like a Computer Scientist



                               Version 2.0.17




                        Allen Downey


                       Green Tea Press
                       Needham, Massachusetts


Copyright © 2012 Allen Downey.


Green Tea Press
9 Washburn Ave
Needham MA 02492
Permission is granted to copy, distribute, a


## Topic2: Crawling data from the Web.

As an alternative to using the Python package *requests*, you can use the command line *wget* utility to download an HTML page from a given URL or to download an entire website. If you don't have *wget* on your computer, first install it for your platform.

The *wget* tool is great for crawling entire or parts of websites. It recursively follows URLs up to given depth.
The example below downloads a part of the website locally, in a folder named *en.wikipedia.org*. The parameter -l tells wget to what depth it should follow URLs from the original URL. The parameter --no-parent tells wget to not download anything other than the given path. See http://linuxreviews.org/quicktips/wget/ for more details.

In [29]:
#Crawl the website to depth 1. To stop downloading interrupt the kernel from the menu above.
! wget https://en.wikipedia.org/wiki/Main_Page -r -l 1 --no-parent

--2023-01-26 15:20:42--  https://en.wikipedia.org/wiki/Main_Page
Resolving en.wikipedia.org (en.wikipedia.org)... 91.198.174.192
Connecting to en.wikipedia.org (en.wikipedia.org)|91.198.174.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 91310 (89K) [text/html]
Saving to: ‘en.wikipedia.org/wiki/Main_Page’


2023-01-26 15:20:42 (1.60 MB/s) - ‘en.wikipedia.org/wiki/Main_Page’ saved [91310/91310]

Loading robots.txt; please ignore errors.
--2023-01-26 15:20:42--  https://en.wikipedia.org/robots.txt
Reusing existing connection to en.wikipedia.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 27525 (27K) [text/plain]
Saving to: ‘en.wikipedia.org/robots.txt’


2023-01-26 15:20:42 (15.4 MB/s) - ‘en.wikipedia.org/robots.txt’ saved [27525/27525]

FINISHED --2023-01-26 15:20:42--
Total wall clock time: 0.2s
Downloaded: 2 files, 116K in 0.06s (2.02 MB/s)


In [30]:
#Need to stop crawling after a short while, otherwise it may fill your hard disk or you will get banned by the website
! wget https://www.irishtimes.com -l 1 --no-parent

--2023-01-26 15:20:47--  https://www.irishtimes.com/
Resolving www.irishtimes.com (www.irishtimes.com)... 23.72.36.195, 23.72.36.192
Connecting to www.irishtimes.com (www.irishtimes.com)|23.72.36.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘index.html’

index.html              [  <=>               ]   1.49M  3.77MB/s    in 0.4s    

2023-01-26 15:20:47 (3.77 MB/s) - ‘index.html’ saved [1563888]



For a pure Python crawler we can use the Python *wget* package or the *scrapy* package (scrapy only works with Phyton2.7 though). 

## Topic3: Getting data via APIs.
### JSON format: 
JavaScript Object Notation - a text format used widely for web-based resource sharing. Many packages and APIs return data in JSON.

Create a file named *example.json* using the Python code below to write a given string to a file.

In [31]:
json_string = """
{
    "glossary": {
        "title": "example glossary",
		"GlossDiv": {
            "title": "S",
			"GlossList": {
                "GlossEntry": {
                    "ID": "SGML",
					"SortAs": "SGML",
					"GlossTerm": "Standard Generalized Markup Language",
					"Acronym": "SGML",
					"Abbrev": "ISO 8879:1986",
					"GlossDef": {
                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
						"GlossSeeAlso": ["GML", "XML"]
                    },
					"GlossSee": "markup"
                }
            }
        }
    }
}"""
with open("example.json", "w") as file:
    file.write(json_string)    

In [32]:
# Run shell command "cat" to look at the file
# The sign ! tells Jupyter Notebook that the following command is a shell command.
!cat example.json


{
    "glossary": {
        "title": "example glossary",
		"GlossDiv": {
            "title": "S",
			"GlossList": {
                "GlossEntry": {
                    "ID": "SGML",
					"SortAs": "SGML",
					"GlossTerm": "Standard Generalized Markup Language",
					"Acronym": "SGML",
					"Abbrev": "ISO 8879:1986",
					"GlossDef": {
                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
						"GlossSeeAlso": ["GML", "XML"]
                    },
					"GlossSee": "markup"
                }
            }
        }
    }
}

In [33]:
json_data = json.load(open('example.json'))
#json_data looks like a nested Python dictionary
print(json_data)

{'glossary': {'title': 'example glossary', 'GlossDiv': {'title': 'S', 'GlossList': {'GlossEntry': {'ID': 'SGML', 'SortAs': 'SGML', 'GlossTerm': 'Standard Generalized Markup Language', 'Acronym': 'SGML', 'Abbrev': 'ISO 8879:1986', 'GlossDef': {'para': 'A meta-markup language, used to create markup languages such as DocBook.', 'GlossSeeAlso': ['GML', 'XML']}, 'GlossSee': 'markup'}}}}}


In [34]:
#We can refer to different fields of the json object
print(json_data['glossary']['title'])
print(json_data['glossary']['GlossDiv']['title'])
print(json_data['glossary']['GlossDiv']['GlossList']['GlossEntry']['ID'])

example glossary
S
SGML


In the example below we use an URL called an API endpoint and the *requests* package to get a json file, as we have seen above in getting data from an URL.


In [35]:
url='https://data.colorado.gov/resource/4ykn-tg5h.json'
json_dataset = requests.get(url).text
print(len(json_dataset))
#Look at the first 500 characters of the json list
print(json_dataset[:500])

with open("data_colorado_gov.json", "w") as file:
    file.write(json_dataset)


732370
[{"entityid":"19871006674","entityname":"BILLINGS DITCH COMPANY","principaladdress1":"0721 Lane 2 North","principalcity":"Alamosa","principalstate":"CO","principalzipcode":"81101","principalcountry":"US","mailingaddress1":"P O Box 534","mailingcity":"Monte Vista","mailingstate":"CO","mailingzipcode":"81144","mailingcountry":"US","entitystatus":"Good Standing","jurisdictonofformation":"CO","entitytype":"Nonprofit Corporation","agentfirstname":"Lyla","agentlastname":"Hathaway","agentprincipaladdre


## Twitter API

You must have a Twitter account and Twitter OAuth credentials available from https://apps.twitter.com/. 
For now you can use the credentials below, but Twitter may reject too many connections on the same credentials.
It is important to create and use your own authentification. The credentials below will be reset after this lab.
Create a new application (using your own Twitter credentials) and then generate access tokens. See this tutorial for more details:
http://socialmedia-class.org/twittertutorial.html

In [36]:
# Using Twitter Search API to get public tweets from the past
# Initiate the connection to Twitter API
# Twitter API returns data in JSON format

# Variables that contains the user credentials to access Twitter API 
# ACCESS_TOKEN = 'YOUR ACCESS TOKEN"'
# ACCESS_SECRET = 'YOUR ACCESS TOKEN SECRET'
# CONSUMER_KEY = 'YOUR API KEY'
# CONSUMER_SECRET = 'ENTER YOUR API SECRET'
ACCESS_TOKEN = '2839893905-pBXUzdrHCNXyjfPuBpSwxNbH1zyEpRaa2sXK0Jd'
ACCESS_SECRET = 'eNtB7YTAfsMhPIQtKji8aQT7zQFpFfDPR2lQ89WKfgI1U'
CONSUMER_KEY = 'ZqPrfLpc0znZlz3kW2a22VmUa'
CONSUMER_SECRET = 'BHD19T0DmUV2XVvEhUAgvpXMx0nGfxevAtr53NbCd9jQjPyTqn'

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter = Twitter(auth=oauth)
            
# Search for latest 100 tweets about "#biden"
file = open("twitter_search_100tweets_hashtag_biden.json", "w") 

iterator = twitter.search.tweets(q='#biden', result_type='recent', lang='en', count=100)
#print(json.dumps(iterator, indent=4))

for tweet in iterator['statuses']:
    # tweet is a json object
    print(tweet)
    #only print the text of the tweet out of the json object
    print("\n\"", tweet['text'], "\"\n")
    file.write(json.dumps(tweet)+"\n")

{'created_at': 'Thu Jan 26 15:21:09 +0000 2023', 'id': 1618630126840778752, 'id_str': '1618630126840778752', 'text': "Article summary: https://t.co/iBFSOzPyGw (I'm a bot)\n\n#OAN #Biden #President https://t.co/F83Kr3ZKYs", 'truncated': False, 'entities': {'hashtags': [{'text': 'OAN', 'indices': [54, 58]}, {'text': 'Biden', 'indices': [59, 65]}, {'text': 'President', 'indices': [66, 76]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/iBFSOzPyGw', 'expanded_url': 'https://otherweb.com/r/I8mcQEnC/s', 'display_url': 'otherweb.com/r/I8mcQEnC/s', 'indices': [17, 40]}, {'url': 'https://t.co/F83Kr3ZKYs', 'expanded_url': 'https://twitter.com/1209936918/status/1618629730571550720', 'display_url': 'twitter.com/1209936918/sta…', 'indices': [77, 100]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="https://help.twitter.com/en/using-twitter/how-to-tweet#source-labels" rel="nofollow">Valurank</a>', 'in_reply_to_status_id': None, 'in_reply

Assuming previous tweets were saved in a file *twitter_search_100tweets_hashtag_biden.json*, read the file and look at the tweets format. 

In [37]:
# We use the file saved from last step as an example of working with json
with open('twitter_search_100tweets_hashtag_biden.json', 'r') as f:
    tweets_file = f.readlines()
#print(tweets_file)

for line in tweets_file:
    #print(line)
    try:
        # Read in one line of the file, convert it into a json object 
        tweet = json.loads(line.strip())
        #print(tweet)
        if 'text' in tweet: # only messages contains 'text' field is a tweet
#             print(tweet['id']) # This is the tweet's id
#             print(tweet['created_at']) # when the tweet posted
#             print(tweet['text']) # content of the tweet
                        
#             print(tweet['user']['id']) # id of the user who posted the tweet
#             print(tweet['user']['name']) # name of the user, e.g. "Wei Xu"
#             print(tweet['user']['screen_name']) # name of the user account, e.g. "cocoweixu"

#             hashtags = []
#             for hashtag in tweet['entities']['hashtags']:
#             	hashtags.append(hashtag['text'])
#             print(hashtags)
            date = tweet['created_at']
            id = tweet['id']
            text = tweet['text']
            nfollowers = tweet['user']['followers_count']
            nfriends = tweet['user']['friends_count']
            hashtags = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
            users = [user_mention['screen_name'] for user_mention in tweet['entities']['user_mentions']]
            urls = [url['expanded_url'] for url in tweet['entities']['urls']]
    
            media_urls = []
            if 'media' in tweet['entities']:
                media_urls = [media['media_url'] for media in tweet['entities']['media']]	  
    
            print([date, id, text, hashtags, users, urls, media_urls, nfollowers, nfriends])
    except:
        # read in a line that is not in JSON format (sometimes error occured)
        print("JSON error!!!")
        continue

['Thu Jan 26 15:21:09 +0000 2023', 1618630126840778752, "Article summary: https://t.co/iBFSOzPyGw (I'm a bot)\n\n#OAN #Biden #President https://t.co/F83Kr3ZKYs", ['OAN', 'Biden', 'President'], [], ['https://otherweb.com/r/I8mcQEnC/s', 'https://twitter.com/1209936918/status/1618629730571550720'], [], 306, 367]
['Thu Jan 26 15:20:59 +0000 2023', 1618630082607869954, 'The Zionist settlers kill freely under the financing and protection of their dog, #Biden!\nThey dug out his classifi… https://t.co/VU7OisXtFa', ['Biden'], [], ['https://twitter.com/i/web/status/1618630082607869954'], [], 166, 796]
['Thu Jan 26 15:20:53 +0000 2023', 1618630059580964864, 'Both #Trump And #Biden #Acted Inappropriately: New #Poll #Shows #Americans #Agree On This #Issue https://t.co/AUMY8JZfqz #inappropriately', ['Trump', 'Biden', 'Acted', 'Poll', 'Shows', 'Americans', 'Agree', 'Issue', 'inappropriately'], [], ['https://www.learndaily.thats.im/both-trump-and-biden-acted-inappropriately-new-poll-shows-americans-ag

In [38]:
#Using Twitter Streaming API to stream tweets in real-time
#Gather all tweets containing a given keyword
#You can also gather all tweets of given user, check Twitter Streaming API details.

# Variables that contains the user credentials to access Twitter API 
# ACCESS_TOKEN = 'YOUR ACCESS TOKEN"'
# ACCESS_SECRET = 'YOUR ACCESS TOKEN SECRET'
# CONSUMER_KEY = 'YOUR API KEY'
# CONSUMER_SECRET = 'ENTER YOUR API SECRET'
ACCESS_TOKEN = '2839893905-pBXUzdrHCNXyjfPuBpSwxNbH1zyEpRaa2sXK0Jd'
ACCESS_SECRET = 'eNtB7YTAfsMhPIQtKji8aQT7zQFpFfDPR2lQ89WKfgI1U'
CONSUMER_KEY = 'ZqPrfLpc0znZlz3kW2a22VmUa'
CONSUMER_SECRET = 'BHD19T0DmUV2XVvEhUAgvpXMx0nGfxevAtr53NbCd9jQjPyTqn'

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

file = open("twitter_stream_10tweets_hashtag_biden.json", "w") 

# Initiate the connection to Twitter Streaming API
twitter_stream = TwitterStream(auth=oauth)

# Get a sample of the public data published on Twitter in real-time
#iterator = twitter_stream.statuses.sample()
# Get a sample of tweets in English, containing #biden"
iterator = twitter_stream.statuses.filter(track="#biden", language="en")

# Print each tweet in the stream to the screen 
# Here we set it to stop after getting 10 tweets. 
# You don't have to set it to stop, but can continue running 
# the Twitter API to collect data for days or even longer. 
# Please read the APIs T&C.
tweet_count = 10

for tweet in iterator:
    tweet_count -= 1
    # Twitter Python Tool wraps the data returned by Twitter 
    # as a TwitterDictResponse object.
    # We convert it back to the JSON format to print/score
    #print(json.dumps(tweet))  
    file.write(json.dumps(tweet)+"\n")

    # The command below will do pretty printing for JSON data, try it out
    print(json.dumps(tweet, indent=4))
       
    if tweet_count <= 0:
        break

{
    "created_at": "Thu Jan 26 15:22:07 +0000 2023",
    "id": 1618630369196281856,
    "id_str": "1618630369196281856",
    "text": "RT @ArmedVeteran45: The White House:\n\nWon\u2019t answer questions on Joe's classified documents scandal, but WILL attack Trump!\n\nWon\u2019t stop spen\u2026",
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 1515132402208755727,
        "id_str": "1515132402208755727",
        "name": "Tony Testerman\ud83c\uddfa\ud83c\uddf8",
        "screen_name": "TestermanTony",
        "location": "Kaufman, TX",
        "url": null,
        "description": "American made Patriot. God loving man. Happily married, and loving life. \ud83c\uddfa\ud83c\uddf8\ud83d\udcaf\ud83d\udcaa",


{
    "created_at": "Thu Jan 26 15:22:17 +0000 2023",
    "id": 1618630413123227654,
    "id_str": "1618630413123227654",
    "text": "RT @AndyOstroy: \"Had #Trump handled the #ClassifiedDocuments that were in his possession the way that #Biden did it never would have been t\u2026",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 3550514712,
        "id_str": "3550514712",
        "name": "Gripweed \ud83c\uddee\ud83c\uddf9\ud83c\uddec\ud83c\udde7",
        "screen_name": "ThatBeatleGirl9",
        "location": "Nowhere, man.",
        "url": null,
        "description": "Old-school curmudgeon. I\u2019m a Marxist, of the Groucho variety.",
        "translator_type": "none",
        "protected": false,
  

{
    "created_at": "Thu Jan 26 15:22:41 +0000 2023",
    "id": 1618630511555141634,
    "id_str": "1618630511555141634",
    "text": "RT @truthtsar: It\u2019s being reported that Putin is buying the weapons/equipment that Biden left in Afghanistan to arm the Taliban.\n\nA further\u2026",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 959993219713941504,
        "id_str": "959993219713941504",
        "name": "Dorthy Estabrook",
        "screen_name": "DorthyEstabrook",
        "location": "Rural TN",
        "url": null,
        "description": "\ud83c\uddfa\ud83c\uddf8Proud American Conservative\ud83c\uddfa\ud83c\uddf8#AmericaFirst #BorderSecurityNow Tired of all the lie\u2019s coming from D.C. Apath

{
    "created_at": "Thu Jan 26 15:23:17 +0000 2023",
    "id": 1618630664680767488,
    "id_str": "1618630664680767488",
    "text": "@Zibilid @catturd2 Unprovoked #War my @$$!\n\n#Obama-#Biden overthrew a duly elected Ukraine Gov't in 2014, Biden Fam\u2026 https://t.co/CeYzGrQ9Zd",
    "display_text_range": [
        19,
        140
    ],
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": true,
    "in_reply_to_status_id": 1618629724355559426,
    "in_reply_to_status_id_str": "1618629724355559426",
    "in_reply_to_user_id": 1604556493646446593,
    "in_reply_to_user_id_str": "1604556493646446593",
    "in_reply_to_screen_name": "Zibilid",
    "user": {
        "id": 1591242509153353729,
        "id_str": "1591242509153353729",
        "name": "ResistCorruptDC",
        "screen_name": "ResistCorruptDC",
        "location": null,
        "url": null,
        "description": "Our #Constitutional Republic protects t

{
    "created_at": "Thu Jan 26 15:23:39 +0000 2023",
    "id": 1618630756494086146,
    "id_str": "1618630756494086146",
    "text": "RT @RitaCosby: Tonight on @77WABCradio: Angry #NY merchants set up private patrols to combat #crime, some media make excuses for #rioters d\u2026",
    "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 1538168303788490752,
        "id_str": "1538168303788490752",
        "name": "TrinityTotalNutrition.com",
        "screen_name": "TrinityTNutri",
        "location": "California, USA",
        "url": "http://trinitytotalnutrition.com",
        "description": "At Trinity Total Nutrition we believe in nutrition for mind, body, and spirit. We provide the highest grade wellness, weight loss,

{
    "created_at": "Thu Jan 26 15:23:58 +0000 2023",
    "id": 1618630835233763330,
    "id_str": "1618630835233763330",
    "text": "@odomk1959 @POTUS Also, $35 insulin was Trumps plan, #Biden stopped it 1st week in office only to reintroduce, clai\u2026 https://t.co/90zkWjPb0s",
    "display_text_range": [
        18,
        140
    ],
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": true,
    "in_reply_to_status_id": 1618612315603226625,
    "in_reply_to_status_id_str": "1618612315603226625",
    "in_reply_to_user_id": 3325244059,
    "in_reply_to_user_id_str": "3325244059",
    "in_reply_to_screen_name": "odomk1959",
    "user": {
        "id": 3269290855,
        "id_str": "3269290855",
        "name": "Anna Waul",
        "screen_name": "AnnaWaul",
        "location": "San Diego California, USA",
        "url": null,
        "description": "\u1d18\u029f\u1d00\u0274\u1d1b \u1d0d\u1d0f\u0280\u1d07 \u1d1b\u0

{
    "created_at": "Thu Jan 26 15:24:20 +0000 2023",
    "id": 1618630928448000002,
    "id_str": "1618630928448000002",
    "text": "RT @MaimunkaNews: \u203c\ufe0f\ud83c\uddfa\ud83c\uddf8 #Trump to #Biden: \"Tanks will start first, and then #nuclear weapons\"\n\n\ud83d\udcdd \u201cNow stop this crazy war. It\u2019s simple,\u201d wri\u2026",
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 1365819579335860227,
        "id_str": "1365819579335860227",
        "name": "\ud83c\udf99Lee Kash\ud83e\udd14\ud83d\udfe7\u2b1b",
        "screen_name": "KASH_Presents",
        "location": null,
        "url": "https://youtube.com/@kashpresents648",
        "description": "KASH Presents: \ud83c\uddfa\ud83c\uddf2

{
    "created_at": "Thu Jan 26 15:24:32 +0000 2023",
    "id": 1618630979673030661,
    "id_str": "1618630979673030661",
    "text": "RT @RitaCosby: Tonight on @77WABCradio: Angry #NY merchants set up private patrols to combat #crime, some media make excuses for #rioters d\u2026",
    "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 19681270,
        "id_str": "19681270",
        "name": "Mike",
        "screen_name": "mwillman",
        "location": "Harrison, Ohio",
        "url": null,
        "description": "Loving this time of year and truly\nhating the current administration and hoping for a miracle to save us from their sick &twisted evil! \n!WE NEED TRUMP ASAP!",
        "translator_type": "none",
        "pro