# <u> Data Extraction

In [1]:
# Libraries you may need to install
# pip install alpha_vantage
# pip install textblob
# pip install pprint
# pip install pandas
# pip install regex
# pip install nltk
# pip install numpy
# pip install matplotlib
# pip install string 

In [1]:
#Importing modules needed
import requests
from pprint import pprint as pp
import pandas as pd
import re
import datetime
from alpha_vantage.timeseries import TimeSeries
import csv
import string

# Introduction
#### For our analysis, we will be using the following data:
<ul> 
    <li>NYT Data: Headlines, abstracts and published dates of New York Times articles, using the New York Times API. We will separate this into two CSVs for later analysis - one with articles from 2021, and one with articles from November 2021 only
    <li>Finance Data: Financial market data for 3 large pharmaceutical companies (Astrazeneca, Pfizer and Moderna), using the Alpha Vantage API
    <li>Guardian Data: Headlines and published dates of Guardian articles about covid, using the Guardian API. We will also extract word frequencies of commonly used words in Guardian article headlines, for each quarter of the year 2021.

# NYT Data

## 2021 Articles
#### Getting data from NewYorkTimes API for the year 2021. API for NewYorkTimes only returns result for a month. Hence, we use for loop so that it runs through each month and appends the data returned
#### Writing the obtained data to csv file named 'NYT_UnfilteredData.csv'

NB You may need to restart the kernel and run this cell again if you run into errors - this is due to an issue with the API.


In [2]:
year='2021'
# month=1

column_list=['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section',
       'print_page', 'source', 'multimedia', 'headline', 'keywords',
       'pub_date', 'document_type', 'news_desk', 'section_name', 'byline',
       'type_of_material', '_id', 'word_count', 'uri', 'subsection_name']

appid='tmGtDzyAKd4oBddx3xJMNnycRGHhj0hM'#use personal api id

df1=pd.DataFrame(columns=column_list)

for i in range(1,12):
    endpoint1=('https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'.format(year, i,appid))
    response=requests.get(endpoint1)
    data=response.json()
    dataextract=data['response']['docs']
    
    df1=df1.append(dataextract, ignore_index=True)
df1.head()


Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
0,Reasons to be hopeful about the Biden economy.,https://www.nytimes.com/2020/12/31/opinion/202...,Reasons to be hopeful about the Biden economy.,The next few months will be hell in terms of p...,A,18.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Things Will Get Better. Seriously.',...","[{'name': 'persons', 'value': 'Biden, Joseph R...",2021-01-01T00:00:09+0000,article,OpEd,Opinion,"{'original': 'By Paul Krugman', 'person': [{'f...",Op-Ed,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,882,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,
1,The video shows a man raising something to his...,https://www.nytimes.com/2020/12/31/us/george-f...,The video shows a man raising something to his...,The Minneapolis Police Department released bod...,A,17.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Minneapolis Police Release Body Came...,"[{'name': 'persons', 'value': 'Idd, Dolal B', ...",2021-01-01T00:16:53+0000,article,National,U.S.,"{'original': 'By Nicholas Bogel-Burroughs', 'p...",News,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,861,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,
2,"Every December since 2017, Ada Rojas has guide...",https://www.nytimes.com/2020/12/31/us/resolvin...,,"Every December since 2017, Ada Rojas has guide...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Resolving to live a lot better than ...,[],2021-01-01T00:58:19+0000,article,Express,U.S.,"{'original': 'By Concepción de León', 'person'...",News,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,263,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,
3,"The suit, led by Representative Louie Gohmert ...",https://www.nytimes.com/2020/12/31/us/politics...,"The suit, led by Representative Louie Gohmert ...",[Here’s what you need to know about President-...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Justice Dept. Asks Judge to Toss Ele...,"[{'name': 'organizations', 'value': 'Justice D...",2021-01-01T01:24:55+0000,article,Washington,U.S.,{'original': 'By Maggie Haberman and Katie Ben...,News,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,695,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,Politics
4,The United States recorded its 20 millionth ca...,https://www.nytimes.com/2020/12/31/world/the-u...,,The United States recorded its 20 millionth ca...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'The U.S. reaches 20 million cases.',...","[{'name': 'subject', 'value': 'internal-essent...",2021-01-01T01:28:22+0000,article,Foreign,World,"{'original': 'By Kate Taylor', 'person': [{'fi...",News,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,438,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,


In [None]:
#df1.to_csv('NYT_UnfilteredData.csv')

#### Converting content of abstract column to all lowercase

In [3]:
df1['abstract']=df1['abstract'].str.lower()
df1.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
0,reasons to be hopeful about the biden economy.,https://www.nytimes.com/2020/12/31/opinion/202...,Reasons to be hopeful about the Biden economy.,The next few months will be hell in terms of p...,A,18.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Things Will Get Better. Seriously.',...","[{'name': 'persons', 'value': 'Biden, Joseph R...",2021-01-01T00:00:09+0000,article,OpEd,Opinion,"{'original': 'By Paul Krugman', 'person': [{'f...",Op-Ed,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,882,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,
1,the video shows a man raising something to his...,https://www.nytimes.com/2020/12/31/us/george-f...,The video shows a man raising something to his...,The Minneapolis Police Department released bod...,A,17.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Minneapolis Police Release Body Came...,"[{'name': 'persons', 'value': 'Idd, Dolal B', ...",2021-01-01T00:16:53+0000,article,National,U.S.,"{'original': 'By Nicholas Bogel-Burroughs', 'p...",News,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,861,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,
2,"every december since 2017, ada rojas has guide...",https://www.nytimes.com/2020/12/31/us/resolvin...,,"Every December since 2017, Ada Rojas has guide...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Resolving to live a lot better than ...,[],2021-01-01T00:58:19+0000,article,Express,U.S.,"{'original': 'By Concepción de León', 'person'...",News,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,263,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,
3,"the suit, led by representative louie gohmert ...",https://www.nytimes.com/2020/12/31/us/politics...,"The suit, led by Representative Louie Gohmert ...",[Here’s what you need to know about President-...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Justice Dept. Asks Judge to Toss Ele...,"[{'name': 'organizations', 'value': 'Justice D...",2021-01-01T01:24:55+0000,article,Washington,U.S.,{'original': 'By Maggie Haberman and Katie Ben...,News,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,695,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,Politics
4,the united states recorded its 20 millionth ca...,https://www.nytimes.com/2020/12/31/world/the-u...,,The United States recorded its 20 millionth ca...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'The U.S. reaches 20 million cases.',...","[{'name': 'subject', 'value': 'internal-essent...",2021-01-01T01:28:22+0000,article,Foreign,World,"{'original': 'By Kate Taylor', 'person': [{'fi...",News,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,438,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,


#### Filtering the rows of data where the abstract contains words within the list 'words'
#### Writing the obtained dataframe to csv file named 'NYTGeneraldata2.csv'

In [4]:
words = ['covid','pandemic','coronavirus','vaccine','health','lockdown','restriction','quarantine','delta','omicron','variant']

df2=df1[df1['abstract'].str.contains('|'.join(words))]

#### Filtering the dataframe so that it only has two columns; 'abstract', 'pub_date' along with index

In [5]:
df3=df2.filter(['abstract','pub_date'])


In [6]:
df3.reset_index(drop=True, inplace=True)

#### Converting the 'pub_date' to datetime data type,

In [7]:
df3['pub_date']= pd.to_datetime(df3['pub_date'], format = "%Y-%m-%d %H:%M:%S", utc = True)
df3

Unnamed: 0,abstract,pub_date
0,the united states recorded its 20 millionth ca...,2021-01-01 01:28:22+00:00
1,"throughout the coronavirus pandemic, well-atte...",2021-01-01 03:08:25+00:00
2,a highly contagious coronavirus variant first ...,2021-01-01 03:44:59+00:00
3,the pandemic caused the cancellation of the ro...,2021-01-01 04:24:17+00:00
4,with a new variant of the virus emerging elsew...,2021-01-01 08:00:06+00:00
...,...,...
7951,"slowed but not stopped by the pandemic, wilson...",2021-11-30 16:14:32+00:00
7952,intense research into the new coronavirus vari...,2021-11-30 17:08:17+00:00
7953,britain’s approach to coronavirus-related rest...,2021-11-30 18:14:44+00:00
7954,"unlike alpha, beta and delta, the name of the ...",2021-11-30 18:51:06+00:00


#### Writing the obtained dataframe to csv file named 'NYTGeneraldata2.csv'

In [8]:
df3.to_csv('data/NYTGeneraldata2.csv')

In [9]:
### code for november extraction with headline and abstract

In [10]:
df4 = df1
a=df4.headline
b=[]
for i in a.index:
    b.append(a[i]['main'])

df4['headline']=b
df4['headline']=df4['headline'].str.lower()
df4

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
0,reasons to be hopeful about the biden economy.,https://www.nytimes.com/2020/12/31/opinion/202...,Reasons to be hopeful about the Biden economy.,The next few months will be hell in terms of p...,A,18,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",things will get better. seriously.,"[{'name': 'persons', 'value': 'Biden, Joseph R...",2021-01-01T00:00:09+0000,article,OpEd,Opinion,"{'original': 'By Paul Krugman', 'person': [{'f...",Op-Ed,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,882,nyt://article/ab590671-f1be-5b6d-8212-a7d18aab...,
1,the video shows a man raising something to his...,https://www.nytimes.com/2020/12/31/us/george-f...,The video shows a man raising something to his...,The Minneapolis Police Department released bod...,A,17,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",minneapolis police release body camera video o...,"[{'name': 'persons', 'value': 'Idd, Dolal B', ...",2021-01-01T00:16:53+0000,article,National,U.S.,"{'original': 'By Nicholas Bogel-Burroughs', 'p...",News,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,861,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,
2,"every december since 2017, ada rojas has guide...",https://www.nytimes.com/2020/12/31/us/resolvin...,,"Every December since 2017, Ada Rojas has guide...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",resolving to live a lot better than in 2020.,[],2021-01-01T00:58:19+0000,article,Express,U.S.,"{'original': 'By Concepción de León', 'person'...",News,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,263,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,
3,"the suit, led by representative louie gohmert ...",https://www.nytimes.com/2020/12/31/us/politics...,"The suit, led by Representative Louie Gohmert ...",[Here’s what you need to know about President-...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",justice dept. asks judge to toss election laws...,"[{'name': 'organizations', 'value': 'Justice D...",2021-01-01T01:24:55+0000,article,Washington,U.S.,{'original': 'By Maggie Haberman and Katie Ben...,News,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,695,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,Politics
4,the united states recorded its 20 millionth ca...,https://www.nytimes.com/2020/12/31/world/the-u...,,The United States recorded its 20 millionth ca...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",the u.s. reaches 20 million cases.,"[{'name': 'subject', 'value': 'internal-essent...",2021-01-01T01:28:22+0000,article,Foreign,World,"{'original': 'By Kate Taylor', 'person': [{'fi...",News,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,438,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50591,claire keegan’s “small things like these” unco...,https://www.nytimes.com/2021/11/30/books/revie...,Claire Keegan’s “Small Things Like These” unco...,SMALL THINGS LIKE THESE By Claire Keegan,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","the horrors of irish magdalene laundries, revi...","[{'name': 'subject', 'value': 'Books and Liter...",2021-11-30T19:13:07+0000,article,BookReview,Books,"{'original': 'By Lydia Millet', 'person': [{'f...",Review,nyt://article/02e3b75d-49de-51c4-859c-4e4a85b6...,678,nyt://article/02e3b75d-49de-51c4-859c-4e4a85b6...,Book Review
50592,the centers for disease control and prevention...,https://www.nytimes.com/2021/11/30/us/politics...,The Centers for Disease Control and Prevention...,Top federal health officials said on Tuesday t...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",federal health officials say that they are exp...,"[{'name': 'subject', 'value': 'Coronavirus Omi...",2021-11-30T19:34:18+0000,article,Washington,U.S.,"{'original': 'By Noah Weiland', 'person': [{'f...",News,nyt://article/324cab09-d56f-5fdf-9fb2-09ddcb1e...,619,nyt://article/324cab09-d56f-5fdf-9fb2-09ddcb1e...,Politics
50593,emma coronel aispuro pleaded guilty in june to...,https://www.nytimes.com/2021/11/30/us/politics...,Emma Coronel Aispuro pleaded guilty in June to...,"Emma Coronel Aispuro, the wife of the notoriou...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",el chapo’s wife sentenced to 3 years in prison,"[{'name': 'subject', 'value': 'Drug Abuse and ...",2021-11-30T19:34:33+0000,article,Washington,U.S.,"{'original': 'By Alan Feuer', 'person': [{'fir...",News,nyt://article/69ef883b-535a-5e9e-8188-52aacb82...,743,nyt://article/69ef883b-535a-5e9e-8188-52aacb82...,Politics
50594,“i think it’s important that the i.r.s. have v...,https://www.nytimes.com/2021/11/30/business/ir...,“I think it’s important that the I.R.S. have v...,WASHINGTON — Treasury Secretary Janet L. Yelle...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",janet yellen still supports a plan to make ban...,"[{'name': 'persons', 'value': 'Yellen, Janet L...",2021-11-30T19:38:16+0000,article,Business,Business Day,"{'original': 'By Alan Rappeport', 'person': [{...",News,nyt://article/18242ec4-293a-5eb7-8f8c-b272cfb2...,443,nyt://article/18242ec4-293a-5eb7-8f8c-b272cfb2...,


In [11]:
words = ['covid','pandemic','coronavirus','vaccine','health','lockdown','restriction','quarantine','delta','omicron','variant']

df5=df4[df4['headline'].str.contains('|'.join(words))]
df5

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
11,a highly contagious coronavirus variant first ...,https://www.nytimes.com/2020/12/31/world/flori...,,A highly contagious coronavirus variant first ...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",florida health officials detect the state’s fi...,"[{'name': 'subject', 'value': 'internal-essent...",2021-01-01T03:44:59+0000,article,Foreign,World,"{'original': 'By Allyson Waller', 'person': [{...",News,nyt://article/c29df2db-63c0-5c92-b646-b4f6a09c...,265,nyt://article/c29df2db-63c0-5c92-b646-b4f6a09c...,
16,with a new variant of the virus emerging elsew...,https://www.nytimes.com/2021/01/01/nyregion/ny...,With a new variant of the virus emerging elsew...,As the final hours ticked away in a harrowing ...,A,5,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",virus numbers are surging. why is new york’s v...,"[{'name': 'subject', 'value': 'Coronavirus (20...",2021-01-01T08:00:06+0000,article,Metro,New York,"{'original': 'By Joseph Goldstein', 'person': ...",News,nyt://article/f9171427-eed3-5031-88fa-f3ba050a...,1541,nyt://article/f9171427-eed3-5031-88fa-f3ba050a...,
18,test your knowledge of this week’s health news.,https://www.nytimes.com/interactive/2021/01/01...,Test your knowledge of this week’s health news.,Test your knowledge of this week’s health news.,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","weekly health quiz: a new coronavirus variant,...","[{'name': 'subject', 'value': 'Exercise', 'ran...",2021-01-01T10:00:02+0000,multimedia,Well,Well,"{'original': 'By Toby Bilanow', 'person': [{'f...",Interactive Feature,nyt://interactive/22b2316e-59ad-5af2-a53f-b7d3...,0,nyt://interactive/22b2316e-59ad-5af2-a53f-b7d3...,Live
24,our 7-day well challenge will show you how to ...,https://www.nytimes.com/2021/01/01/well/live/n...,Our 7-Day Well Challenge will show you how to ...,Here’s a better way to start the new year: Ski...,D,6,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","for a healthier 2021, keep the best habits of ...","[{'name': 'subject', 'value': 'Content Type: S...",2021-01-01T10:00:13+0000,article,Well,Well,"{'original': 'By Tara Parker-Pope', 'person': ...",News,nyt://article/228e9d6c-c91d-5cfa-9ef6-ff1f8596...,1322,nyt://article/228e9d6c-c91d-5cfa-9ef6-ff1f8596...,Live
47,the coronavirus has disrupted supply chains in...,https://www.nytimes.com/2021/01/01/realestate/...,The coronavirus has disrupted supply chains in...,Shopping for a dishwasher isn’t what it used t...,RE,4,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",furnishing your home in a pandemic will requir...,"[{'name': 'subject', 'value': 'Real Estate and...",2021-01-01T13:00:09+0000,article,RealEstate,Real Estate,"{'original': 'By Ronda Kaysen', 'person': [{'f...",News,nyt://article/3f7df85e-2f30-50f8-b290-2221dba7...,1167,nyt://article/3f7df85e-2f30-50f8-b290-2221dba7...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50566,the company said that analyses and modeling of...,https://www.nytimes.com/2021/11/30/health/rege...,The company said that analyses and modeling of...,Regeneron said on Tuesday that its Covid-19 an...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",regeneron says its antibody treatment may not ...,"[{'name': 'subject', 'value': 'Antibodies', 'r...",2021-11-30T16:03:10+0000,article,Science,Health,"{'original': 'By Benjamin Mueller', 'person': ...",News,nyt://article/b5f7cdef-66e4-5d3d-94c7-dd42adb5...,319,nyt://article/b5f7cdef-66e4-5d3d-94c7-dd42adb5...,
50575,intense research into the new coronavirus vari...,https://www.nytimes.com/article/omicron-corona...,Intense research into the new coronavirus vari...,"First identified in Botswana and South Africa,...",A,12,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",omicron: what is known — and still unknown,"[{'name': 'subject', 'value': 'Coronavirus Omi...",2021-11-30T17:08:17+0000,article,Science,Health,"{'original': 'By Andrew Jacobs', 'person': [{'...",News,nyt://article/695d6348-43e1-52a1-a9d1-abff2b40...,1331,nyt://article/695d6348-43e1-52a1-a9d1-abff2b40...,
50581,britain’s approach to coronavirus-related rest...,https://www.nytimes.com/2021/11/30/world/europ...,Britain’s approach to coronavirus-related rest...,"LONDON — At almost every step of the pandemic,...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","amid variant fears, u.k. discovers limits to i...","[{'name': 'subject', 'value': 'Politics and Go...",2021-11-30T18:14:44+0000,article,Foreign,World,{'original': 'By Mark Landler and Megan Specia...,News,nyt://article/2f9a2356-a67d-5263-b297-52b6e338...,1219,nyt://article/2f9a2356-a67d-5263-b297-52b6e338...,Europe
50587,"unlike alpha, beta and delta, the name of the ...",https://www.nytimes.com/2021/11/30/world/omicr...,"Unlike Alpha, Beta and Delta, the name of the ...",Among the many unknowns surrounding the new co...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",how do you say ‘omicron’?,"[{'name': 'subject', 'value': 'English Languag...",2021-11-30T18:51:06+0000,article,Express,World,"{'original': 'By Christine Hauser', 'person': ...",News,nyt://article/e5022683-7baf-5cc1-8fe7-53dea5d4...,287,nyt://article/e5022683-7baf-5cc1-8fe7-53dea5d4...,


In [13]:
df5.drop(['web_url','snippet','lead_paragraph','print_section','print_page','source','multimedia','document_type','_id','uri'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
df6=df5.filter(['abstract','headline','pub_date'])
df6.reset_index(drop=True, inplace=True)
df6

Unnamed: 0,abstract,headline,pub_date
0,a highly contagious coronavirus variant first ...,florida health officials detect the state’s fi...,2021-01-01T03:44:59+0000
1,with a new variant of the virus emerging elsew...,virus numbers are surging. why is new york’s v...,2021-01-01T08:00:06+0000
2,test your knowledge of this week’s health news.,"weekly health quiz: a new coronavirus variant,...",2021-01-01T10:00:02+0000
3,our 7-day well challenge will show you how to ...,"for a healthier 2021, keep the best habits of ...",2021-01-01T10:00:13+0000
4,the coronavirus has disrupted supply chains in...,furnishing your home in a pandemic will requir...,2021-01-01T13:00:09+0000
...,...,...,...
6579,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30T16:03:10+0000
6580,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30T17:08:17+0000
6581,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30T18:14:44+0000
6582,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30T18:51:06+0000


In [15]:
df6['pub_date']= pd.to_datetime(df6['pub_date'], format = "%Y-%m-%d %H:%M:%S")
df6

Unnamed: 0,abstract,headline,pub_date
0,a highly contagious coronavirus variant first ...,florida health officials detect the state’s fi...,2021-01-01 03:44:59+00:00
1,with a new variant of the virus emerging elsew...,virus numbers are surging. why is new york’s v...,2021-01-01 08:00:06+00:00
2,test your knowledge of this week’s health news.,"weekly health quiz: a new coronavirus variant,...",2021-01-01 10:00:02+00:00
3,our 7-day well challenge will show you how to ...,"for a healthier 2021, keep the best habits of ...",2021-01-01 10:00:13+00:00
4,the coronavirus has disrupted supply chains in...,furnishing your home in a pandemic will requir...,2021-01-01 13:00:09+00:00
...,...,...,...
6579,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30 16:03:10+00:00
6580,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30 17:08:17+00:00
6581,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30 18:14:44+00:00
6582,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30 18:51:06+00:00


In [16]:
#Send dataframe to a CSV
df6.to_csv('data/NYTGeneraldata1.csv')

# Finance Data Yearly
#### Getting yearly data using Alpha Vantage API for Pfizer(pfe), Moderna(mrna), AstraZeneca(azn)

In [17]:
key='HHU6WKJFGNX07KWW'

ts = TimeSeries(key = key, output_format = 'csv')

data_azn = ts.get_daily_adjusted(symbol = 'AZN', outputsize='full')
df_azn = pd.DataFrame(list(data_azn[0]))

header_row=0
df_azn.columns = df_azn.iloc[header_row]
df_azn = df_azn.drop(header_row)
#df_azn['day_of_week']=df_azn['timestamp'].dt.day_name()

df_azn

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
1,2021-12-20,56.56,56.64,56.0,56.56,56.56,4098658,0.0000,1.0
2,2021-12-17,56.57,56.995,55.96,56.02,56.02,6035602,0.0000,1.0
3,2021-12-16,56.12,57.185,56.06,57.09,57.09,6855978,0.0000,1.0
4,2021-12-15,54.94,55.75,54.64,55.72,55.72,4834671,0.0000,1.0
5,2021-12-14,54.53,54.89,54.35,54.5,54.5,3585669,0.0000,1.0
...,...,...,...,...,...,...,...,...,...
5567,1999-11-05,46.31,46.44,45.44,45.69,10.0944041917,403000,0.0000,1.0
5568,1999-11-04,45.13,45.94,44.5,45.38,10.0259151284,282800,0.0000,1.0
5569,1999-11-03,45.44,45.63,44.38,45.31,10.0104498561,356000,0.0000,1.0
5570,1999-11-02,44.75,44.75,43.88,44.0,9.72102833078,362100,0.0000,1.0


In [18]:
data_pfe = ts.get_daily_adjusted(symbol = 'PFE', outputsize='full')

df_pfe = pd.DataFrame(list(data_pfe[0]))

header_row=0
df_pfe.columns = df_pfe.iloc[header_row]
df_pfe = df_pfe.drop(header_row)

df_pfe

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
1,2021-12-20,60.6,61.71,59.83,61.02,61.02,55712217,0.0000,1.0
2,2021-12-17,59.93,61.19,58.7,59.48,59.48,104320166,0.0000,1.0
3,2021-12-16,58.36,61.43,57.75,61.25,61.25,75183647,0.0000,1.0
4,2021-12-15,56.11,58.915,55.89,58.8,58.8,75202914,0.0000,1.0
5,2021-12-14,54.77,55.95,54.316,55.54,55.54,48685674,0.0000,1.0
...,...,...,...,...,...,...,...,...,...
5567,1999-11-05,36.25,36.25,34.25,34.75,15.5244869919,44375100,0.0000,1.0
5568,1999-11-04,38.63,39.06,37.0,37.25,16.6413565596,14065500,0.0000,1.0
5569,1999-11-03,39.0,39.31,38.38,38.56,17.2265962132,7114400,0.0000,1.0
5570,1999-11-02,38.62,39.25,38.12,38.12,17.0300271692,6623800,0.0000,1.0


In [19]:
data_mrna = ts.get_daily_adjusted(symbol = 'MRNA', outputsize='full')

df_mrna = pd.DataFrame(list(data_mrna[0]))

header_row=0
df_mrna.columns = df_mrna.iloc[header_row]
df_mrna = df_mrna.drop(header_row)


df_mrna

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
1,2021-12-20,321.305,321.305,275.12,276.38,276.38,16985330,0.0000,1.0
2,2021-12-17,275.57,296.39,269.0,294.8,294.8,10627523,0.0000,1.0
3,2021-12-16,284.2,287.1335,277.07,282.02,282.02,5974339,0.0000,1.0
4,2021-12-15,271.7994,290.35,271.55,283.28,283.28,9269533,0.0000,1.0
5,2021-12-14,264.295,278.33,257.2005,277.16,277.16,6444908,0.0000,1.0
...,...,...,...,...,...,...,...,...,...
761,2018-12-13,19.25,19.29,18.68,18.76,18.76,1608629,0.0000,1.0
762,2018-12-12,18.55,19.48,18.02,18.68,18.68,2590873,0.0000,1.0
763,2018-12-11,20.55,20.66,17.99,18.01,18.01,3435271,0.0000,1.0
764,2018-12-10,18.9,19.439,18.0,18.8,18.8,4233345,0.0000,1.0


#### Filtering data obtained above for AstraZeneca for the year 2021, getting percentage change in close price,converting the 'timestamp' column to datetime format and also adding a columns with name of days



In [20]:
azn = df_azn[df_azn.timestamp.str.contains("2021-")]
new_azn = azn[~azn.timestamp.str.contains("2021-12-")]
new_azn.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
azn_close = new_azn['adjusted_close']
new_azn["change"]= azn_close.astype(float).pct_change()
#change timestamp format
new_azn['timestamp']= pd.to_datetime(new_azn['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
new_azn['day_of_week']=new_azn['timestamp'].dt.day_name()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_azn.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_azn["change"]= azn_close.astype(float).pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_azn['timestamp']= pd.to_datetime(new_azn['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
A value is trying to be set on a 

#### Creating a new dataframe 'close', which only has "timestamp","day_of_week","adjusted_close" and "change" columns. Writing the new dataframe to csv file named 'AZN_close.csv'

In [21]:
close = new_azn[["timestamp","day_of_week","adjusted_close", "change"]]
close.to_csv('data/AZN_close.csv')

#### Filtering data obtained above for Pfizer for the year 2021, getting percentage change in close price, converting the 'timestamp' column to datetime format and and also adding a columns with name of days


In [22]:
#pd.to_numeric(df_pfe['adjusted_close'], downcast="float")
pfe = df_pfe[df_pfe.timestamp.str.contains("2021-")]
new_pfe = pfe[~pfe.timestamp.str.contains("2021-12-")]
new_pfe.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
pfe_close = new_pfe['adjusted_close']
new_pfe["change"]= pfe_close.astype(float).pct_change()

# #change timestamp format
new_pfe['timestamp']= pd.to_datetime(new_pfe['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
new_pfe['day_of_week']=new_pfe['timestamp'].dt.day_name()





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pfe.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pfe["change"]= pfe_close.astype(float).pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pfe['timestamp']= pd.to_datetime(new_pfe['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
A value is trying to be set on a 

#### Creating a new dataframe 'close', which only has "timestamp","day_of_week","adjusted_close" and "change" columns for Pfizer. Writing the new dataframe to csv file named 'PFE_close.csv'

In [23]:
close = new_pfe[["timestamp","day_of_week","adjusted_close", "change"]]
close.to_csv('data/PFE_close.csv')

#### Filtering data obtained above for Moderna for the year 2021, getting percentage change in close price, converting the 'timestamp' column to datetime format and and also adding a columns with name of days

In [24]:
mrna = df_mrna[df_mrna.timestamp.str.contains("2021-")]
new_mrna = mrna[~mrna.timestamp.str.contains("2021-12-")]
new_mrna.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
mrna_close = new_mrna['adjusted_close']
new_mrna["change"]= mrna_close.astype(float).pct_change()

#change timestamp format
new_mrna['timestamp']= pd.to_datetime(new_mrna['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
new_mrna['day_of_week']=new_mrna['timestamp'].dt.day_name()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_mrna.sort_values('timestamp', inplace=True, axis=0, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_mrna["change"]= mrna_close.astype(float).pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_mrna['timestamp']= pd.to_datetime(new_mrna['timestamp'], format = "%Y-%m-%d %H:%M:%S", utc=True)
A value is trying to be set 

#### Creating a new dataframe 'close', which only has "timestamp","day_of_week","adjusted_close" and "change" columns for Moderna. Writing the new dataframe to csv file named 'MRNA_close.csv'

In [25]:
close = new_mrna[["timestamp","day_of_week","adjusted_close", "change"]]
close.to_csv('data/MRNA_close.csv')

# Finance Data November 2021 
#### Using the same process as described above, but for the monthly data 

In [26]:
#key='HHU6WKJFGNX07KWW'

#ts = TimeSeries(key = key, output_format = 'csv')

data_azn = ts.get_intraday_extended(symbol = 'AZN', interval = '5min', slice = 'year1month1')
df_azn = pd.DataFrame(list(data_azn[0]))

header_row=0
df_azn.columns = df_azn.iloc[header_row]
df_azn = df_azn.drop(header_row)

#show output
df_azn

Unnamed: 0,time,open,high,low,close,volume
1,2021-12-20 18:05:00,56.6,56.6,56.6,56.6,100
2,2021-12-20 16:05:00,56.56,56.56,56.56,56.56,51160
3,2021-12-20 16:00:00,56.55,56.58,56.51,56.57,193022
4,2021-12-20 15:55:00,56.49,56.575,56.47,56.555,125128
5,2021-12-20 15:50:00,56.45,56.495,56.4036,56.485,77627
...,...,...,...,...,...,...
2346,2021-11-22 05:00:00,56.8,56.8,56.79,56.79,900
2347,2021-11-22 04:45:00,56.94,56.94,56.91,56.91,951
2348,2021-11-22 04:20:00,57.14,57.14,57.14,57.14,400
2349,2021-11-22 04:15:00,57.14,57.14,57.14,57.14,800


In [27]:
data_azn_add = ts.get_intraday_extended(symbol = 'AZN', interval = '5min', slice = 'year1month2')

df_azn_add = pd.DataFrame(list(data_azn_add[0]))

header_row=0
df_azn_add.columns = df_azn_add.iloc[header_row]
df_azn_add = df_azn_add.drop(header_row)

#show output
print(df_azn_add)

0                    time   open    high     low   close  volume
1     2021-11-19 19:35:00  56.86   56.88   56.86   56.88     500
2     2021-11-19 19:30:00   56.8   56.85    56.8   56.85     489
3     2021-11-19 16:05:00  56.66   56.66   56.66   56.66    7037
4     2021-11-19 16:00:00  56.68    56.7  56.645   56.66  182269
5     2021-11-19 15:55:00  56.64  56.705  56.625  56.685  126365
...                   ...    ...     ...     ...     ...     ...
2335  2021-10-22 07:30:00   62.0   62.02    62.0   62.02     908
2336  2021-10-22 07:25:00  61.98   61.99   61.96   61.97    3300
2337  2021-10-22 07:15:00  61.95   62.03   61.95   61.99    4268
2338  2021-10-22 07:10:00  61.93   61.98   61.93   61.98     706
2339  2021-10-22 07:05:00  61.81   61.85   61.81   61.84    7524

[2339 rows x 6 columns]


In [28]:
data_pfe = ts.get_intraday_extended(symbol = 'PFE', interval = '5min', slice = 'year1month1')

df_pfe = pd.DataFrame(list(data_pfe[0]))

header_row=0
df_pfe.columns = df_pfe.iloc[header_row]
df_pfe = df_pfe.drop(header_row)

#show output
print(df_pfe)

0                    time     open     high      low  close volume
1     2021-12-20 20:00:00    60.82    60.87    60.82  60.83   3750
2     2021-12-20 19:55:00    60.83    60.83  60.8001  60.82   3018
3     2021-12-20 19:50:00  60.8699  60.8699    60.83  60.83    530
4     2021-12-20 19:45:00    60.85    60.85    60.85  60.85   3169
5     2021-12-20 19:40:00    60.85    60.85    60.85  60.85    885
...                   ...      ...      ...      ...    ...    ...
3516  2021-11-22 04:25:00     51.2     51.2     51.2   51.2    342
3517  2021-11-22 04:20:00    51.09    51.12    51.05  51.11   5868
3518  2021-11-22 04:15:00    51.03    51.03    51.03  51.03    361
3519  2021-11-22 04:10:00    51.02    51.05    51.02  51.05   2658
3520  2021-11-22 04:05:00    51.02    51.03    51.02  51.03    681

[3520 rows x 6 columns]


In [29]:
data_pfe_add = ts.get_intraday_extended(symbol = 'PFE', interval = '5min', slice = 'year1month2')

df_pfe_add = pd.DataFrame(list(data_pfe_add[0]))

header_row=0
df_pfe_add.columns = df_pfe_add.iloc[header_row]
df_pfe_add = df_pfe_add.drop(header_row)

#show output
print(df_pfe_add)

0                    time           open           high            low  \
1     2021-11-19 20:00:00          50.94        50.9999          50.94   
2     2021-11-19 19:55:00          50.91          50.95          50.91   
3     2021-11-19 19:50:00        50.9501        50.9501          50.92   
4     2021-11-19 19:40:00          50.98           51.0          50.98   
5     2021-11-19 19:35:00          50.99           51.0          50.99   
...                   ...            ...            ...            ...   
3522  2021-10-22 05:20:00   42.541636528   42.541636528   42.541636528   
3523  2021-10-22 05:10:00  42.5218128391  42.5218128391  42.5218128391   
3524  2021-10-22 05:05:00  42.5119009946  42.5119009946  42.5119009946   
3525  2021-10-22 05:00:00   42.581283906   42.581283906  42.5713720615   
3526  2021-10-22 04:10:00  42.6209312839  42.6209312839  42.6209312839   

0             close volume  
1             50.94   5057  
2             50.94   2950  
3             50.93   28

In [30]:
data_mrna = ts.get_intraday_extended(symbol = 'MRNA', interval = '5min', slice = 'year1month1')

df_mrna = pd.DataFrame(list(data_mrna[0]))

header_row=0
df_mrna.columns = df_mrna.iloc[header_row]
df_mrna = df_mrna.drop(header_row)

#show output
df_mrna

Unnamed: 0,time,open,high,low,close,volume
1,2021-12-20 20:00:00,276.73,277.0,276.5,276.5,3052
2,2021-12-20 19:55:00,276.11,276.4999,276.0,276.4999,2830
3,2021-12-20 19:50:00,276.3799,276.5,276.33,276.33,1534
4,2021-12-20 19:45:00,276.6999,276.83,276.51,276.51,1071
5,2021-12-20 19:40:00,276.54,276.84,276.54,276.84,266
...,...,...,...,...,...,...
3173,2021-11-22 04:25:00,269.48,270.99,269.48,270.99,2638
3174,2021-11-22 04:20:00,269.09,269.5,269.0,269.5,906
3175,2021-11-22 04:15:00,268.6,269.5,268.6,269.11,2380
3176,2021-11-22 04:10:00,268.85,268.85,268.51,268.51,734


In [31]:
data_mrna_add = ts.get_intraday_extended(symbol = 'MRNA', interval = '5min', slice = 'year1month2')

df_mrna_add = pd.DataFrame(list(data_mrna_add[0]))

header_row=0
df_mrna_add.columns = df_mrna_add.iloc[header_row]
df_mrna_add = df_mrna_add.drop(header_row)


#show output
df_mrna_add

Unnamed: 0,time,open,high,low,close,volume
1,2021-11-19 20:00:00,263.0,263.3999,262.6,262.6,1280
2,2021-11-19 19:55:00,262.8,262.8,262.8,262.8,508
3,2021-11-19 19:45:00,263.2,263.2,263.2,263.2,102
4,2021-11-19 19:40:00,262.75,262.99,262.75,262.99,1146
5,2021-11-19 19:35:00,262.75,262.75,262.75,262.75,161
...,...,...,...,...,...,...
3042,2021-10-22 04:30:00,332.85,332.85,332.85,332.85,102
3043,2021-10-22 04:20:00,332.88,332.88,332.88,332.88,333
3044,2021-10-22 04:15:00,332.01,332.01,332.0,332.0,512
3045,2021-10-22 04:10:00,333.1,333.5,332.0,333.5,1662


In [32]:
astrazeneca = [df_azn, df_azn_add]
azn = pd.concat(astrazeneca)
azn_new = azn[~azn.time.str.contains("2021-12-")]
final_azn = azn_new[~azn_new.time.str.contains("2021-10-")]
final_azn.sort_values("time", inplace = True)
final_azn.set_index("time", inplace = True)
final_azn["change"]= final_azn.close.astype(float).pct_change()
final_azn = final_azn[["close", "change"]]
final_azn.to_csv('data/AZN_monthly.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_azn.sort_values("time", inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_azn["change"]= final_azn.close.astype(float).pct_change()


In [43]:
pfizer = [df_pfe, df_pfe_add]
pfe = pd.concat(pfizer)
pfe_new = pfe[~pfe.time.str.contains("2021-12-")]
final_pfe = pfe_new[~pfe_new.time.str.contains("2021-10-")]
final_pfe.sort_values("time", inplace = True)
final_pfe.set_index("time", inplace = True)
final_pfe["change"]= final_pfe.close.astype(float).pct_change()
final_pfe = final_pfe[["close", "change"]]
final_pfe.to_csv('data/PFE_monthly.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pfe.sort_values("time", inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pfe["change"]= final_pfe.close.astype(float).pct_change()


In [34]:
moderna = [df_mrna, df_mrna_add]
mrna = pd.concat(moderna)
mrna_new = mrna[~mrna.time.str.contains("2021-12-")]
final_mrna = mrna_new[~mrna_new.time.str.contains("2021-10-")]
final_mrna.sort_values("time", inplace = True)
final_mrna.set_index("time", inplace = True)
final_mrna["change"]= final_mrna.close.astype(float).pct_change()
final_mrna = final_mrna[["close", "change"]]
final_mrna.to_csv('data/MRNA_monthly.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_mrna.sort_values("time", inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_mrna["change"]= final_mrna.close.astype(float).pct_change()


# Guardian data

### Occurrence analysis data

#### Extracting headlines and published dates of articles about covid from the Guardian API, sending to a CSV called 'Guardian_articles_covid.csv'. This will later be used in occurrence analysis, to determine how often covid-related words appear in headlines over time.
   

NB You may need to restart the kernel and run this cell again if you run into errors - this is due to an issue with the API.

In [44]:
# File to put API responses into
response_file = open('data/Guardian_articles_covid.csv','w')
from_date = '2021-01-01'
end_date = '2021-12-09'
query = 'covid'

# Get number of pages returned by response (so can loop through them later).
API_Key = '6e7020bf-248a-4bbf-840d-3b737649c9f8' #Put personal API key here
URL_1 = 'https://content.guardianapis.com/search?from-date={}&to-date={}&q={}&page-size=200&page=1&api-key={}'.format(from_date,end_date,query,API_Key)
responses_1 = (requests.request("GET", URL_1)).json()
no_of_pages = responses_1['response']['pages']
no_of_results = responses_1['response']['total']
print("No. of pages = "+str(no_of_pages))
print("No. of results = "+str(no_of_results))
response_file.write('Title,DateTimePublished\n')

# Loop through pages and through results in each page, write title (headline) to csv

for i in range(1, no_of_pages+1):
    URL = 'https://content.guardianapis.com/search?from-date={}&to-date={}&page-size=200&q={}&page={}&api-key={}'.format(from_date,end_date,query,
        i, API_Key)
    responses = (requests.request("GET", URL)).json()
    #print(responses)
    try:
        status = responses['response']['status']
        page = responses['response']['currentPage']
    except KeyError as err:
        print('Error: '+ str(err))
        pass
    if status == 'ok' and page != no_of_pages:
        for j in range(0, 199):
            x = responses['response']['results'][j]['webTitle']
            title = (re.sub(r'[^\w\s]', '', x)).lower()
            date = responses['response']['results'][j]['webPublicationDate']
            response_file.write(title +','+date + '\n')
    else:
        #print(status)
        for j in range(0, no_of_results%200):
            try:
                x = responses['response']['results'][j]['webTitle']
                title = (re.sub(r'[^\w\s]', '', x)).lower()
                response_file.write(title + ',' + date + '\n')
            except IndexError or KeyError as err1:
                print("Error: "+str(err1))

No. of pages = 114
No. of results = 22736
Error: 'currentPage'


KeyError: 'results'

### Word Cloud data

#### Extracting all Guardian article headlines published each quarter of 2021 and putting them into separate CSVs (called 'Guardian_articles_Qn.csv'. This will later be used to build word clouds showing the most mentioned words in the news each quarter.

In [37]:
# Defining a function to extract all Guardian headlines from the Guardian API, given a start and end date, and a file to put the responses into
def extract_headlines(response_file, from_date, end_date):
    # # Get number of pages returned by response (so can loop through them later).
    API_Key = '28e80572-27de-49b8-a096-6725a36b9bab' #Put personal API key here
    URL_1 = 'https://content.guardianapis.com/search?from-date={}&to-date={}&page-size=200&page=1&api-key={}'.format(from_date,end_date,API_Key)
    responses_1 = (requests.request("GET", URL_1)).json()
    no_of_pages = responses_1['response']['pages']
    no_of_results = responses_1['response']['total']
    
# Print no. of pages and results to the console as a check
    print("No. of pages = "+str(no_of_pages))
    print("No. of results = "+str(no_of_results))

    # Loop through pages and through results in each page, write title to csv

    for i in range(1, no_of_pages+1):
        URL = 'https://content.guardianapis.com/search?from-date={}&to-date={}&page-size=200&page={}&api-key={}'.format(from_date,end_date,
            i, API_Key)
        responses = (requests.request("GET", URL)).json()
        try:
            status = responses['response']['status']
            page = responses['response']['currentPage']
        except KeyError as err:
            print('Error: '+ err)
        if status == 'ok' and page != no_of_pages:
            for j in range(0, 199):
                title = (responses['response']['results'][j]['webTitle'])
                response_file.write(title + '\n')
        else:
            for j in range(0, no_of_results%200):
                try:
                    title = responses['response']['results'][j]['webTitle']
                    response_file.write(title + '\n')
                except IndexError as err1:
                    print("Error: "+str(err1))

In [38]:
# Q1
extract_headlines(response_file = (open('data/Guardian_articles_Q1.csv','w')), from_date = '2021-01-01', end_date = '2021-03-31')

No. of pages = 96
No. of results = 19028


In [39]:
# Q2
extract_headlines(response_file = (open('data/Guardian_articles_Q2.csv','w')), from_date = '2021-04-01', end_date = '2021-06-30')

No. of pages = 97
No. of results = 19203


In [40]:
# Q3
extract_headlines(response_file = (open('data/Guardian_articles_Q3.csv','w')), from_date = '2021-07-01', end_date = '2021-09-30')

No. of pages = 97
No. of results = 19245


In [41]:
# Q4 - NB cut off of 12th December used, as this is when our analysis was conducted
extract_headlines(response_file = (open('data/Guardian_articles_Q4.csv','w')), from_date = '2021-10-01', end_date = '2021-12-12')

No. of pages = 81
No. of results = 16017


#### Getting word frequencies from headline articles each quarter - using the 4 CSVs made previously, and creating a CSV for each quarter, with words and the number of times they appear in the CSV

In [42]:
# List of irrelevant words to exclude from analysis
filler_words = ['crossword','amid','way','over','could','day','years','into','may','must','right','been','go','week','time','make','back','happened','so','do','our','about','but','if','now','should','off','our','cant','got','have','dont','than','more','get','an','does','just','say','take','met','had','be','im','like','isnt','any','some','meet','were','them','this','then','from','two','three','how','out','are','said','can','with','is','me','says','all','we','no','was','that','not','one','its','us','has','my','will','on','in','they','their','you','your','by','at','and','a','i','because','it','too','to','for','of','his','her','he','him','she','the','or','before','after','as','why','what','who','when']


def extract_word_frequencies(input_file,output_file):
    string_file = input_file.read()

    # Cleaning data - removing punctuation, making all lower case, getting rid of line breaks and turning into list
    stripped_response = (re.sub(r'[^\w\s]', '', string_file)).lower()
    new_stripped_response = stripped_response.replace("\n"," ")
    words = new_stripped_response.split(" ")

    # Making empty dictionary to add to later, turning list of words into set (so we get unique set of words to list through)
    dictionary = {}
    set_of_words = set(words)

    # Looping through set of words (excluding numbers and filler words) and getting count - adding the results to a dictionary
    for i in set_of_words:
        if i.isalpha() and i not in filler_words:
            dictionary.update({i: words.count(i)})

    # Only showing words which appear more than 40 times
    shorter_dict = {k:v for (k,v) in dictionary.items() if v > 40}

    # Sort list by word count descending (make into list to do this, then back into dictionary)
    marklist = sorted(shorter_dict.items(), key=lambda x:x[1], reverse=True)
    sorted_dict = dict(marklist)

    with output_file as f:
        writer = csv.writer(f)
        f.write("Word,Count\n")
        for k, v in sorted_dict.items():
           writer.writerow([k, v])

In [53]:
# Q1
extract_word_frequencies(input_file = open('data/Guardian_articles_Q1.csv','r'), output_file = open('Q1_Analysis.csv', 'w'))

In [54]:
# Q2
extract_word_frequencies(input_file = open('data/Guardian_articles_Q2.csv','r'), output_file = open('Q2_Analysis.csv', 'w'))

In [55]:
# Q3
extract_word_frequencies(input_file = open('data/Guardian_articles_Q3.csv','r'), output_file = open('Q3_Analysis.csv', 'w'))

In [56]:
# Q4
extract_word_frequencies(input_file = open('data/Guardian_articles_Q4.csv','r'), output_file = open('Q4_Analysis.csv', 'w'))