In [20]:
import re

In [1]:
# Download the NYT article that the rest of the notebook scrapes.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast on an HTTP error (4xx/5xx) instead of letting the
# following cells parse an error page.
import requests
req = requests.get(
    "https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html",
    timeout=30,
)
req.raise_for_status()

In [2]:
# Build a BeautifulSoup tree from the downloaded HTML, using Python's built-in "html.parser"
from bs4 import BeautifulSoup
page_html = req.text
bsoup = BeautifulSoup(page_html, "html.parser")

In [3]:
# Collect every <span> tag whose class is "short-desc"; each one wraps a single sentence.
# class_ is BeautifulSoup's keyword shortcut for filtering on the "class" attribute.
sentences = bsoup.find_all("span", class_="short-desc")

In [4]:
# Sanity check: number of "short-desc" spans (sentences) found on the page
len(sentences)

180

In [5]:
# This data is pretty untidy; to make it tidy and usable, extract from each sentence:

# 1. the date
# 2. the lie
# 3. the reason why it is a lie
# 4. the hyperlink included in the reason

# These four fields are what we need to create a dataset.

In [6]:
# Pull the date out of the first sentence: it is the text of the <strong> tag
# with its trailing character dropped; the year is appended by hand
first_sentence = sentences[0]
date_tag = first_sentence.find("strong")
date_tag.text[:-1] + ", 2017"

'Jan. 21, 2017'

In [7]:
# Pull the lie out of the first sentence.
# .contents lists the children of the tag (nested tags and strings); the lie is the second child.
raw_lie = first_sentence.contents[1]
raw_lie[1:-2]  # slice off the surrounding quotation marks (one leading character, two trailing)

"I wasn't a fan of Iraq. I didn't want to go into Iraq."

In [8]:
# Pull the reason out of the first sentence: it is the text of the <a> tag,
# minus its first and last characters (the enclosing parentheses)
reason_link = first_sentence.find("a")
reason_link.text[1:-1]

'He was for an invasion before he was against it.'

In [9]:
# Pull the hyperlink out of the first sentence: it is the "href" attribute of the <a> tag
source_link = first_sentence.find("a")
source_link["href"]

'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

In [10]:
# Repeat the four extractions shown above for every sentence,
# collecting one (date, lie, reason, hyperlink) tuple per sentence

all_items = []
for entry in sentences:
    anchor = entry.find("a")  # first <a> tag: supplies both the reason text and the URL
    all_items.append(
        (
            entry.find("strong").text[:-1] + ", 2017",  # date, year added by hand
            entry.contents[1][1:-2],                    # lie, surrounding quote marks sliced off
            anchor.text[1:-1],                          # reason, enclosing parentheses sliced off
            anchor["href"],                             # hyperlink
        )
    )




In [11]:
# all_items is a list with one tuple per sentence; each tuple has length 4: (date, lie, reason, hyperlink)
len(all_items)

180

In [12]:
# Turn the list of tuples into a DataFrame, naming one column per tuple position
import pandas as pd
column_names = ["date", "lie", "reason", "hyperlink"]
df = pd.DataFrame(data=all_items, columns=column_names)

In [13]:
# Preview the first rows to check that the four columns were filled in
df.head()

Unnamed: 0,date,lie,reason,hyperlink
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,"Jan. 25, 2017","Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,"Jan. 25, 2017",Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...


In [14]:
# Parse the date strings into pandas datetime values and store them back in the "date" column
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates

In [15]:
# Preview the last rows; the "date" column now shows the converted datetime values
df.tail()

Unnamed: 0,date,lie,reason,hyperlink
175,2017-10-25,We have trade deficits with almost everybody.,We have trade surpluses with more than 100 cou...,https://www.bea.gov/newsreleases/international...
176,2017-10-27,"Wacky & totally unhinged Tom Steyer, who has b...",Steyer has financially supported many winning ...,https://www.opensecrets.org/donor-lookup/resul...
177,2017-11-01,"Again, we're the highest-taxed nation, just ab...",We're not.,http://www.politifact.com/truth-o-meter/statem...
178,2017-11-07,When you look at the city with the strongest g...,"Several other cities, including New York and L...",http://www.politifact.com/truth-o-meter/statem...
179,2017-11-11,"I'd rather have him – you know, work with him...","There is no evidence that Democrats ""set up"" R...",https://www.nytimes.com/interactive/2017/12/10...


In [16]:
# Save the dataset as a UTF-8 CSV file (without the row index) for possible future analysis
df.to_csv(path_or_buf="example3code.csv", encoding="utf-8", index=False)

In [17]:
# Distribution of the dates across months.
# pd.to_datetime replaces the former .astype("datetime64"): casting to a unit-less
# "datetime64" dtype raises a TypeError on pandas >= 2.0, while to_datetime works on
# old and new versions alike and leaves an already-converted column unchanged.
df_date = pd.to_datetime(df["date"])  # the date column as a Series of datetime values
df_date.groupby(df_date.dt.month).count().plot(kind="bar")  # count rows per month number and plot as bars


<matplotlib.axes._subplots.AxesSubplot at 0x1c56f90e048>

In [46]:

# Bias check: which websites do the links come from, how many are self-references,
# and how many of those websites are openly left wing?

# Only the source matters, not the entire web address, so take the hyperlink column
# as a plain list of strings (rather than a pandas Series) for the string clean-up below
df_hyperlink = df["hyperlink"].tolist()


In [47]:
# First strip the scheme and the "www." prefix from the start of every link.
# Each prefix is removed only when the link actually starts with it: the previous
# str.replace() deleted the text anywhere in the address, which could corrupt a
# host or path that merely contains "www." or "http://" further in.

list_1 = ["http://", "https://www.", "https://", "www."]  # prefixes to strip, tried in this order
for position, link in enumerate(df_hyperlink):  # enumerate supplies the index; no manual counter needed
    for element in list_1:
        if link.startswith(element):
            link = link[len(element):]  # drop just the matched prefix
    df_hyperlink[position] = link  # write the cleaned link back into the same list



In [48]:
# Keep only the main domain: cut each link at its first forward slash
separator = '/'  # everything from this character onwards is discarded
for position, link in enumerate(df_hyperlink):
    # partition() splits at the first separator; element 0 is the text before it
    # (or the whole link when there is no separator)
    df_hyperlink[position] = link.partition(separator)[0]

In [50]:
# Wrap the cleaned list back into a pandas Series so that value_counts() can be used on it
df_hyperlink = pd.Series(df_hyperlink)

In [51]:
# Count how many links point to each domain, most frequent first
df_hyperlink.value_counts()

nytimes.com                57
washingtonpost.com         39
politifact.com             31
factcheck.org              10
content.govdelivery.com     5
cnn.com                     4
usatoday.com                3
time.com                    3
buzzfeed.com                2
money.cnn.com               2
realclearpolitics.com       2
bea.gov                     2
snopes.com                  2
dnainfo.com                 1
washingtonmonthly.com       1
nbcnews.com                 1
transcripts.cnn.com         1
warontherocks.com           1
markets.on.nytimes.com      1
chicagotribune.com          1
pbs.org                     1
businessinsider.com         1
nation.time.com             1
thehill.com                 1
heritage.org                1
palmbeachpost.com           1
opensecrets.org             1
pewresearch.org             1
talkingpointsmemo.com       1
public.tableau.com          1
mdjonline.com               1
dtype: int64