In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
import requests
from ps_wd import pswd
from time import sleep
from random import randint

In [3]:
dow_csv = "Resources/DowJonesIndex.csv"
dow_df = pd.read_csv(dow_csv)
dow_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1/8/2016,16519.16992,16651.89063,16314.57031,16346.4502,16346.4502,141850000
1,1/11/2016,16358.70996,16461.84961,16232.03027,16398.57031,16398.57031,127790000
2,1/12/2016,16419.10938,16591.34961,16322.07031,16516.2207,16516.2207,117480000
3,1/13/2016,16526.63086,16593.50977,16123.2002,16151.41016,16151.41016,153530000
4,1/14/2016,16159.00977,16482.05078,16075.12012,16379.04981,16379.04981,158830000


In [4]:
new_dow_df = dow_df[['Date', 'Open', 'Close']].copy()
new_dow_df.head()

Unnamed: 0,Date,Open,Close
0,1/8/2016,16519.16992,16346.4502
1,1/11/2016,16358.70996,16398.57031
2,1/12/2016,16419.10938,16516.2207
3,1/13/2016,16526.63086,16151.41016
4,1/14/2016,16159.00977,16379.04981


In [5]:
new_dow_df['Change'] = new_dow_df['Open'] - new_dow_df['Close']
new_dow_df.head()

Unnamed: 0,Date,Open,Close,Change
0,1/8/2016,16519.16992,16346.4502,172.71972
1,1/11/2016,16358.70996,16398.57031,-39.86035
2,1/12/2016,16419.10938,16516.2207,-97.11132
3,1/13/2016,16526.63086,16151.41016,375.2207
4,1/14/2016,16159.00977,16379.04981,-220.04004


In [6]:
engine = create_engine('postgresql://postgres:'+ pswd + '@localhost:5432/dow_jones_db')
connection = engine.connect()


In [7]:
engine.table_names()

['news_titles', 'dow_jones']

In [8]:
new_dow_df.to_sql(name='dow_jones', con=engine, if_exists='replace', index=False)

In [9]:
pd.read_sql_query('select * from dow_jones', con=engine).head()

Unnamed: 0,Date,Open,Close,Change
0,1/8/2016,16519.16992,16346.4502,172.71972
1,1/11/2016,16358.70996,16398.57031,-39.86035
2,1/12/2016,16419.10938,16516.2207,-97.11132
3,1/13/2016,16526.63086,16151.41016,375.2207
4,1/14/2016,16159.00977,16379.04981,-220.04004


In [10]:
# URL of page to be scraped
url = 'https://www.cnn.com/article/sitemap-2016-1.html'

In [11]:
# Retrieve page with the requests module
response = requests.get(url)

In [12]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
# Examine the results, then determine element that contains sought info
print(soup.prettify())

<!DOCTYPE html>
<html class="sitemap">
 <head>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8"/>
  <meta content="text/html" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>
  <link href="/optimizelyjs/131788053.js" rel="dns-prefetch"/>
  <link href="//tpc.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//pagead2.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//www.googletagservices.com" rel="dns-prefetch"/>
  <link href="//partner.googleadservices.com" rel="dns-prefetch"/>
  <link href="//www.google.com" rel="dns-prefetch"/>
  <link href="//aax.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//c.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//cdn.krxd.net" rel="dns-prefetch"/>
  <link href="//ads.rubiconproject.com" rel="dns-prefetch"/>
  <link href="//optimized-by.rubiconproject.com" rel="dns-prefetch"/>
  <link href="//fastlane.rubi

In [14]:
# results are returned as an iterable list
title_results = soup.find_all('span', class_="sitemap-link")
title_results

[<span class="sitemap-link sitemap-link-head">Title</span>,
 <span class="sitemap-link"><a href="https://www.cnn.com/2016/01/30/politics/ted-cruz-ronald-reagan-john-mccain-1980/index.html">McCain on Cruz claim: 'It's an outright lie'</a></span>,
 <span class="sitemap-link"><a href="https://www.cnn.com/2016/01/31/entertainment/britain-wogan-dead/index.html">Terry Wogan: Veteran broadcaster dies</a></span>,
 <span class="sitemap-link"><a href="https://www.cnn.com/2016/01/31/politics/iowa-caucuses-best-quotes/index.html">9 great quotes about the Iowa caucuses</a></span>,
 <span class="sitemap-link"><a href="https://www.cnn.com/2016/01/30/us/missing-teen-arrest-virginia/index.html">Missing teen found dead; Virginia Tech student charged</a></span>,
 <span class="sitemap-link"><a href="https://www.cnn.com/2016/01/30/politics/donald-trump-jerry-falwell-iowa-2016/index.html">Jerry Falwell Jr. cites his father in Trump endorsement</a></span>,
 <span class="sitemap-link"><a href="https://www.cnn

In [15]:
# Loop through returned results
title_list = []
for result in title_results:
    title = result.text
    title_list.append(title)
    print(title)

Title
McCain on Cruz claim: 'It's an outright lie'
Terry Wogan: Veteran broadcaster dies
9 great quotes about the Iowa caucuses
Missing teen found dead; Virginia Tech student charged
Jerry Falwell Jr. cites his father in Trump endorsement
ISIS claims responsibility for deadly blasts in Damascus suburb
First on CNN: Main Cruz super PACs struggled to raise money in fall and winter
What's streaming on Netflix, Amazon Prime, Hulu in February
Sometimes all it takes is one person to see what you don't
Cruz targets Rubio ahead of Iowa caucuses
CNN/WMUR poll: Trump, Sanders still up in New Hampshire
Major Iowa poll shows Trump, Clinton holding narrow leads
Graham on Cruz's pitch: 'Vote for me, everybody else is an idiot'
Sanders: Clinton emails 'very serious issue'
Sanders unhappy at Clinton's 'tone' as attacks mount
How to stop presidential candidates from lying
Iowa's secretary of state rips Cruz over campaign mailer
Woman's body, 2 live monkeys found in Florida motel room
Clinton exudes con

White House tries to quell Democratic rebellion over immigration raids
When it comes to dating sites, race matters
Where did Democrats and Republicans agree at the State of the Union address
Israel: 4 charged in 'lynching' of Eritrean migrant mistaken for terrorist
'Wolves of the sea': The vanishing fishermen of Greece's Paros island
Nikki Haley to Donald Trump: Turn down the volume
Who said it: Nikki Haley or Barack Obama
Should college fire hijab-wearing professor?
Steven Avery, subject of 'Making a Murderer' documentary, files appeals
Donald Trump's Texas state director leaves campaign
State of the Union: Obama's complicated victory lap
Clinton stands by Obama, challenges Sanders on guns in State of the Union ad
Senate to consider suspending Syrian refugee program
Supreme Court hears Iran victims compensation case
Republican response to State of the Union address: Transcript
Nikki Haley's condition: It's my speech
Haley to Trump: Don't take it personally
U.S. knew of actors' connect

In [16]:
title_df = DataFrame(title_list, columns =['Title'])
title_df

Unnamed: 0,Title
0,Title
1,McCain on Cruz claim: 'It's an outright lie'
2,Terry Wogan: Veteran broadcaster dies
3,9 great quotes about the Iowa caucuses
4,Missing teen found dead; Virginia Tech student...
...,...
2233,21 achievable New Year's resolutions for your ...
2234,What could happen to Bill Cosby
2235,Air India flight turns back over rat scare
2236,Transylvania? Kotor? Lonely Planet's best trav...


In [17]:
# results are returned as an iterable list
date_results = soup.find_all('span', class_="date")
date_results

[<span class="date date-head">Date</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2016-01-31</span>,
 <span class="date">2

In [18]:
# Loop through returned results
date_list = []
for result in date_results:
    date = result.text
    date_list.append(date)
    print(date)
   
    

Date
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-31
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-01-30
2016-

2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-08
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07
2016-01-07

In [19]:
date_df = DataFrame(date_list, columns =['Date'])
date_df

Unnamed: 0,Date
0,Date
1,2016-01-31
2,2016-01-31
3,2016-01-31
4,2016-01-31
...,...
2233,2016-01-01
2234,2016-01-01
2235,2016-01-01
2236,2016-01-01


In [20]:
title_df['tmp'] = 1
date_df['tmp'] = 1

combined_news_df = pd.merge(title_df, date_df, on=['tmp'])
combined_news_df = combined_news_df.drop('tmp', axis=1)
combined_news_df

Unnamed: 0,Title,Date
0,Title,Date
1,Title,2016-01-31
2,Title,2016-01-31
3,Title,2016-01-31
4,Title,2016-01-31
...,...,...
5008639,China says it's building new homegrown aircraf...,2016-01-01
5008640,China says it's building new homegrown aircraf...,2016-01-01
5008641,China says it's building new homegrown aircraf...,2016-01-01
5008642,China says it's building new homegrown aircraf...,2016-01-01


In [21]:
concat_news_df = pd.concat([title_df, date_df ])
concat_news_df

Unnamed: 0,Title,tmp,Date
0,Title,1,
1,McCain on Cruz claim: 'It's an outright lie',1,
2,Terry Wogan: Veteran broadcaster dies,1,
3,9 great quotes about the Iowa caucuses,1,
4,Missing teen found dead; Virginia Tech student...,1,
...,...,...,...
2233,,1,2016-01-01
2234,,1,2016-01-01
2235,,1,2016-01-01
2236,,1,2016-01-01


In [22]:
# URL of page to be scraped
# https://www.cnn.com/article/sitemap-2016-1.html
#url = 'https://www.cnn.com/article/sitemap-' + year + '-' + month + '.html'

In [23]:
# Retrieve page with the requests module
#response = requests.get(url)

In [24]:
years = [2016, 2017, 2018, 2019, 2020]
years = np.arange(2016, 2021)
month = np.arange(1, 13)

In [25]:
years = np.arange(2016, 2018)
months = np.arange(1, 3)

title_list2 = []

for year in years:
    for month in months: 
  
        response_title = requests.get("https://www.cnn.com/article/sitemap-" + str(year) + '-' + str(month) + ".html")
  
        soup_title = BeautifulSoup(response_title.text, 'html.parser')
  
        title_results2 = soup.find_all('span', class_="sitemap-link")
        
        for result in title_results2:
            title = result.text
            title_list2.append(title)
title_list2 = []    

In [26]:
title_list2 = []

In [27]:
len(title_list2)

0

In [28]:
# Loop through returned results
title_list2 = []
for result in title_results2:
    title = result.text
    title_list2.append(title)
title_list2

['Title',
 "McCain on Cruz claim: 'It's an outright lie'",
 'Terry Wogan: Veteran broadcaster dies',
 '9 great quotes about the Iowa caucuses',
 'Missing teen found dead; Virginia Tech student charged',
 'Jerry Falwell Jr. cites his father in Trump endorsement',
 'ISIS claims responsibility for deadly blasts in Damascus suburb',
 'First on CNN: Main Cruz super PACs struggled to raise money in fall and winter',
 "What's streaming on Netflix, Amazon Prime, Hulu in February",
 "Sometimes all it takes is one person to see what you don't",
 'Cruz targets Rubio ahead of Iowa caucuses',
 'CNN/WMUR poll: Trump, Sanders still up in New Hampshire',
 'Major Iowa poll shows Trump, Clinton holding narrow leads',
 "Graham on Cruz's pitch: 'Vote for me, everybody else is an idiot'",
 "Sanders: Clinton emails 'very serious issue'",
 "Sanders unhappy at Clinton's 'tone' as attacks mount",
 'How to stop presidential candidates from lying',
 "Iowa's secretary of state rips Cruz over campaign mailer",
 "W

In [29]:
len(title_list2)


2238

In [30]:
title_list2[2237]

"China says it's building new homegrown aircraft carrier"

In [31]:
# response_list=[]
# for year in years:
#     for month in range(13):
#         # Get one of the posts
#         response = requests.get('https://www.cnn.com/article/sitemap-' + str(year) + '-' + str(month) + '.html')
        
#         # Save post
#         response_list.append(response)

In [32]:
# for page in response_list:
#     text = page.text
#     soup2 = BeautifulSoup(text, 'html.parser')

In [33]:
# soup2 = BeautifulSoup(response_list[0].text, 'html.parser')
# print(soup2.prettify())

In [34]:
# title_results2 = soup2.find_all('span', class_="sitemap-link")
# title_results2

In [40]:
#years = np.arange(2016, 2018)
months = np.arange(1, 3)

title_list3 = []

#for year in years:
for month in months: 
  
        response_title2 = requests.get("https://www.cnn.com/article/sitemap-2016-" + str(month) + ".html")
  
        soup_title2 = BeautifulSoup(response_title2.text, 'html.parser')
  
        title_results3 = soup_title2.find_all('span', class_="sitemap-link")
        
        for result in title_results3:
            title = result.text
            title_list3.append(title)
title_list3 = []    

In [39]:
len(title_list3)

0