In [1]:
import datetime
import os

import requests
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.functions import col, explode

# Obtain the session through the builder so this cell can be re-run in a live
# kernel: getOrCreate() returns the already-running session instead of trying
# to construct a second SparkContext, which PySpark rejects.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available under its original name for any code that uses it.
sc = spark.sparkContext

In [5]:
#### Setting up Time Frame

# Inclusive date range; both bounds are written as 'YYYY-MM-DD'.
start_str = '2019-04-06'
end_str = '2019-04-11'

# Parse the whole string rather than slicing single characters. The previous
# slices [6:7] and [9:10] kept only the last digit of the month and of the day,
# so '2019-04-11' was read as April 1st and months 10-12 (or days ending in 0)
# produced a wrong date or a ValueError.
start_date = datetime.datetime.strptime(start_str, '%Y-%m-%d').date()
end_date = datetime.datetime.strptime(end_str, '%Y-%m-%d').date()

# Number of calendar days in the range, counting both endpoints.
days_2_loop = abs((end_date - start_date).days) + 1

In [8]:
#### Downloading and Saving News into JSON

# Sources
news_sources = ['bloomberg', 'cnbc', 'reuters', 'the-wall-street-journal', 'fortune']

# The NewsAPI key is a credential: read it from the environment instead of
# committing it with the notebook.
apiKey = os.environ.get('NEWSAPI_KEY')
if not apiKey:
    raise RuntimeError('Set the NEWSAPI_KEY environment variable before running this cell')

# Make sure the target directory exists so the first run does not fail on open().
output_dir = 'news_data_apple_json'
os.makedirs(output_dir, exist_ok=True)

# Loop thru each Source and each day of the time frame: one request and one
# output file per (source, day) pair.
for source in news_sources:
    for day in range(days_2_loop):
        date = str(start_date + datetime.timedelta(days=day))

        # Let requests build and URL-encode the query string.
        # NOTE(review): only 'from' is sent, so each query is open-ended on the
        # upper side - confirm whether a 'to' bound was intended.
        response = requests.get(
            'https://newsapi.org/v2/everything',
            params={
                'q': 'Apple',
                'sources': source,
                'from': date,
                'sortBy': 'popularity',
                'apiKey': apiKey,
                'page': 1,
                'pageSize': 100,
            },
            timeout=30,
        )

        # An error payload is not article data: report it and move on to the
        # next request instead of saving it next to the real results.
        if not response.ok:
            print('Skipping ' + source + ' ' + date + ': HTTP ' + str(response.status_code))
            continue

        # Open the file only once there is something to write, and let the
        # context manager close it even if the write fails.
        json_path = os.path.join(output_dir, source + '_' + date + '.json')
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json_file.write(response.text)

https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-06&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-07&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-08&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-09&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-10&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=bloomberg&from=2019-04-11&sortBy=popularity&apiKey=713cda0a7da543909c73cf9b06b3a158&page=1&pageSize=100
https://newsapi.org/v2/everything?q=Apple&sources=cnbc&from=2019-04-06&sortB

In [9]:
#### Read all JSON Files

# Every file in the download directory whose name ends in "json" is loaded
# into a single DataFrame; the first five rows are printed as a quick check.
json_glob = "news_data_apple_json/*json"
df_temp = spark.read.json(json_glob)
df_temp.show(5)

+--------------------+----+-------+------+------------+
|            articles|code|message|status|totalResults|
+--------------------+----+-------+------+------------+
|[[Reuters Editori...|null|   null|    ok|          38|
|[[Reuters Editori...|null|   null|    ok|          28|
|[[Todd Haselton, ...|null|   null|    ok|          21|
|[[Todd Haselton, ...|null|   null|    ok|          19|
|[[Sankalp Phartiy...|null|   null|    ok|          17|
+--------------------+----+-------+------+------------+
only showing top 5 rows



In [11]:
#### Convert nested DF into exploded DF

# Fields of each article struct that are promoted to top-level columns.
article_fields = ('author', 'title', 'description', 'publishedAt', 'url', 'content')

# One output row per element of the `articles` array.
df_temp_3 = df_temp.withColumn('articles_nested', explode(col('articles')))

# Copy each nested field into a column of the same name.
for field_name in article_fields:
    df_temp_3 = df_temp_3.withColumn(field_name, col('articles_nested.' + field_name))

# `Date` is the first 10 characters of `publishedAt`.
df_temp_3 = df_temp_3.withColumn('Date', col('publishedAt').substr(1, 10))

# The nested source columns are no longer needed once flattened.
df_temp_3 = df_temp_3.drop('articles', 'articles_nested')

# Final column order for export.
export_columns = ['Date', 'status', 'totalResults'] + list(article_fields)
df_write = df_temp_3.select(*export_columns)

df_write.show()

+----------+------+------------+-----------------+------------------------------+--------------------------------+--------------------+--------------------+--------------------+
|      Date|status|totalResults|           author|                         title|                     description|         publishedAt|                 url|             content|
+----------+------+------------+-----------------+------------------------------+--------------------------------+--------------------+--------------------+--------------------+
|2019-04-03|    ok|          38|Reuters Editorial|          Exclusive: Japan ...|            Japan Display Inc...|2019-04-03T05:59:18Z|https://www.reute...|TOKYO (Reuters) -...|
|2019-04-02|    ok|          38|Reuters Editorial|          Apple, luxury bra...|            Apple Inc and oth...|2019-04-02T04:07:10Z|https://www.reute...|BEIJING (Reuters)...|
|2019-04-05|    ok|          38|       Kenneth Li|          Apple Music's U.S...|            Apple Inc's strea

In [15]:
#### Save DF to CSV
# repartition(1) collapses the data into a single partition before writing to
# the 'News_current' output path, with a header row.
# mode='overwrite' replaces the result of a previous run; with the default save
# mode the write fails once the output path already exists, so the notebook
# could not be re-run end to end.
df_write.repartition(1).write.csv('News_current', header=True, mode='overwrite')