# Task 1- Data Identification and Collection

### In this task, the news API has been used to collect data about the news articles and news sources by making 50 API calls and finally generate 2 datasets - sources. csv and articles.csv 

#### API: News API (https://newsapi.org)


In [1]:
import pandas as pd
import os
import json
import time
import requests

## Define Get Request

In [2]:
def request(suffix):
    base_url = 'https://newsapi.org/v2'
    url = base_url + '/' + suffix + '&apiKey=149f49e283b3475a832c686e2b548ed5' #api key with max limit of 100 calls in 24 hours
    print(url)
    response = requests.get(url).json()
    time.sleep(10)
    return response

## Get all the different English language news sources

In [3]:
sources = request('top-headlines/sources?language=en')['sources']
sources_data = {
    'source_id' : [],
    'source_name' : [],
    'description' : [],
    'url' : [],
    'category' : [],
    'country' : []
}
for source in sources:
    sources_data['source_id'].append(source['id'])
    sources_data['source_name'].append(source['name'])
    sources_data['description'].append(source['description'])
    sources_data['url'].append(source['url'])
    sources_data['category'].append(source['category'])
    sources_data['country'].append(source['country'])

sources_df = pd.DataFrame(sources_data)
print('creating sources csv file')
sources_df.to_csv('sources.csv', encoding='utf-8', index=False)
print('sources.csv successfully created')

https://newsapi.org/v2/top-headlines/sources?language=en&apiKey=149f49e283b3475a832c686e2b548ed5
creating sources csv file
sources.csv successfully created


In [4]:
#create a directory to store the temporary csv files in order to merge them into one
if not os.path.exists('articles'):
        os.mkdir('articles')

## Define Get Articles method for all the types and different dates

In [5]:
#getArticles method defined with 3 parameters from_date, to_date and type of the article
def getArticles(from_date,to_date,types):
    top_headlines = {
    'source_id' : [],
    'source_name' : [],
    'author' : [],
    'title' : [],
    'url' : [],
    'url_to_image' : [],
    'published_at' : [],
    'article_type' : [],
    'article_word_count' : []
    }
    for article_type in types:
        suffix = 'everything?q=' + article_type + '&language=en&from=' + from_date + '&to=' + to_date
        headlines = request(suffix)['articles'] #request data from the API
        #print(len(headlines))
        for headline in headlines:
            top_headlines['source_id'].append(headline['source']['id'])
            top_headlines['source_name'].append(headline['source']['name'])
            top_headlines['author'].append(headline['author'])
            top_headlines['title'].append(headline['title'])
            top_headlines['url'].append(headline['url'])
            top_headlines['url_to_image'].append(headline['urlToImage'])
            top_headlines['published_at'].append(headline['publishedAt'])   
            top_headlines['article_type'].append(article_type)
            content = headline['content']
            word_count = len(content.split()) #count the number of words in article content
            top_headlines['article_word_count'].append(word_count)

    headlines_df = pd.DataFrame(top_headlines) #convert into dataframe
    return headlines_df

## Get article headlines in English language from different types for the specified dates

### Get article headlines in English language from different types from = 2023-03-18 and to = 2023-03-19

In [6]:
types = ['business', 'entertainment', 'general', 'health', 'science', 'sports', 'technology']
article1 = getArticles('2023-03-18','2023-03-19',types)
article1.to_csv('articles/article1.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=business&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=entertainment&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=general&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=health&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=sports&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=technology&language=en&from=2023-03-18&to=2023-03-19&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from different types from = 2023-03-19 and to = 2023-03-20

In [7]:
types = ['business', 'fashion', 'food', 'health', 'science', 'sports', 'lifestyle']
article2 = getArticles('2023-03-19','2023-03-20',types)
article2.to_csv('articles/article2.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=business&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=fashion&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=food&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=health&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=sports&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=lifestyle&language=en&from=2023-03-19&to=2023-03-20&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from different types from = 2023-03-20 and to = 2023-03-21

In [8]:
types = ['crime', 'entertainment', 'general', 'health', 'art', 'sports', 'technology']
article3 = getArticles('2023-03-20','2023-03-21',types)
article3.to_csv('articles/article3.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=crime&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=entertainment&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=general&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=health&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=art&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=sports&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=technology&language=en&from=2023-03-20&to=2023-03-21&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from different types from = 2023-03-21 and to = 2023-03-22

In [9]:
types = ['business', 'entertainment', 'general', 'health', 'science', 'legal', 'politics']
article4 = getArticles('2023-03-21','2023-03-22',types)
article4.to_csv('articles/article4.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=business&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=entertainment&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=general&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=health&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=legal&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=politics&language=en&from=2023-03-21&to=2023-03-22&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from different types from = 2023-03-22 and to = 2023-03-23 

In [10]:
types = ['business', 'finance', 'general', 'local', 'science', 'sports', 'technology']
article5 = getArticles('2023-03-22','2023-03-23',types)
article5.to_csv('articles/article5.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=business&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=finance&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=general&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=local&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=sports&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=technology&language=en&from=2023-03-22&to=2023-03-23&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from different types from = 2023-03-23 and to = 2023-03-24

In [11]:
types = ['music', 'jobs', 'weather', 'politics', 'science', 'international', 'technology']
article6 = getArticles('2023-03-23','2023-03-24',types)
article6.to_csv('articles/article6.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=music&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=jobs&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=weather&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=politics&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=international&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=technology&language=en&from=2023-03-23&to=2023-03-24&apiKey=149f49e283b3475a832c686e2b548ed5


### Get article headlines in English language from all the types from = 2023-03-24 and to = 2023-03-25

In [12]:
types = ['disasters', 'entertainment', 'general', 'health', 'science', 'international', 'technology']
article7 = getArticles('2023-03-24','2023-03-25',types)
article7.to_csv('articles/article7.csv', encoding='utf-8', index=False)

https://newsapi.org/v2/everything?q=disasters&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=entertainment&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=general&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=health&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=science&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=international&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5
https://newsapi.org/v2/everything?q=technology&language=en&from=2023-03-24&to=2023-03-25&apiKey=149f49e283b3475a832c686e2b548ed5


## Merge CSV files generated for each date into one

In [13]:
files_path = 'C:/users/priyanka/DataScience/articles'
files_list = os.listdir(files_path) #select the file path in which all the temporary csv files are present
files_list

['article1.csv',
 'article2.csv',
 'article3.csv',
 'article4.csv',
 'article5.csv',
 'article6.csv',
 'article7.csv']

In [14]:
#concat all the csv files into a dataframe and convert it into a single csv file
all_articles = pd.DataFrame() 
print('merging and creating articles csv file')
all_articles = pd.concat([pd.read_csv(files_path + '/' + article_file) for article_file in files_list], ignore_index=True)
all_articles.to_csv('articles.csv', encoding='utf-8', index=False)
print('articles successfully created')

merging and creating articles csv file
articles successfully created
