# [Merge and filter data](#merge-and-filter-data)

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
from azure.storage.blob import BlobServiceClient
from glob import glob
from io import StringIO
from IPython.display import display

import pandas as pd

In [3]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

<a id="toc"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Merge and filter Hubble data](#merge-and-filter-hubble-data)
3. [Merge and filter NYTimes data](#merge-and-filter-nytimes-data)
4. [Merge and filter Guardian data](#merge-and-filter-guardian-data)
5. [Merge and filter Space.com data](#merge-and-filter-space.com-data)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will merge scraped listings data (including metadata), from various news publications and stored in `data/raw`, into a separate `data/processed/<publication_name>_processed.csv` file per news publication and filter out any articles that are less than 500 words in length

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

We'll define below the variables to be used throughout the code.

In [4]:
# General inputs
PROJ_ROOT_DIR = os.getcwd()
data_dir = os.path.join(PROJ_ROOT_DIR, "data", "raw")
processed_data_dir = os.path.join(PROJ_ROOT_DIR, "data", "processed")
az_storage_container_name = "myconedesx7"
cloud_data = True

# Hubble Filenames
# # Local files
hubble_filename = "hubble_urls.csv"
hubble_text = "hubble.csv"
# # Cloud-based files
hubble_inputs = {
    "blobedesz23": "urls",
    "blobedesz22": "text",
}
hubble_processed_filename = "hubble_processed.csv"

# NY Times Filenames
# # Local files
nytimes_filename = "nytimes_urls__*.csv"
nytimes_text_filenames = [
    "nytimes.csv",
    # # use below if you scrape only certain articles' text at once
    # # and then want to combine all tries together
    # "nytimes_1.csv",
    # "nytimes_3.csv",
    # "nytimes_2.csv"
]
# # Cloud-based files
nytimes_inputs = {
    "blobedesz27": "urls_1950_1989",
    "blobedesz28": "urls_1990_1999",
    "blobedesz29": "urls_2000_2019",
    "blobedesz24": "text1",
    "blobedesz25": "text2",
    "blobedesz26": "text3",
}
nytimes_processed_filename = "nytimes_processed.csv"

# Space.com Filenames
# # Local files
space_filename = "space_com_urls.csv"
space_text_filenames = [
    "space.csv",
    # # use below if you scrape only certain articles' text at once
    # # and then want to combine all tries together
    # "space_1.csv",
    # "space_2.csv",
    # "space_3.csv",
    # "space_4.csv",
    # "space_5.csv",
]
# # Cloud-based files
space_inputs = {
    "blobedesz35": "urls",
    "blobedesz30": "text1",
    "blobedesz31": "text2",
    "blobedesz32": "text3",
    "blobedesz33": "text4",
    "blobedesz34": "text5",
}
space_processed_filename = "space_processed.csv"

# Guardian Filenames
# # Local files
guardian_filename = "guardian_urls.csv"
guardian_text_filenames = [
    "guardian.csv"
    # # use below if you scrape only certain articles' text at once
    # # and then want to combine all tries together
    # "guardian_1.csv",
    # "guardian_2.csv"
]
# # Cloud-based files
guardian_inputs = {
    "blobedesz21": "urls",
    "blobedesz19": "text1",
    "blobedesz20": "text2",
}
guardian_processed_filename = "guardian_processed.csv"

In [5]:
# Parameters
data_dir = "/home/elstand/Downloads/nlp-space-news-topic-modeling/data/raw"
processed_data_dir = (
    "/home/elstand/Downloads/nlp-space-news-topic-modeling/data/processed"
)
az_storage_container_name = "myconedesx7"
cloud_data = True
hubble_inputs = {"blobedesz23": "urls", "blobedesz22": "text"}
hubble_processed_filename = "hubble_processed.csv"
nytimes_inputs = {
    "blobedesz27": "urls_1950_1989",
    "blobedesz28": "urls_1990_1999",
    "blobedesz29": "urls_2000_2019",
    "blobedesz24": "text1",
    "blobedesz25": "text2",
    "blobedesz26": "text3",
}
nytimes_processed_filename = "nytimes_processed.csv"
space_inputs = {
    "blobedesz35": "urls",
    "blobedesz30": "text1",
    "blobedesz31": "text2",
    "blobedesz32": "text3",
    "blobedesz33": "text4",
    "blobedesz34": "text5",
}
space_processed_filename = "space_processed.csv"
guardian_inputs = {
    "blobedesz21": "urls",
    "blobedesz19": "text1",
    "blobedesz20": "text2",
}
guardian_processed_filename = "guardian_processed.csv"


In [6]:
conn_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
    f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
    f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
)
blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

<a id="merge-and-filter-hubble-data"></a>

## 2. [Merge and filter Hubble data](#merge-and-filter-hubble-data)

We'll start by loading the scraped text and listings urls from the Hubble website into separate `DataFrame`s.

In [7]:
if not cloud_data:
    df_hubble_listings = pd.read_csv(os.path.join(data_dir, hubble_filename))
    df_hubble_text = pd.read_csv(os.path.join(processed_data_dir, hubble_text))
else:
    hubble_dict = {}
    for az_blob_name, file_type in hubble_inputs.items():
        blob_client = blob_service_client.get_blob_client(
            container=az_storage_container_name, blob=az_blob_name
        )
        blobstring = blob_client.download_blob().content_as_text()
        hubble_dict[file_type] = pd.read_csv(StringIO(blobstring))
    df_hubble_text = pd.concat([v for k,v in hubble_dict.items() if "text" in k])
    df_hubble_listings = pd.concat([v for k,v in hubble_dict.items() if k == "urls"])

We'll then rename columns

In [8]:
df_hubble_listings.rename(
    columns={"publication": "publication_date", "mission": "publication"}, inplace=True
)
df_hubble_text.drop(["publication_date"], axis=1, inplace=True)
display(df_hubble_listings.head(2))
display(df_hubble_text.head(2))

Unnamed: 0,name,news_id,url,publication,publication_date,abstract
0,NASA's Hubble Captures a Dozen Galaxy Doppelga...,2019-58,https://hubblesite.org/contents/news-releases/...,hubble,2019-11-07T14:00:00.000-05:00,\r\nThe “funhouse mirror” has delighted carniv...
1,Hubble Captures Galaxies' Ghostly Gaze,2019-51,https://hubblesite.org/contents/news-releases/...,hubble,2019-10-28T10:00:00.000-04:00,The universe is a bubbling cauldron of matter ...


Unnamed: 0,url,text,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
0,https://hubblesite.org/contents/news-releases/...,This NASA Hubble Space Telescope photo reveals...,hubble,,,,,,,
1,https://hubblesite.org/contents/news-releases/...,"When astronomers peer deep into space, they do...",hubble,,,,,,,


Next, we'll set an index for the `DataFrame`s so that we can join each `DataFrame` on its index

In [9]:
df_hubble_text = df_hubble_text.set_index(["url", "publication"])
df_hubble_listings = df_hubble_listings.set_index(["url", "publication"])
print(df_hubble_text.shape)
display(df_hubble_text.head(2))
print(df_hubble_listings.shape)
display(df_hubble_listings.head(2))

(1195, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,text,year,month,day,dayofweek,dayofyear,weekofyear,quarter
url,publication,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://hubblesite.org/contents/news-releases/2019/news-2019-58,hubble,This NASA Hubble Space Telescope photo reveals...,,,,,,,
https://hubblesite.org/contents/news-releases/2019/news-2019-51,hubble,"When astronomers peer deep into space, they do...",,,,,,,


(1195, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,name,news_id,publication_date,abstract
url,publication,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://hubblesite.org/contents/news-releases/2019/news-2019-58,hubble,NASA's Hubble Captures a Dozen Galaxy Doppelga...,2019-58,2019-11-07T14:00:00.000-05:00,\r\nThe “funhouse mirror” has delighted carniv...
https://hubblesite.org/contents/news-releases/2019/news-2019-51,hubble,Hubble Captures Galaxies' Ghostly Gaze,2019-51,2019-10-28T10:00:00.000-04:00,The universe is a bubbling cauldron of matter ...


Next, we'll merge the `DataFrame`s on the index and reset the index so that these index columns appear in the merged `DataFrame`

In [10]:
df_hubble = df_hubble_text.merge(
    df_hubble_listings, left_index=True, right_index=True, how="inner",
).reset_index(drop=False)
print(df_hubble.shape)
display(df_hubble.head(2))

(1174, 14)


Unnamed: 0,url,publication,text,year,month,day,dayofweek,dayofyear,weekofyear,quarter,name,news_id,publication_date,abstract
0,https://hubblesite.org/contents/news-releases/...,hubble,Capitalizing on the unparalleled sharpness and...,,,,,,,,Astronomers Release Most Complete Ultraviolet-...,2018-27,2018-05-17T13:00:00.000-04:00,Much of the light in the universe comes from s...
1,https://hubblesite.org/contents/news-releases/...,hubble,If civilizations exist around other stars they...,,,,,,,,Extraterrestrial Civilizations: Coming of Age ...,1998-43,1998-12-10T09:00:00.000-05:00,If civilizations exist around other stars they...


Next, we'll append `datetime` attributes as columns to the merged `DataFrame`

In [11]:
L = ["year", "month", "day", "dayofweek", "dayofyear", "weekofyear", "quarter"]
df_hubble.drop(L, axis=1, inplace=True)
df_hubble["publication_date"] = pd.to_datetime(df_hubble["publication_date"], utc=True)

In [12]:
df_hubble = df_hubble.join(
    pd.concat(
        (getattr(df_hubble["publication_date"].dt, i).rename(i) for i in L), axis=1
    )
)
df_hubble["decade"] = df_hubble["year"] // 10 * 10
print(df_hubble.shape)
df_hubble.head(3)

(1174, 15)


Unnamed: 0,url,publication,text,name,news_id,publication_date,abstract,year,month,day,dayofweek,dayofyear,weekofyear,quarter,decade
0,https://hubblesite.org/contents/news-releases/...,hubble,Capitalizing on the unparalleled sharpness and...,Astronomers Release Most Complete Ultraviolet-...,2018-27,2018-05-17 17:00:00+00:00,Much of the light in the universe comes from s...,2018,5,17,3,137,20,2,2010
1,https://hubblesite.org/contents/news-releases/...,hubble,If civilizations exist around other stars they...,Extraterrestrial Civilizations: Coming of Age ...,1998-43,1998-12-10 14:00:00+00:00,If civilizations exist around other stars they...,1998,12,10,3,344,50,4,1990
2,https://hubblesite.org/contents/news-releases/...,hubble,Astronomers have long puzzled over why a small...,Hubble Resolves Puzzle about Loner Starburst G...,2008-38,2008-11-20 14:00:00+00:00,Astronomers have long puzzled over why a small...,2008,11,20,3,325,47,4,2000


Next, we'll filter out news articles of less than 500 characters

In [13]:
df_hubble = df_hubble[(df_hubble["text"].str.len() > 500)]

Next, we'll drop unwanted columns from the merged `DataFrame`

In [14]:
unwanted_hubble_cols = [
    "url",
    "publication",
    "news_id",
    "publication_date",
    "day",
    "dayofweek",
    "dayofyear",
    "weekofyear",
    "quarter",
]
df_hubble.drop(unwanted_hubble_cols, axis=1, inplace=True)
print(df_hubble.shape)

(1054, 6)


Finally, we'll export the merged `DataFrame` to a `.csv` file

In [15]:
print(
    f"Memory footprint of DataFrame: {(df_hubble.memory_usage().sum() / 1000 / 1000):.2f} MB"
)
df_hubble.to_csv(
    os.path.join(processed_data_dir, hubble_processed_filename), index=False
)

Memory footprint of DataFrame: 0.06 MB


In [16]:
df_hubble_loaded = pd.read_csv(
    os.path.join(processed_data_dir, hubble_processed_filename)
)
print(
    f"Memory footprint of DataFrame: {(df_hubble_loaded.memory_usage().sum() / 1000 / 1000):.2f} MB"
)

Memory footprint of DataFrame: 0.05 MB


<a id="merge-and-filter-nytimes-data"></a>

## 3. [Merge and filter NYTimes data](#merge-and-filter-nytimes-data)

We'll start by loading the scraped text and listings urls from the New York Times website into separate `DataFrame`s.

In [17]:
if not cloud_data:
    df_nytimes_listings = pd.concat(
        [pd.read_csv(os.path.join(data_dir, f)) for f in glob(os.path.join(data_dir, nytimes_filename))]
    )
    df_nytimes_text = pd.concat([pd.read_csv(os.path.join(data_dir, f)) for f in nytimes_text_filenames])
else:
    nytimes_dict = {}
    for az_blob_name, file_type in nytimes_inputs.items():
        blob_client = blob_service_client.get_blob_client(
            container=az_storage_container_name, blob=az_blob_name
        )
        blobstring = blob_client.download_blob().content_as_text()
        nytimes_dict[file_type] = pd.read_csv(StringIO(blobstring))
    df_nytimes_text = pd.concat([v for k,v in nytimes_dict.items() if "text" in k])
    df_nytimes_listings = pd.concat([v for k,v in nytimes_dict.items() if "urls" in k])

We'll then rename columns

In [18]:
df_nytimes_listings.rename(
    columns={"web_url": "url", "source": "publication"}, inplace=True
)
df_nytimes_text["publication"] = df_nytimes_text["publication"].str.replace(
    "nytimes", "The New York Times"
)
display(df_nytimes_listings.head(2))
display(df_nytimes_text.head(2))

Unnamed: 0,url,lead_paragraph,abstract,snippet,publication,document_type,news_desk,section_name,type_of_material,subsection_name,word_count,page
0,https://www.nytimes.com/1981/01/06/science/abo...,SCHOOL closings have turned into a nationwide ...,SCHOOL closings have turned into a nationwid...,,The New York Times,article,Science Desk,Science,News,,816,1
1,https://www.nytimes.com/1981/01/13/science/sci...,Flight controllers have lost all but minimal c...,Flight controllers have lost all but minimal...,,The New York Times,article,Science Desk,Science,News,,105,1


Unnamed: 0,url,text,publication_date,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
0,https://www.nytimes.com/1981/01/06/science/abo...,SCHOOL closings have turned into a nationwide ...,1981-01-06 05:00:00+00:00,The New York Times,1981,1,6,1,6,2,1
1,https://www.nytimes.com/1981/01/13/science/sci...,Flight controllers have lost all but minimal c...,1981-01-13 05:00:00+00:00,The New York Times,1981,1,13,1,13,3,1


Next, we'll set an index for the `DataFrame`s so that we can join each  `DataFrame` on its index

In [19]:
df_nytimes_text = df_nytimes_text.set_index(["url", "publication"])
df_nytimes_listings = df_nytimes_listings.set_index(["url", "publication"])
print(df_nytimes_text.shape)
display(df_nytimes_text.head(2))
print(df_nytimes_listings.shape)
display(df_nytimes_listings.head(2))

(4018, 9)


Unnamed: 0_level_0,Unnamed: 1_level_0,text,publication_date,year,month,day,dayofweek,dayofyear,weekofyear,quarter
url,publication,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://www.nytimes.com/1981/01/06/science/about-education-old-schools-are-a-new-resource.html,The New York Times,SCHOOL closings have turned into a nationwide ...,1981-01-06 05:00:00+00:00,1981,1,6,1,6,2,1
https://www.nytimes.com/1981/01/13/science/science-watch-spacecraft-controls-fail.html,The New York Times,Flight controllers have lost all but minimal c...,1981-01-13 05:00:00+00:00,1981,1,13,1,13,3,1


(4036, 10)


Unnamed: 0_level_0,Unnamed: 1_level_0,lead_paragraph,abstract,snippet,document_type,news_desk,section_name,type_of_material,subsection_name,word_count,page
url,publication,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
https://www.nytimes.com/1981/01/06/science/about-education-old-schools-are-a-new-resource.html,The New York Times,SCHOOL closings have turned into a nationwide ...,SCHOOL closings have turned into a nationwid...,,article,Science Desk,Science,News,,816,1
https://www.nytimes.com/1981/01/13/science/science-watch-spacecraft-controls-fail.html,The New York Times,Flight controllers have lost all but minimal c...,Flight controllers have lost all but minimal...,,article,Science Desk,Science,News,,105,1


Next, we'll merge the `DataFrame`s on the index and reset the index so that these index columns appear in the merged `DataFrame`

In [20]:
df_nytimes = df_nytimes_text.merge(
    df_nytimes_listings, left_index=True, right_index=True, how="inner",
).reset_index(drop=False)
print(df_nytimes.shape)
display(df_nytimes.head(2))

(4036, 21)


Unnamed: 0,url,publication,text,publication_date,year,month,day,dayofweek,dayofyear,weekofyear,quarter,lead_paragraph,abstract,snippet,document_type,news_desk,section_name,type_of_material,subsection_name,word_count,page
0,https://www.nytimes.com/1981/01/06/science/abo...,The New York Times,SCHOOL closings have turned into a nationwide ...,1981-01-06 05:00:00+00:00,1981,1,6,1,6,2,1,SCHOOL closings have turned into a nationwide ...,SCHOOL closings have turned into a nationwid...,,article,Science Desk,Science,News,,816,1
1,https://www.nytimes.com/1981/01/13/science/sci...,The New York Times,Flight controllers have lost all but minimal c...,1981-01-13 05:00:00+00:00,1981,1,13,1,13,3,1,Flight controllers have lost all but minimal c...,Flight controllers have lost all but minimal...,,article,Science Desk,Science,News,,105,1


Next, we'll append `datetime` attributes as columns to the merged `DataFrame`

In [21]:
df_nytimes_text["decade"] = df_nytimes_text["year"] // 10 * 10

Next, we'll filter out news articles of less than 500 characters and exclude unwanted articles (i.e. we'll remove articles from the subsection `Environment`)

In [22]:
df_nytimes = df_nytimes[
    (df_nytimes["type_of_material"] == "News")
    & (df_nytimes["subsection_name"] != "Environment")
    & (df_nytimes["text"].str.len() > 500)
]

Next, we'll drop unwanted columns from the merged `DataFrame`

In [23]:
unwanted_nytimes_cols = [
    "url",
    "section_name",
    "page",
    "news_desk",
    "document_type",
    "type_of_material",
    "publication_date",
    "day",
    "dayofweek",
    "dayofyear",
    "weekofyear",
    "quarter",
    "lead_paragraph",
]
df_nytimes.drop(unwanted_nytimes_cols, axis=1, inplace=True)
print(df_nytimes.shape)
df_nytimes.head(2)

(3483, 8)


Unnamed: 0,publication,text,year,month,abstract,snippet,subsection_name,word_count
0,The New York Times,SCHOOL closings have turned into a nationwide ...,1981,1,SCHOOL closings have turned into a nationwid...,,,816
1,The New York Times,Flight controllers have lost all but minimal c...,1981,1,Flight controllers have lost all but minimal...,,,105


Finally, we'll export the merged `DataFrame` to a `.csv` file

In [24]:
print(
    f"Memory footprint of DataFrame: {(df_nytimes.memory_usage().sum() / 1000 / 1000):.2f} MB"
)
df_nytimes.to_csv(
    os.path.join(processed_data_dir, nytimes_processed_filename), index=False
)

Memory footprint of DataFrame: 0.25 MB


In [25]:
df_nytimes_loaded = pd.read_csv(
    os.path.join(processed_data_dir, nytimes_processed_filename)
)
print(
    f"Memory footprint of DataFrame: {(df_nytimes_loaded.memory_usage().sum() / 1000 / 1000):.2f} MB"
)

Memory footprint of DataFrame: 0.22 MB


<a id="merge-and-filter-guardian-data"></a>

## 4. [Merge and filter Guardian data](#merge-and-filter-guardian-data)

We'll start by loading the scraped text and listings urls from the Guardian website into separate `DataFrame`s.

In [26]:
if not cloud_data:
    df_guardian_listings = pd.read_csv(os.path.join(data_dir, guardian_filename))
    df_guardian_text = pd.concat(
        [pd.read_csv(os.path.join(data_dir, f)) for f in guardian_text_filenames]
    )
else:
    guardian_dict = {}
    for az_blob_name, file_type in guardian_inputs.items():
        blob_client = blob_service_client.get_blob_client(
            container=az_storage_container_name, blob=az_blob_name
        )
        blobstring = blob_client.download_blob().content_as_text()
        guardian_dict[file_type] = pd.read_csv(StringIO(blobstring))
    df_guardian_text = pd.concat([v for k,v in guardian_dict.items() if "text" in k])
    df_guardian_listings = pd.concat([v for k,v in guardian_dict.items() if k == "urls"])

We'll then rename columns

In [27]:
df_guardian_listings.rename(
    columns={"webUrl": "url", "webPublicationDate": "publication_date"}, inplace=True
)
df_guardian_text.drop(["publication_date"], axis=1, inplace=True)
display(df_guardian_listings.head(2))
display(df_guardian_text.head(2))

Unnamed: 0,url,id,publication_date,apiUrl,webTitle,document_type,sectionId,sectionName,type,isHosted,pillarId,pillarName,page
0,https://www.theguardian.com/science/1957/nov/0...,science/1957/nov/04/spaceexploration.archive,1957-11-04T15:38:49Z,https://content.guardianapis.com/science/1957/...,Space Travel Comes True,,science,Science,article,False,pillar/news,News,1
1,https://www.theguardian.com/science/1962/feb/2...,science/1962/feb/21/spaceexploration.archive,1962-02-21T12:50:18Z,https://content.guardianapis.com/science/1962/...,Home and dry after three orbits,,science,Science,article,False,pillar/news,News,1


Unnamed: 0,url,text,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
0,https://www.theguardian.com/science/1957/nov/0...,The second Russian satellite has demonstrated ...,guardian,,,,,,,
1,https://www.theguardian.com/science/1962/feb/2...,"Col. Glenn reported ""hale and hearty""Picked up...",guardian,,,,,,,


Next, we'll set an index for the `DataFrame`s so that we can join each `DataFrame` on its index

In [28]:
df_guardian_text = df_guardian_text.set_index(["url"])
df_guardian_listings = df_guardian_listings.set_index(["url"])
print(df_guardian_text.shape)
display(df_guardian_text.head(2))
print(df_guardian_listings.shape)
display(df_guardian_listings.head(2))

(4199, 9)


Unnamed: 0_level_0,text,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://www.theguardian.com/science/1957/nov/04/spaceexploration.archive,The second Russian satellite has demonstrated ...,guardian,,,,,,,
https://www.theguardian.com/science/1962/feb/21/spaceexploration.archive,"Col. Glenn reported ""hale and hearty""Picked up...",guardian,,,,,,,


(4199, 12)


Unnamed: 0_level_0,id,publication_date,apiUrl,webTitle,document_type,sectionId,sectionName,type,isHosted,pillarId,pillarName,page
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
https://www.theguardian.com/science/1957/nov/04/spaceexploration.archive,science/1957/nov/04/spaceexploration.archive,1957-11-04T15:38:49Z,https://content.guardianapis.com/science/1957/...,Space Travel Comes True,,science,Science,article,False,pillar/news,News,1
https://www.theguardian.com/science/1962/feb/21/spaceexploration.archive,science/1962/feb/21/spaceexploration.archive,1962-02-21T12:50:18Z,https://content.guardianapis.com/science/1962/...,Home and dry after three orbits,,science,Science,article,False,pillar/news,News,1


Next, we'll merge the `DataFrame`s on the index and reset the index so that these index columns appear in the merged `DataFrame`

In [29]:
df_guardian = df_guardian_text.merge(
    df_guardian_listings, left_index=True, right_index=True, how="inner",
).reset_index(drop=False)
print(df_guardian.shape)
display(df_guardian.head(2))

(4199, 22)


Unnamed: 0,url,text,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter,id,publication_date,apiUrl,webTitle,document_type,sectionId,sectionName,type,isHosted,pillarId,pillarName,page
0,https://www.theguardian.com/science/1957/nov/0...,The second Russian satellite has demonstrated ...,guardian,,,,,,,,science/1957/nov/04/spaceexploration.archive,1957-11-04T15:38:49Z,https://content.guardianapis.com/science/1957/...,Space Travel Comes True,,science,Science,article,False,pillar/news,News,1
1,https://www.theguardian.com/science/1962/feb/2...,"Col. Glenn reported ""hale and hearty""Picked up...",guardian,,,,,,,,science/1962/feb/21/spaceexploration.archive,1962-02-21T12:50:18Z,https://content.guardianapis.com/science/1962/...,Home and dry after three orbits,,science,Science,article,False,pillar/news,News,1


Next, we'll append `datetime` attributes as columns to the merged `DataFrame`

In [30]:
L = ["year", "month", "day", "dayofweek", "dayofyear", "weekofyear", "quarter"]
df_guardian.drop(L, axis=1, inplace=True)
df_guardian["publication_date"] = pd.to_datetime(
    df_guardian["publication_date"], utc=True
)

In [31]:
df_guardian = df_guardian.join(
    pd.concat(
        (getattr(df_guardian["publication_date"].dt, i).rename(i) for i in L), axis=1
    )
)
df_guardian["decade"] = df_guardian["year"] // 10 * 10
print(df_guardian.shape)
df_guardian.head(3)

(4199, 23)


Unnamed: 0,url,text,publication,id,publication_date,apiUrl,webTitle,document_type,sectionId,sectionName,type,isHosted,pillarId,pillarName,page,year,month,day,dayofweek,dayofyear,weekofyear,quarter,decade
0,https://www.theguardian.com/science/1957/nov/0...,The second Russian satellite has demonstrated ...,guardian,science/1957/nov/04/spaceexploration.archive,1957-11-04 15:38:49+00:00,https://content.guardianapis.com/science/1957/...,Space Travel Comes True,,science,Science,article,False,pillar/news,News,1,1957,11,4,0,308,45,4,1950
1,https://www.theguardian.com/science/1962/feb/2...,"Col. Glenn reported ""hale and hearty""Picked up...",guardian,science/1962/feb/21/spaceexploration.archive,1962-02-21 12:50:18+00:00,https://content.guardianapis.com/science/1962/...,Home and dry after three orbits,,science,Science,article,False,pillar/news,News,1,1962,2,21,2,52,8,1,1960
2,https://www.theguardian.com/science/1967/jan/2...,The entire three-man crew of the Apollo One sp...,guardian,science/1967/jan/27/spaceexploration.columbia,1967-01-27 13:44:28+00:00,https://content.guardianapis.com/science/1967/...,US spacemen die as rocket catches fire,,science,Science,article,False,pillar/news,News,1,1967,1,27,4,27,4,1,1960


Next, we'll filter out news articles of less than 500 characters

In [32]:
df_guardian = df_guardian[df_guardian["text"].str.len() > 500]

Next, we'll drop unwanted columns from the merged `DataFrame`

In [33]:
unwanted_guardian_cols = [
    "url",
    "id",
    "sectionId",
    "sectionName",
    "type",
    "isHosted",
    "pillarId",
    "pillarName",
    "page",
    "publication_date",
    "day",
    "dayofweek",
    "dayofyear",
    "weekofyear",
    "quarter",
]
df_guardian.drop(unwanted_guardian_cols, axis=1, inplace=True)
print(df_guardian.shape)

(4148, 8)


Finally, we'll export the merged `DataFrame` to a `.csv` file

In [34]:
print(
    f"Memory footprint of DataFrame: {(df_guardian.memory_usage().sum() / 1000 / 1000):.2f} MB"
)
df_guardian.to_csv(
    os.path.join(processed_data_dir, guardian_processed_filename), index=False
)

Memory footprint of DataFrame: 0.30 MB


In [35]:
df_guardian_loaded = pd.read_csv(
    os.path.join(processed_data_dir, guardian_processed_filename)
)
print(
    f"Memory footprint of DataFrame: {(df_guardian_loaded.memory_usage().sum() / 1000 / 1000):.2f} MB"
)

Memory footprint of DataFrame: 0.27 MB


<a id="merge-and-filter-space.com-data"></a>

## 5. [Merge and filter Space.com data](#merge-and-filter-space.com-data)

We'll start by loading the scraped text and listings urls from the Space.com website into separate `DataFrame`s.

In [36]:
if not cloud_data:
    df_space_listings = pd.read_csv(os.path.join(data_dir, space_filename))
    df_space_text = pd.concat([pd.read_csv(os.path.join(data_dir, f)) for f in space_text_filenames])
else:
    space_dict = {}
    for az_blob_name, file_type in space_inputs.items():
        blob_client = blob_service_client.get_blob_client(
            container=az_storage_container_name, blob=az_blob_name
        )
        blobstring = blob_client.download_blob().content_as_text()
        space_dict[file_type] = pd.read_csv(StringIO(blobstring))
    df_space_text = pd.concat([v for k,v in space_dict.items() if "text" in k])
    df_space_listings = pd.concat([v for k,v in space_dict.items() if k == "urls"])
# df_space_text.drop(["publication_date"], axis=1, inplace=True)
display(df_space_listings.head(2))
display(df_space_text.head(2))

Unnamed: 0,url,archive_url
0,https://www.space.com/1926-frozen-ethane-pluto...,https://www.space.com/archive/1999/07
1,https://www.space.com/3156-milky-tourist-guide...,https://www.space.com/archive/2000/01


Unnamed: 0,url,text,publication_date,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
0,https://www.space.com/3156-milky-tourist-guide...,What a difference a couple of years make. Buzz...,2000-01-05 12:28:00+00:00,space,2000.0,1.0,5.0,2.0,5.0,1.0,1.0
1,https://www.space.com/8193-caused-apollo-13-ac...,"CAPE CANAVERAL, Fla. All you had to do was hea...",2000-04-13 10:27:00+00:00,space,2000.0,4.0,13.0,3.0,104.0,15.0,2.0


Next, we'll set an index for the `DataFrame`s so that we can join each on the index

In [37]:
df_space_text = df_space_text.set_index(["url"])
df_space_listings = df_space_listings.set_index(["url"])
print(df_space_text.shape)
display(df_space_text.head(2))
print(df_space_listings.shape)
display(df_space_listings.head(2))

(36741, 10)


Unnamed: 0_level_0,text,publication_date,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://www.space.com/3156-milky-tourist-guide.html,What a difference a couple of years make. Buzz...,2000-01-05 12:28:00+00:00,space,2000.0,1.0,5.0,2.0,5.0,1.0,1.0
https://www.space.com/8193-caused-apollo-13-accident.html,"CAPE CANAVERAL, Fla. All you had to do was hea...",2000-04-13 10:27:00+00:00,space,2000.0,4.0,13.0,3.0,104.0,15.0,2.0


(40195, 1)


Unnamed: 0_level_0,archive_url
url,Unnamed: 1_level_1
https://www.space.com/1926-frozen-ethane-pluto.html,https://www.space.com/archive/1999/07
https://www.space.com/3156-milky-tourist-guide.html,https://www.space.com/archive/2000/01


Next, we'll merge the `DataFrame`s on the index and reset the index so that these index columns appear in the merged `DataFrame`

In [38]:
df_space = df_space_text.merge(
    df_space_listings, left_index=True, right_index=True, how="inner",
).reset_index(drop=False)
print(df_space.shape)
display(df_space.head(2))

(36767, 12)


Unnamed: 0,url,text,publication_date,publication,year,month,day,dayofweek,dayofyear,weekofyear,quarter,archive_url
0,https://www.space.com/10-wallpaper-4.html,This stunning space wallpaper reveals astronau...,2010-10-21 17:00:00+00:00,space,2010.0,10.0,21.0,3.0,294.0,42.0,4.0,https://www.space.com/archive/2010/10
1,https://www.space.com/100-nasa-drill-team-brea...,"Over the course of the past decade, NASA space...",2010-11-18 19:49:00+00:00,space,2010.0,11.0,18.0,3.0,322.0,46.0,4.0,https://www.space.com/archive/2010/11


Next, we'll append `datetime` attributes as columns to the merged `DataFrame`

In [39]:
df_space["decade"] = df_space["year"] // 10 * 10

Next, we'll filter out news articles of less than 500 characters

In [40]:
df_space = df_space[df_space["text"].str.len() > 500]

Next, we'll drop unwanted columns from the merged `DataFrame`

In [41]:
unwanted_space_cols = [
    "url",
    "publication_date",
    "publication",
    "archive_url",
    "day",
    "dayofweek",
    "dayofyear",
    "weekofyear",
    "quarter",
]
df_space.drop(unwanted_space_cols, axis=1, inplace=True)
print(df_space.shape)
df_space.head(2)

(36082, 4)


Unnamed: 0,text,year,month,decade
1,"Over the course of the past decade, NASA space...",2010.0,11.0,2010.0
2,European space scientists are expected to adop...,2005.0,4.0,2000.0


Finally, we'll export the merged `DataFrame` to a `.csv` file

In [42]:
print(
    f"Memory footprint of DataFrame: {(df_space.memory_usage().sum() / 1000 / 1000):.2f} MB"
)
df_space.to_csv(os.path.join(processed_data_dir, space_processed_filename), index=False)

Memory footprint of DataFrame: 1.44 MB


In [43]:
df_space_loaded = pd.read_csv(
    os.path.join(processed_data_dir, space_processed_filename)
)
print(
    f"Memory footprint of DataFrame: {(df_space_loaded.memory_usage().sum() / 1000 / 1000):.2f} MB"
)

Memory footprint of DataFrame: 1.15 MB
