In [None]:
import pandas as pd

In [None]:
# Read the two input tables: the full dataset and the scraping results
# (the file name suggests duplicates were already removed upstream).
df_full = pd.read_csv(filepath_or_buffer='../data/full_data.csv')
df_scraped = pd.read_csv(filepath_or_buffer='../data/scraping_no_duplicates.csv')

Preprocess the scraped dataframe so that its column names follow the same conventions as `df_full`.

In [None]:
# Normalise the scraped frame's schema: lower-case every column name, then
# rename the columns listed below ('page_id' and 'url' are the keys the merge
# cell joins on).
# The frame is rebound to a new object instead of being mutated with
# inplace=True, so the step reads as one expression.
df_scraped = (
    df_scraped
    .rename(columns=str.lower)
    .rename(columns={
        # 'impressions': 'page_impressions',  # rename left disabled in the original notebook
        'page_efahrer_id': 'page_id',
        'page_canonical_url': 'url',
        'author': 'author_scraped',
    })
)

df_scraped.info()

Checking whether the same ids & urls are in both dataframes

In [None]:
# Collect the distinct page IDs and URLs of both frames up front.
page_ids_full = set(df_full.page_id)
page_ids_scraped = set(df_scraped.page_id)
page_url_full = set(df_full.url)
page_url_scraped = set(df_scraped.url)

# Entries that occur in the full dataset but have no counterpart in the scraped data.
page_ids_difference = page_ids_full - page_ids_scraped
page_url_difference = page_url_full - page_url_scraped

print('Page IDs')
print('Number of IDs that no data was scraped for:', len(page_ids_difference))
print(' ')

print('Page URLs')
print('Number URLs that no data was scraped for:', len(page_url_difference))
print('Number of URLs in complete dataset:', len(df_full['url'].unique()))
print('Number of URLs in scraped dataset:', len(df_scraped.url.unique()))

<span style="color:red">For each page ID we have scraped data, but not for every URL. We therefore assume that the page content is the same for all URLs that share a page ID.</span>

Next, we merge the scraped content into our full dataset, which contains the performance data.

In [None]:
# Left join: keep every row of df_full and attach the scraped columns wherever
# both the page ID and the URL match.
# NOTE(review): the join uses 'url' as well as 'page_id', so rows whose URL was
# not scraped get no scraped content even if their page ID was — confirm this
# is intended given the "same content per page ID" assumption stated above.
col_to_merge = ['page_id', 'url']
df_full_scraped = df_full.merge(df_scraped, how='left', on=col_to_merge)
df_full_scraped.head()

In [None]:
# Sanity-check the merge.
# The flag used to be `words == words`, i.e. a column compared with itself.
# That can never detect a mismatch; it is simply True wherever 'words' is
# non-null (NaN != NaN). The same flag is now computed explicitly with
# .notna() and the printed label says what is actually being counted.
# NOTE(review): presumably 'words' comes from the scraped frame; to really
# verify the merge by word count, compare it with an independent word-count
# column from the other source if one exists.
has_word_count = df_full_scraped['words'].notna()
has_abstract = df_full_scraped['abstract'].notna()
df_full_scraped['check_merge'] = has_word_count

print('Number of scraped pages', len(df_scraped.page_id))
print('Number of page IDs, that contain scraped data:', len(df_full_scraped.loc[has_abstract, 'page_id'].unique()))
print('Number of page IDs with a non-null word count after the merge:', len(df_full_scraped.loc[has_word_count, 'page_id'].unique()))

In [None]:
# Flag the rows whose 'last_update' value equals their 'publish_date' value
# (this overwrites the 'check_merge' column set in the previous cell).
dates_match = df_full_scraped['last_update'].eq(df_full_scraped['publish_date'])
df_full_scraped['check_merge'] = dates_match

# Distinct page IDs that have at least one row with matching dates.
ids_with_matching_dates = df_full_scraped.loc[dates_match, 'page_id'].unique()

# NOTE(review): this subtracts a count of unique page IDs from the row count of
# df_scraped; the result is only a page count if df_scraped holds one row per
# page ID — confirm.
print('Number of pages that have a different update date compared to the publish date:', len(df_scraped.page_id) - len(ids_with_matching_dates))

<span style="color:red">Some pages have a publishing date that differs from the scraped update date. This could be because the scraped date is entered manually, whereas `publish_date` in the data set is metadata from the actual day of publishing.</span>
<p>In the following, the publishing date from the original data set is used instead of the scraped date.</p>

Cleaning up for the EDA file

In [None]:
# Columns removed before the export: the URL, the old index, helper columns
# from the checks above, and the scraped author / word count / update date.
columns_to_drop = ['url', 'old_index', 'author_scraped', 'words', 'last_update', 'check_merge']
df_eda = df_full_scraped.drop(columns=columns_to_drop)

# Write the reduced frame for the EDA notebook, without the row index.
df_eda.to_csv('../data/eda.csv', index=False, encoding='utf-8')

In [None]:
# Print a concise summary of the frame that was just written to ../data/eda.csv.
df_eda.info()