### Purpose
 This notebook helps you verify that pytrends data has been collected for every tag found in the scraped articles, and that no data was collected for tags that are not needed.
 ___

In [1]:
import pandas as pd
import os
import ast
import numpy as np
import re

In [55]:
def get_tags_from_csv(csv_file_path):
    """Return the distinct, lower-cased tags stored in the 'Tags' column of a CSV file.

    The file is read with ';' as the field separator. Every non-empty cell of
    the 'Tags' column is parsed with ``ast.literal_eval`` and iterated as a
    collection of tag strings. Tags are lower-cased *before* duplicates are
    removed, so spellings that differ only by case are counted once. The
    order of first appearance is preserved.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the semicolon-separated articles file.

    Returns
    -------
    numpy.ndarray
        One entry per distinct lower-cased tag.

    Raises
    ------
    KeyError
        If the file has no 'Tags' column.
    TypeError
        If a parsed tag is not a string.
    """
    df = pd.read_csv(csv_file_path, sep=';')

    # Column holding the tag lists (replace 'Tags' with your actual column name)
    selected_column = 'Tags'

    # Rows without any tags are read as NaN, which literal_eval cannot parse;
    # they contribute no tags, so they are skipped.
    tag_lists = df[selected_column].dropna().apply(ast.literal_eval)

    # Lower-case first, then de-duplicate: dict keys keep insertion order and
    # give constant-time membership checks.
    unique_tags = list(dict.fromkeys(
        str.lower(tag) for tags in tag_lists for tag in tags
    ))

    print('There are', len(unique_tags), 'unique tags in total')
    print('Showing first 10:')
    return np.array(unique_tags)

## <i><b>DATA FOR 2022</b></i>

### Get tags for each website <b>2022</b>

In [56]:
# TODO Define your file paths here
# Each path is the shared raw-data folder followed by the source-specific file
# (adjacent string literals are joined into a single string).
skynews_2022 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\skynews\articles\2022-01_skynews_articles.csv"
)
guardian_2022 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\guardian\2022-01-31_guardian.csv"
)
bbc_2022 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\bbc\articles\BBC_2022.csv"
)

Skynews 2022
___

In [57]:
# Distinct Sky News tags for 2022, previewed as a one-column table.
skynews_tags_2022 = get_tags_from_csv(skynews_2022)
display_df = pd.DataFrame(data=skynews_tags_2022, columns=['tag'])
display(display_df.iloc[:10])

There are 667 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,covid
1,covid-19
2,coronavirus
3,live
4,tony blair
5,betty white
6,newspapers
7,omicron
8,data and forensics
9,climate change


Guardian 2022
___

In [58]:
# Distinct Guardian tags for 2022, previewed as a one-column table.
guardian_tags_2022 = get_tags_from_csv(guardian_2022)
display_df = pd.DataFrame(data=guardian_tags_2022, columns=['tag'])
display(display_df.iloc[:10])

There are 3458 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,liverpool
1,fulham
2,transfer window
3,football
4,sport
5,coronavirus
6,science
7,england
8,uk news
9,infectious diseases


BBC 2022
___

In [59]:
# Distinct BBC tags for 2022, previewed as a one-column table.
bbc_tags_2022 = get_tags_from_csv(bbc_2022)
display_df = pd.DataFrame(data=bbc_tags_2022, columns=['tag'])
display(display_df.iloc[:10])

There are 414 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,world health organization (who)
1,coronavirus vaccines
2,coronavirus pandemic
3,shielding
4,public health
5,china
6,social distancing
7,self-isolation
8,united states
9,reality check


### Get unique tags across ALL news sources <b>2022</b>

BBC tags are excluded here: only the Sky News and Guardian tags are combined.

In [60]:
# Turn each source's tag array into a set, then merge the sets into the
# collection of tags that pytrends data is required for.
all_sources_tags = [set(source_tags) for source_tags in (skynews_tags_2022, guardian_tags_2022)]
required_tags_2022 = set.union(*all_sources_tags)
display_df = pd.DataFrame(data=required_tags_2022, columns=['tag'])
display(display_df.iloc[:10])
display(len(required_tags_2022))

Unnamed: 0,tag
0,jonathan pryce
1,biography books
2,arms trade
3,damian lewis
4,assassin's creed
5,random house
6,biloela family
7,pierce brosnan
8,ben wallace
9,myanmar


3697

#### Get all tags for which we have pytrends data <b>2022</b>
___

In [77]:
# Load the normalized 2022 pytrends results and list every column name.
df = pd.read_csv(
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends"
    r"\normalized_results_2022\normalized_results_2022.csv",
    sep=';',
)
arr = list(df.columns)
print(arr)

['date', 'ghislaine maxwell', 'jeffrey epstein', 'united states', 'film', 'james bond', 'music', 'climate change', 'south africa', 'photography', 'meta', 'astronomy', 'nasa', 'european space agency', 'james webb space telescope', 'india', 'apple', 'omicron variant', 'joe biden', 'netherlands', 'twitter', 'us politics', 'netflix', '5g', 'silicon valley', 'rail travel', 'beer', 'pennsylvania', 'angela rayner', "prime minister's questions", 'jacob rees-mogg', 'personal finance', 'rishi sunak', 'cost of living', 'the moon', 'turkmenistan', 'brazil', 'saudi arabia', 'pollution', 'housing', 'volcanoes', 'belgium', 'dominic cummings', 'pfizer', 'bitcoin', 'ethiopia', 'yemen', 'transport', 'taliban', 'burkina faso', 'work-life balance', 'cameroon', 'taylor swift', 'archaeology', 'press freedom', 'human rights', 'imran khan', 'coffee', 'myanmar', 'rspca', 'england', 'uk news', 'infectious diseases', 'medical research', 'australia news', 'arsenal', 'barcelona', 'mexico', 'rugby union', 'liz trus

In [78]:
def save_unique_tags(normalized_results, deduplicated_pytrends_tags):
    """Write the tag names found in a pytrends results file to a CSV and return them.

    The tags are the column names of ``normalized_results`` with the first
    column dropped. Only the header is parsed (``nrows=0``), so the data rows
    of the results file are never loaded.

    Parameters
    ----------
    normalized_results : str or path-like
        Semicolon-separated results file whose columns are the first column
        followed by one column per tag.
    deduplicated_pytrends_tags : str or path-like
        Destination of the semicolon-separated CSV listing the tag names,
        one per row.

    Returns
    -------
    list of str
        The column names of ``normalized_results`` without the first one.
    """
    # The column names are all that is needed, so skip the data rows.
    header = pd.read_csv(normalized_results, sep=';', nrows=0)

    ## TODO Remove if data format is different
    # The first column is assumed to be the date, not a tag.
    unique_tags_arr = list(header.columns)[1:]
    unique_tags = pd.DataFrame(unique_tags_arr)
    print('In all the collected pytrends data, there are:\n', len(unique_tags), '\nunique tags')

    # Write the tag names to a CSV file with semicolon (;) as the delimiter
    unique_tags.to_csv(deduplicated_pytrends_tags, sep=';', index=False)

    return unique_tags_arr

In [85]:
# Input: normalized 2022 pytrends results. Output: CSV listing the tags they cover.
normalized_results = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends"
    r"\normalized_results_2022\normalized_results_2022.csv"
)
deduplicated_pytrends_tags = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends"
    r"\auxiliary\2022\unique_tags_2022.csv"
)

unique_pytrends_tags_2022 = save_unique_tags(normalized_results, deduplicated_pytrends_tags)
deduplicated_df = pd.DataFrame(data=unique_pytrends_tags_2022, columns=['tag'])
display(deduplicated_df.iloc[:10])

In all the collected pytrends data, there are:
 3697 
unique tags


Unnamed: 0,tag
0,ghislaine maxwell
1,jeffrey epstein
2,electric cars
3,united states
4,film
5,james bond
6,music
7,climate change
8,south africa
9,photography


#### Check whether data is collected for all required tags <b>2022</b>
The cell below checks whether you have collected pytrends data for all your required tags.

In [86]:
# Required tags that have no column in the collected pytrends data.
# A set difference avoids rescanning the collected-tag list for every required
# tag, and sorting keeps the list (and the CSV written from it) identical from
# run to run, since set iteration order is not stable.
missing_tags_2022 = sorted(set(required_tags_2022) - set(unique_pytrends_tags_2022))
missing_tags_2022_csv_path =  r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends\auxiliary\2022\missing_pytrends_tags_2022.csv"
missing_from_pytrends_2022_df = pd.DataFrame(missing_tags_2022, columns=['tag'])
# Write the missing tags to a csv file (only when there is something to report)
if len(missing_from_pytrends_2022_df) > 0:
    print('You have not collected pytrends data for', len(missing_from_pytrends_2022_df), 'of your tags.')
    missing_from_pytrends_2022_df.to_csv(missing_tags_2022_csv_path, sep=';', index=False)
    print('Writing csv file with missing tags...')
else:
    print('You have no missing tags!')

You have no missing tags!


#### Check whether you have any extra data for pytrends <b>2022</b>

In [87]:
# Tags that have pytrends data but are not required by any article,
# kept in the column order of the results file.
extra_in_pytrends_2022 = [element for element in unique_pytrends_tags_2022 if element not in required_tags_2022]
extra_in_pytrends_2022_csv_path =  r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends\auxiliary\2022\extra_pytrends_tags_2022.csv"
# Name the column 'tag' so this CSV has the same header as the other tag CSVs
# written by this notebook (without it the header would be '0').
extra_in_pytrends_2022_df = pd.DataFrame(extra_in_pytrends_2022, columns=['tag'])
# Write the extra tags to a csv file (only when there is something to report)
if len(extra_in_pytrends_2022_df) > 0:
    print('You have collected data for', len(extra_in_pytrends_2022_df), 'EXTRA tags. Consider dropping them.')
    extra_in_pytrends_2022_df.to_csv(extra_in_pytrends_2022_csv_path, sep=';', index=False)
    print('Writing csv file with extra tags...')
else:
    print('You have not collected any extra tags!')

You have not collected any extra tags!


# !!!
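### Run only if you want to delete results for extra tags from <b>2022</b>
The cell below overwrites `normalized_results_2022.csv` in place; the dropped columns cannot be recovered from that file afterwards.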

In [84]:
# Remove the columns of the extra 2022 tags and write the result back over
# the same results file.
normalized_results = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends"
    r"\normalized_results_2022\normalized_results_2022.csv"
)
df = pd.read_csv(normalized_results, sep=';').drop(columns=extra_in_pytrends_2022)

df.to_csv(normalized_results, sep=';', index=False)


## <i><b>DATA FOR 2023</b></i>

### Get tags for each website <b>2023</b>

In [88]:
# Each path is the shared raw-data folder followed by the source-specific file
# (adjacent string literals are joined into a single string).
skynews_2023 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\skynews\articles\2023-01_skynews_articles.csv"
)
guardian_2023 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\guardian\2023-01-31_guardian.csv"
)
bbc_2023 = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw"
    r"\bbc\articles\BBC_2023.csv"
)

Skynews 2023
___

In [89]:
# Distinct Sky News tags for 2023, previewed as a one-column table.
skynews_tags_2023 = get_tags_from_csv(skynews_2023)
display_df = pd.DataFrame(data=skynews_tags_2023, columns=['tag'])
display(display_df.iloc[:10])

There are 661 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,live
1,pope benedict
2,cody fisher
3,scotland
4,newspapers
5,uk weather
6,nhs
7,amber heard
8,chris rock
9,coleen rooney


Guardian 2023
___

In [90]:
# Distinct Guardian tags for 2023, previewed as a one-column table.
guardian_tags_2023 = get_tags_from_csv(guardian_2023)
display_df = pd.DataFrame(data=guardian_tags_2023, columns=['tag'])
display(display_df.iloc[:10])

There are 3493 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,california
1,commuting
2,us news
3,premier league
4,australia
5,football
6,australia sport
7,sport
8,television & radio
9,television


BBC 2023
___

In [21]:
# Distinct BBC tags for 2023, previewed as a one-column table.
bbc_tags_2023 = get_tags_from_csv(bbc_2023)
display_df = pd.DataFrame(data=bbc_tags_2023, columns=['tag'])
display(display_df.iloc[:10])

There are 148 unique tags in total
Showing first 10:


Unnamed: 0,tag
0,us capitol riots
1,donald trump
2,us congress
3,lewis capaldi
4,raye
5,harry styles
6,self esteem
7,tiktok
8,taylor swift
9,florence + the machine


### Get unique tags across ALL news sources <b>2023</b>

BBC tags are excluded here: only the Sky News and Guardian tags are combined.

In [91]:
# Turn each source's tag array into a set, then merge the sets into the
# collection of tags that pytrends data is required for.
all_sources_tags = [set(source_tags) for source_tags in (skynews_tags_2023, guardian_tags_2023)]
required_tags_2023 = set.union(*all_sources_tags)
display(len(required_tags_2023))
display_df = pd.DataFrame(data=required_tags_2023, columns=['tag'])
display(display_df.iloc[:10])

3750

Unnamed: 0,tag
0,biography books
1,arms trade
2,judd apatow
3,mrsa and superbugs
4,vivienne westwood
5,ben wallace
6,berlin philharmonic
7,myanmar
8,trade policy
9,donald trump


#### Get all tags for which we have pytrends data <b>2023</b>
___

In [96]:
# Load the normalized 2023 pytrends results and list every column name.
df = pd.read_csv(
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends"
    r"\normalized_results_2023\normalized_results_2023.csv",
    sep=';',
)
arr = list(df.columns)
print(arr)



In [97]:
# Input: normalized 2023 pytrends results. Output: CSV listing the tags they cover.
normalized_results = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends"
    r"\normalized_results_2023\normalized_results_2023.csv"
)
deduplicated_pytrends_tags = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends"
    r"\auxiliary\2023\unique_tags_2023.csv"
)

unique_pytrends_tags_2023 = save_unique_tags(normalized_results, deduplicated_pytrends_tags)
deduplicated_df = pd.DataFrame(data=unique_pytrends_tags_2023, columns=['tag'])
display(deduplicated_df.iloc[:10])

In all the collected pytrends data, there are:
 3750 
unique tags


Unnamed: 0,tag
0,privatisation
1,ordnance survey
2,opioids crisis
3,sausages
4,sikhism
5,united arab emirates
6,the gender gap
7,engineering
8,san francisco
9,maternal mortality


#### Check whether data is collected for all required tags <b>2023</b>
The cell below checks whether you have collected pytrends data for all your required tags.

In [98]:
# Required tags that have no column in the collected pytrends data.
# A set difference avoids rescanning the collected-tag list for every required
# tag, and sorting keeps the list (and the CSV written from it) identical from
# run to run, since set iteration order is not stable.
missing_tags_2023 = sorted(set(required_tags_2023) - set(unique_pytrends_tags_2023))
missing_tags_2023_csv_path =  r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends\auxiliary\2023\missing_pytrends_tags_2023.csv"
missing_from_pytrends_2023_df = pd.DataFrame(missing_tags_2023, columns=['tag'])
# Write the missing tags to a csv file (only when there is something to report)
if len(missing_from_pytrends_2023_df) > 0:
    print('You have not collected pytrends data for', len(missing_from_pytrends_2023_df), 'of your tags.')
    missing_from_pytrends_2023_df.to_csv(missing_tags_2023_csv_path, sep=';', index=False)
    print('Writing csv file with missing tags...')
else:
    print('You have no missing tags')

You have no missing tags


#### Check whether you have any extra data for pytrends <b>2023</b>

In [99]:
# Tags that have pytrends data but are not required by any article,
# kept in the column order of the results file.
extra_in_pytrends_2023 = [tag for tag in unique_pytrends_tags_2023 if tag not in required_tags_2023]
extra_in_pytrends_2023_csv_path = (
    r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\intermediate\pytrends"
    r"\auxiliary\2023\extra_pytrends_tags_2023.csv"
)
extra_in_pytrends_2023_df = pd.DataFrame(data=extra_in_pytrends_2023, columns=['tag'])
# The extra tags are written to a csv file only when at least one was found.
if extra_in_pytrends_2023_df.empty:
    print('You have not collected any extra tags!')
else:
    print('You have collected data for', len(extra_in_pytrends_2023_df), 'EXTRA tags. Consider dropping them.')
    extra_in_pytrends_2023_df.to_csv(extra_in_pytrends_2023_csv_path, sep=';', index=False)
    print('Writing csv file with extra tags...')

You have not collected any extra tags!


# !!!
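### Run only if you want to delete results for extra tags from <b>2023</b>
The cell below overwrites `normalized_results_2023.csv` in place; the dropped columns cannot be recovered from that file afterwards.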

In [80]:
# Remove the columns of the extra 2023 tags and write the result back over
# the same results file.
normalized_results = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2023\normalized_results_2023.csv"
df = pd.read_csv(normalized_results, sep=';')
# The columns to drop must be the 2023 extras: this file holds the 2023
# results, so the 2022 list would name the wrong columns (or columns that do
# not exist here, which makes drop raise KeyError).
df = df.drop(columns=extra_in_pytrends_2023)

df.to_csv(normalized_results, sep=';', index=False)

In [5]:
# Sanity check: are any column names repeated in the 2023 results file?
results_2023_path = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2023\normalized_results_2023.csv"
df = pd.read_csv(results_2023_path, sep=';')

# read_csv renames repeated header names ('X', 'X.1', ...), so df.columns is
# always unique and cannot reveal duplicates. Read the header line as plain
# data instead: header=None keeps the names exactly as written, dtype=str and
# keep_default_na=False stop names such as "nan" or "1" from being converted.
raw_header = pd.read_csv(
    results_2023_path,
    sep=';',
    header=None,
    nrows=1,
    dtype=str,
    keep_default_na=False,
).iloc[0].tolist()

# The two numbers are equal only when no column name is repeated in the file.
print(len(df.columns))
print(len(set(raw_header)))

for col in df.columns:
        print(col)

3750
3750
date
privatisation
ordnance survey
opioids crisis
sausages
sikhism
united arab emirates
the gender gap
engineering
san francisco
maternal mortality
davos 2023
history
us domestic policy
jermyn street theatre
jack whitehall
benefits
matt hancock
sky news australia
ange postecoglou
nottingham
south and central asia
real betis
marilyn manson
chocolate
louis vuitton
news agencies
nfl
whatsapp
scottish independence
public sector pay
global education
gchq
saudi arabia
eurovision
austria
v&a
utah
radio 2
royal dutch shell
class issues
smartphones
newtown shooting
iran nuclear deal
england cricket team
roads
fbi
christmas
call of duty
june brown
european commission
blackburn
eid al-adha
working from home
police
tomb raider
housing
teen books
jay-z
coco gauff
mo gilligan
triple j hottest 100
singapore
sandi toksvig
aston villa women
vatican
gaming
stalking
jonas brothers
tornadoes
robert burns
co-operative group
ben whishaw
race & religion
marlon brando
kiribati
careers
sewing
turkmen