In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import regex as re
from googleapiclient.discovery import build
from jellyfish import jaro_similarity
from sklearn import preprocessing
from tqdm import tqdm

In [2]:
# Dataset name; the CSV lives at datasets/<filename>/<filename>.csv
filename = "covid_philippines"
path = f"datasets/{filename}/{filename}.csv"

# Suffixes appended to a channel name to form one search query per source type
query_tail = [
    " LinkedIn",
    " Wiki",
    " Official Website",
    " Facebook",
    " Twitter",
]

df = pd.read_csv(path)

# Column groups pulled out of the raw frame
id_title_columns = ["video_id", "video_title", "channel_id", "channel_title"]
engagement_columns = ["view_count", "like_count", "comment_count"]

video_channel = df[id_title_columns]
view_like_comment = np.array(df[engagement_columns])
unique_channels = df["channel_title"].unique()

In [3]:
# Display the distinct channel titles found in the dataset
unique_channels

array(['CNN Philippines', 'ANC 24/7', 'South China Morning Post',
       'Rappler', 'INQUIRER.net', 'CNA Insider', 'Al Jazeera English',
       'CNA', 'ABS-CBN News', 'Manila Bulletin Online',
       'UNTV News and Rescue', 'BBC News', 'Global News', 'WION',
       'The Telegraph', 'UNICEF USA', 'Reuters', 'DW News',
       'UNICEF Philippines', 'GMA Integrated News', 'FRANCE 24 English',
       'Voice of America', 'Bloomberg Quicktake', 'World Bank',
       'The Straits Times', 'The Star', 'Behind Philippines', 'FEATR',
       'Hindustan Times', 'Gulf News', 'Diseases Simplified', 'TVUP',
       'Bongbong Marcos', 'Esco Lifesciences Group', 'Doctor Wessam Atif',
       'Doc Fate Cunanan', 'Asian Development Bank',
       'Adventures in America', 'Philstar News', 'HeyoLeah',
       'MedCram - Medical Lectures Explained CLEARLY', 'FinnSnow',
       'Cold Chain Innovation Hub Philippines'], dtype=object)

In [4]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0.1,Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_title,channel_dop,sub_count,total_videos
0,0,m3P-bmt3Uqw,JN.1 COVID-19 subvariant causing spike in cases,'An infectious disease expert says the new COV...,2023-12-25,6810,46,19,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903
1,1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",'The World Tonight: The daily average of the P...,2023-12-18,2132,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
2,2,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,'Subscribe to our YouTube channel for free her...,2020-04-02,323944,3285,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3810000,16958
3,3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,'Dateline Philippines: Karmina Constantino tal...,2023-12-07,9464,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
4,4,n-j5rK1XOUc,W.H.O.: COVID-19 remains as health threat | Ne...,'Government agencies are set to convene after ...,2023-05-08,15037,90,25,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903


In [5]:
# One query is issued per (channel, query type) pair, so the total is simply
# the product of the two sizes. Computing it directly also avoids rebinding the
# notebook-level names `channel_name` and `j` as loop variables, which could
# silently overwrite the channel chosen elsewhere if cells run out of order.
num_query = len(unique_channels) * len(query_tail)

print("Total number of queries: " + str(num_query))

Total number of queries: 215


## NOTE
Verifiability score is computed <u>PER CHANNEL</u> <br>
<br>
Ranking is computed <u>PER VIDEO</u>

---

Google resource initialization
- The query and channel name are declared manually in order to simulate the search process for a single channel name
- The actual loop that searches and verifies across a whole dataset of videos will be done in the .py file

In [6]:
# The API key is a secret and must never be stored in the notebook itself.
# Export it as the GOOGLE_API_KEY environment variable before starting Jupyter.
# NOTE(review): the key that used to be hard-coded here was exposed in the
# notebook and should be revoked/rotated.
apiKey = os.environ.get("GOOGLE_API_KEY")
if not apiKey:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )

# Search engine ID (can be overridden with the GOOGLE_CSE_ID environment variable)
cseKey = os.environ.get("GOOGLE_CSE_ID", "23c1c70a203ac4852")

# Handle on the Custom Search `cse()` collection, used by every query below
google_resource = build("customsearch", "v1", developerKey=apiKey).cse()

In [7]:
# Channel used for the single-channel walkthrough in the cells that follow
channel_name = "CNN Philippines"

---

Finding a LinkedIn Profile

In [8]:
# Search phrase = channel name followed by the LinkedIn suffix
linkedin_suffix = query_tail[0]
query = channel_name + linkedin_suffix

In [9]:
# Send the LinkedIn query to the custom search engine and keep the reply
search_request = google_resource.list(q=query, cx=cseKey)
response = search_request.execute()

In [10]:
# Matches LinkedIn company ("/company/") and personal ("/in/") profile links
pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+'

linkedIn = False

# `items` may be missing or hold fewer than 10 results, so iterate over what
# is actually there instead of indexing a fixed range.
items = response.get("items") or []
for i, item in enumerate(items[:10]):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Get profile name (first one or two words) from the search result title
    name_match = re.search(r'\w+\s(\w+)?', item.get("htmlTitle", ""))
    if name_match is None:
        # No usable name in the title, so there is nothing to compare against
        continue
    profile_name = name_match.group()

    # Get similarity between found profile name and channel name
    # This is to prevent false positives in finding a LinkedIn profile
    similarity = round(jaro_similarity(channel_name, profile_name), 2)

    # If n% similar, consider LinkedIn profile as found
    if similarity >= 0.80:
        linkedIn = True
        break

if linkedIn:
    print(str(linkedIn) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + profile_name + " @ " + item.get("link"))
else:
    print("No LinkedIn profile found.")

True, at index [0] with 100.0% similarity.
Link found: CNN Philippines @ https://www.linkedin.com/company/cnn-philippines


---

Finding a Wiki page

In [11]:
# Search phrase = channel name followed by the Wiki suffix
wiki_suffix = query_tail[1]
query = channel_name + wiki_suffix

In [12]:
# Send the Wiki query to the custom search engine and keep the reply
search_request = google_resource.list(q=query, cx=cseKey)
response = search_request.execute()

In [13]:
# Matches article URLs on a Wikipedia with a two-character language prefix.
# The dot before "wikipedia" is escaped so that it only matches a literal ".".
pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

wiki = False

# `items` may be missing or hold fewer than 10 results, so iterate over what
# is actually there instead of indexing a fixed range.
items = response.get("items") or []
for i, item in enumerate(items[:10]):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Get Wiki page name from search result
    title = item.get("title", "")
    name_match = re.search(r'.+(?=\s-\sWikipedia)', title)
    if name_match is None:
        # Title lacks the " - Wikipedia" suffix, so no page name can be extracted
        continue
    page_name = name_match.group()

    # Get similarity between found Wiki page name and channel name
    # This is to prevent false positives in finding a Wiki page
    similarity = round(jaro_similarity(channel_name, page_name), 2)

    # If n% similar, consider Wiki page as found
    if similarity >= 0.80:
        wiki = True
        break

if wiki:
    print(str(wiki) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: "+ title + " @ " + item.get("link"))
else:
    print("No Wiki page found.")

True, at index [0] with 100.0% similarity.
Link found: CNN Philippines - Wikipedia @ https://en.wikipedia.org/wiki/CNN_Philippines


---

Finding a website

In [14]:
# Search phrase = channel name followed by the Official Website suffix
website_suffix = query_tail[2]
query = channel_name + website_suffix

In [15]:
# Send the website query to the custom search engine and keep the reply
search_request = google_resource.list(q=query, cx=cseKey)
response = search_request.execute()

In [16]:
website = False

# RegEx to exclude YouTube, LinkedIn, and Wikipedia pages.
# Defined once outside the loop; the dot before "wikipedia" is escaped so that
# it only matches a literal ".".
pattern = r'https\:\/\/(\w{2}\.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

# `items` may be missing or hold fewer than 10 results, so iterate over what
# is actually there instead of indexing a fixed range.
items = response.get("items") or []
for i, item in enumerate(items[:10]):
    title = item.get("title", "")
    # Only results whose title mentions the channel name are candidates
    if channel_name.lower() not in title.lower():
        continue

    link = item.get("link")
    if link and re.search(pattern, link) is None:
        # The first result among the filtered at this point is MOST LIKELY the official website
        website = True
        break

if website:
    print(str(website) + ", at index [" + str(i) + "]")
    print("Link found: " + title + " @ " + link)
else:
    print("No official website found.")

True, at index [0]
Link found: CNN Philippines @ https://cnnphilippines.com/


---

Finding social media presence <br>
Limited to these social media sites: <br>
- Facebook
- Twitter

In [17]:
# Search phrase = channel name followed by the Facebook suffix
facebook_suffix = query_tail[3]
query = channel_name + facebook_suffix

In [18]:
# Send the Facebook query to the custom search engine and keep the reply
fb_request = google_resource.list(q=query, cx=cseKey)
fb_response = fb_request.execute()

In [19]:
# Search phrase = channel name followed by the Twitter suffix
twitter_suffix = query_tail[4]
query = channel_name + twitter_suffix

In [20]:
# Send the Twitter query to the custom search engine and keep the reply
twitter_request = google_resource.list(q=query, cx=cseKey)
twitter_response = twitter_request.execute()

In [21]:
# Searching for a Facebook profile
facebook = False

# Matches Facebook page URLs of the form https://www.facebook.com/<name>/
# Defined once outside the loop since it never changes.
pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

# `items` may be missing or hold fewer than 10 results, so iterate over what
# is actually there instead of indexing a fixed range.
items = fb_response.get("items") or []
for i, item in enumerate(items[:10]):
    link = item.get("formattedUrl", "")
    if re.search(pattern, link) is None:
        continue

    # Compare the result title with the channel name to avoid false positives
    title = item.get("title", "")
    similarity = round(jaro_similarity(channel_name, title), 2)

    if similarity >= 0.80:
        facebook = True
        break

if facebook:
    print(str(facebook) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + title + " @ " + link)
else:
    print("No Facebook profile found.")

True, at index [0] with 100.0% similarity.
Link found: CNN Philippines @ https://www.facebook.com/CNNPhilippines/


In [22]:
# Searching for a Twitter profile
twitter = False

# Matches profile URLs on either twitter.com or x.com
# Defined once outside the loop since it never changes.
pattern = r'https\:\/\/(twitter|x)\.com\/.+'

# `items` may be missing or hold fewer than 10 results, so iterate over what
# is actually there instead of indexing a fixed range.
items = twitter_response.get("items") or []
for i, item in enumerate(items[:10]):
    link = item.get("formattedUrl", "")
    if re.search(pattern, link) is None:
        continue

    # Compare the result title with the channel name to avoid false positives
    title = item.get("title", "")
    similarity = round(jaro_similarity(channel_name, title), 2)

    if similarity >= 0.80:
        twitter = True
        break

if twitter:
    print(str(twitter) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + title + " @ " + link)
else:
    print("No Twitter profile found.")

True, at index [0] with 80.0% similarity.
Link found: CNN Philippines (@cnnphilippines) / X @ https://twitter.com/cnnphilippines


---

Compiling everything

In [None]:
# Functions
def find_linkedIn():
    """Placeholder for the LinkedIn profile lookup.

    Not implemented yet: does nothing and returns None.
    TODO: decide on the parameters and port the LinkedIn search logic here.
    """

def find_wiki():
    """Placeholder for the Wiki page lookup.

    Not implemented yet: does nothing and returns None.
    TODO: decide on the parameters and port the Wiki search logic here.
    """

def find_website():
    """Placeholder for the official website lookup.

    Not implemented yet: does nothing and returns None.
    TODO: decide on the parameters and port the website search logic here.
    """

def find_fb():
    """Placeholder for the Facebook profile lookup.

    Not implemented yet: does nothing and returns None.
    TODO: decide on the parameters and port the Facebook search logic here.
    """

def find_twitter():
    """Placeholder for the Twitter profile lookup.

    Not implemented yet: does nothing and returns None.
    TODO: decide on the parameters and port the Twitter search logic here.
    """

def find_sources(channel_names, channel_IDs):
    """Prepare the source search over a collection of channels (work in progress).

    At present this only sizes and opens a progress bar and declares the
    scaffolding for the result table; no search is performed yet.

    Args:
        channel_names: Iterable of channel names; iterated once to count how
            many queries will be issued.
        channel_IDs: Currently unused.

    Returns:
        None. Nothing is returned yet.
    """
    # Computing total number of queries: one per (channel name, query type) pair
    num_query = sum(len(query_tail) for _ in channel_names)

    # The context manager closes the progress bar when the block exits,
    # instead of leaving it open after the function returns.
    with tqdm(total=num_query) as pbar:
        pbar.set_description("Finding sources...")

        # Scaffolding for the result table; neither name is consumed yet.
        temp_list = []
        columns = [
            "channel_id", "channel_title",
            "profiles", "websites", "social_media_presence",
            "vs"
        ]
        # TODO: run the per-channel searches here and advance `pbar` per query.

In [None]:
# Distinct channel titles present in the dataset
channel_names = df["channel_title"].unique()

# Dictionary mapping each channel title to the first channel_id listed for it
first_id_per_title = df.groupby("channel_title")["channel_id"].first()
channel_IDs = first_id_per_title.to_dict()