In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import regex as re
from googleapiclient.discovery import build
from jellyfish import jaro_similarity
from sklearn import preprocessing

In [2]:
# Dataset to analyse: CSV file name (without extension) inside datasets/
filename = "covid_philippines"

path = f"datasets/{filename}.csv"
df = pd.read_csv(path)

# Column groups extracted from the dataset as NumPy arrays
id_title_columns = ["video_id", "video_title"]
engagement_columns = ["view_count", "like_count", "comment_count"]

video_list = np.array(df[id_title_columns])
view_like_comment = np.array(df[engagement_columns])

# Distinct channel titles found in the dataset
channel_list = df["channel_title"].unique()

# Suffixes appended to a channel name to form each search query
query_tail = [
    " LinkedIn",
    " Wiki",
    " Official Website",
    " Profiles",
    " Social Media",
]

In [3]:
# Preview the queries built for the first channel only
for i in range(1):

    # One query per suffix in query_tail
    for j, tail in enumerate(query_tail):
        query = channel_list[i] + tail
        print(query)

CNN Philippines LinkedIn
CNN Philippines Wiki
CNN Philippines Official Website
CNN Philippines Profiles
CNN Philippines Social Media


In [4]:
# Display the unique channel titles extracted from the dataset's "channel_title" column
channel_list

array(['CNN Philippines', 'ANC 24/7', 'Rappler',
       'South China Morning Post', 'INQUIRER.net', 'Al Jazeera English',
       'Manila Bulletin Online', 'CNA Insider', 'ABS-CBN News',
       'BBC News', 'CNA', 'WION', 'UNTV News and Rescue', 'Global News',
       'The Telegraph', 'TODAY', 'Reuters', 'UNICEF USA', 'DW News',
       'Voice of America', 'UNICEF Philippines', 'GMA Integrated News',
       'FRANCE 24 English', 'Bloomberg Quicktake', 'The Straits Times',
       'Gulf News', 'FEATR', 'Hindustan Times', 'Diseases Simplified',
       'PBS NewsHour', 'Philstar News', 'Bongbong Marcos',
       'Doctor Wessam Atif', 'Adventures in America',
       'Ateneo de Manila University', 'TVUP', 'VFam TV', 'Janina Vela',
       'Teleperformance Philippines', 'Yahoo Southeast Asia',
       'Cold Chain Innovation Hub Philippines', 'MiracleFeet'],
      dtype=object)

## NOTE
Verifiability score is computed <u>PER CHANNEL</u> <br>
<br>
Ranking is computed <u>PER VIDEO</u>

---

Google resource initialization
- The query and channel name are declared manually here to simulate the search process for a single channel name
- The actual loop for searching and verifying across a dataset of videos will be done in the .py file

In [5]:
# Credentials are read from the environment so that no secret is stored in the notebook.
#   GOOGLE_API_KEY : personal Google API key used for the Custom Search client (required)
#   GOOGLE_CSE_ID  : search engine ID (optional; falls back to the ID used so far)
# NOTE(review): the API key that was previously hardcoded in this cell has been
# exposed in the notebook history and should be revoked and replaced.
apiKey = os.environ.get("GOOGLE_API_KEY")
if not apiKey:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )

# Search engine ID
cseKey = os.environ.get("GOOGLE_CSE_ID", "23c1c70a203ac4852")

# Custom Search client; its .list(...).execute() call performs one search request
google_resource = build("customsearch", "v1", developerKey=apiKey).cse()

In [6]:
# Channel name used for the single-channel search simulation in the cells below
channel_name = "CNN Philippines"

---

Finding a LinkedIn Profile

In [7]:
# Channel name followed by the first suffix (" LinkedIn")
query = f"{channel_name}{query_tail[0]}"

In [8]:
# Build the search request for the current query, then execute it
search_request = google_resource.list(q=query, cx=cseKey)
response = search_request.execute()

In [9]:
pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+' # Used to find specific profile links

linkedIn = False
# "items" can be missing or hold fewer than 10 results, so iterate over what was
# actually returned instead of indexing a fixed range.
for i, item in enumerate(response.get("items") or []):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Remove the <b>/</b> highlight tokens before parsing the title
    title = item.get("htmlTitle", "").replace('<b>', '').replace('</b>', '')

    # Get profile name from search result: the text before the first " | " / " - "
    # separator, so names of any word count are kept whole
    # NOTE(review): assumes LinkedIn titles look like "<name> | LinkedIn" - confirm
    profile_name = re.split(r'\s[|\-–]\s', title, maxsplit=1)[0].strip()
    if not profile_name:
        continue

    # Get similarity between found profile name and channel name
    # This is to prevent false positives in finding a LinkedIn profile
    similarity = round(jaro_similarity(channel_name, profile_name), 2)

    # If n% similar, consider LinkedIn profile as found
    if similarity >= 0.80:
        linkedIn = True
        break

if linkedIn:
    # Fixed one-decimal formatting avoids float artifacts such as 56.99999999999999
    print(f"{linkedIn}, at index [{i}] with {similarity * 100:.1f}% similarity.")
else:
    print("No LinkedIn profile found.")

True, at index [0] with 100.0% similarity.


---

Finding a Wiki page

In [10]:
# Channel name followed by the second suffix (" Wiki")
query = f"{channel_name}{query_tail[1]}"

In [11]:
# Build the search request for the current query, then execute it
search_request = google_resource.list(q=query, cx=cseKey)
response = search_request.execute()

In [12]:
# The dot before "wikipedia" is escaped so that only a real subdomain separator matches
pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

wiki = False
# "items" can be missing or hold fewer than 10 results, so iterate over what was
# actually returned instead of indexing a fixed range.
for i, item in enumerate(response.get("items") or []):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Get Wiki page name from search result
    # Note: Wikipedia pages usually have <b> or </b> tokens
    htmlTitle = item.get("htmlTitle", "").replace('<b>', '').replace('</b>', '')
    title_match = re.search(r'.+(?=\s-\sWikipedia)', htmlTitle)
    if title_match is None:
        # Title does not contain " - Wikipedia"; skip instead of crashing
        continue
    page_name = title_match.group()

    # Get similarity between found Wiki page name and channel name
    # This is to prevent false positives in finding a Wiki page
    similarity = round(jaro_similarity(channel_name, page_name), 2)

    # If n% similar, consider Wiki page as found
    if similarity >= 0.80:
        wiki = True
        break

if wiki:
    # Fixed one-decimal formatting avoids float artifacts such as 56.99999999999999
    print(f"{wiki}, at index [{i}] with {similarity * 100:.1f}% similarity.")
else:
    print("No Wiki page found.")

True, at index [0] with 100.0% similarity.
