In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import regex as re
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from jellyfish import jaro_similarity
from sklearn import preprocessing
from tqdm import tqdm

In [2]:
# Dataset name; it is used as both the folder name and the CSV file stem.
filename = "covid_philippines"
path = f"datasets/{filename}/{filename}.csv"
df = pd.read_csv(path)

# Column subsets of the loaded dataset.
video_channel = df.loc[:, ["video_id", "video_title", "channel_id", "channel_name"]]
unique_channels = df["channel_name"].unique()
view_like_comment = np.array(df.loc[:, ["view_count", "like_count", "comment_count"]])

# Suffixes appended to a channel name to build each search query.
query_tail = [" LinkedIn", " Wiki", " Official Website", " Facebook", " Twitter"]

In [3]:
unique_channels

array(['South China Morning Post', 'ANC 24/7', 'Rappler',
       'Al Jazeera English', 'INQUIRER.net', 'CNA Insider',
       'Manila Bulletin Online', 'ABS-CBN News', 'Reuters', 'BBC News',
       'CNA', 'WION', 'The Telegraph', 'Global News',
       'UNTV News and Rescue', 'TODAY', 'UNICEF USA', 'DW News',
       'UNICEF Philippines', 'GMA Integrated News', 'FRANCE 24 English',
       'TVUP', 'Voice of America', 'Bloomberg Quicktake',
       'Doctor Wessam Atif', 'The Straits Times', 'Behind Philippines',
       'The Star', 'FEATR', 'Hindustan Times', 'Asian Boss', 'Gulf News',
       'Diseases Simplified', 'Esco Lifesciences Group', 'Philstar News',
       'VFam TV', 'CCTV Video News Agency', 'Adventures in America',
       'BusinessWorldTV', 'Delightful Travellers', 'Kristypata',
       'HeyoLeah', 'Huawei', 'ABS-CBN Balitang America',
       'Ateneo de Manila University', 'Dating Coach Ella',
       'Cold Chain Innovation Hub Philippines', 'Yahoo Southeast Asia',
       'Asian Deve

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_name,channel_dop,sub_count,total_videos
0,0,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,Subscribe to our YouTube channel for free here...,2020-04-02,324695,3287,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3820000,17017
1,1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",The World Tonight: The daily average of the Ph...,2023-12-18,2230,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295
2,2,DWxIvQlpJK8,Metro Manila to be placed on lockdown due to c...,Subscribe: https://www.youtube.com/@Rappler/\n...,2020-03-12,106524,743,19,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,2011-12-02,1980000,50574
3,3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,Dateline Philippines: Karmina Constantino talk...,2023-12-07,9692,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295
4,4,lw16DeB6zns,COVID-19 leads to significant job losses in th...,The coronavirus pandemic has led to significan...,2020-09-29,90583,712,87,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,2006-11-23,12800000,111311


In [5]:
# Every channel is searched once per query suffix, so the total number of
# searches is the product of the two collection sizes. Computing it directly
# avoids leaving loop variables (e.g. `channel_name`) behind in the kernel.
num_query = len(unique_channels) * len(query_tail)

print("Total number of queries: " + str(num_query))

Total number of queries: 245


## NOTE
Verifiability score is computed <u>PER CHANNEL</u> <br>
<br>
Ranking is computed <u>PER VIDEO</u>

---

Google resource initialization
- The query and channel name are declared manually to simulate the search process for a single channel name
- The actual loop for searching and verifying across a dataset of videos will be done in the .py file

In [None]:
# Credentials are read from the environment so they are never committed with
# the notebook. Export GOOGLE_API_KEY (and optionally GOOGLE_CSE_ID) before
# starting the kernel.
# NOTE(review): a key was previously hardcoded in this cell; it should be
# treated as leaked and revoked.
apiKey = os.environ.get("GOOGLE_API_KEY")
if not apiKey:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Custom Search API key."
    )

# Search engine ID (falls back to the ID this notebook has been using)
cseKey = os.environ.get("GOOGLE_CSE_ID", "23c1c70a203ac4852")

google_resource = build("customsearch", "v1", developerKey=apiKey).cse()

In [None]:
channel_name = "CNN Philippines"

---

Finding a LinkedIn Profile

In [None]:
query = channel_name + query_tail[0]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+' # Used to find specific profile links

linkedIn = False
# "items" may be missing from the response or hold fewer than ten entries,
# so walk whatever was actually returned instead of indexing 0-9.
for i, item in enumerate(response.get("items") or []):
    if re.search(pattern, item.get("formattedUrl") or "") is None:
        continue

    # Get profile name from search result (plain "title", the same field
    # find_linkedIn reads)
    name_match = re.search(r'\w+\s(\w+)?', item.get("title") or "")
    if name_match is None:
        continue
    profile_name = name_match.group()

    # Get similarity between found profile name and channel name
    # This is to prevent false positives in finding a LinkedIn profile
    # (compared case-insensitively, as find_linkedIn does)
    similarity = round(jaro_similarity(channel_name.lower(), profile_name.lower()), 2)

    # If n% similar, consider LinkedIn profile as found
    if similarity >= 0.80:
        linkedIn = True
        break

if linkedIn:
    print(str(linkedIn) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + profile_name + " @ " + item.get("link"))
else:
    print("No LinkedIn profile found.")

---

Finding a Wiki page

In [None]:
query = channel_name + query_tail[1]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# The dot after the two-letter language code is escaped so it only matches a literal "."
pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

wiki = False
# "items" may be missing from the response or hold fewer than ten entries,
# so walk whatever was actually returned instead of indexing 0-9.
for i, item in enumerate(response.get("items") or []):
    if re.search(pattern, item.get("formattedUrl") or "") is None:
        continue

    # Get Wiki page name from search result; skip titles that do not end in " - Wikipedia"
    title = item.get("title") or ""
    name_match = re.search(r'.+(?=\s-\sWikipedia)', title)
    if name_match is None:
        continue
    page_name = name_match.group()

    # Get similarity between found Wiki page name and channel name
    # This is to prevent false positives in finding a Wiki page
    # (compared case-insensitively, as find_wiki does)
    similarity = round(jaro_similarity(channel_name.lower(), page_name.lower()), 2)

    # If n% similar, consider Wiki page as found
    if similarity >= 0.80:
        wiki = True
        break

if wiki:
    print(str(wiki) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: "+ title + " @ " + item.get("link"))
else:
    print("No Wiki page found.")

---

Finding a website

In [None]:
query = channel_name + query_tail[2]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# RegEx to exclude YouTube, LinkedIn, and Wikipedia pages (built once, outside the loop)
pattern = r'https\:\/\/(\w{2}\.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

website = False

# "items" may be missing from the response or hold fewer than ten entries,
# so walk whatever was actually returned instead of indexing 0-9.
for i, item in enumerate(response.get("items") or []):
    title = item.get("title") or ""
    link = item.get("link") or ""
    if channel_name.lower() in title.lower() and link and re.search(pattern, link) is None:
        # The first result among the filtered at this point is MOST LIKELY the official website
        website = True
        break

if website:
    print(str(website) + ", at index [" + str(i) + "]")
    print("Link found: " + title + " @ " + link)
else:
    print("No official website found.")

---

Finding social media presence <br>
Limited to these social media sites: <br>
- Facebook
- Twitter

In [None]:
query = channel_name + query_tail[3]

In [None]:
fb_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
query = channel_name + query_tail[4]

In [None]:
twitter_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# Searching for a Facebook profile
pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'
facebook = False

# "items" may be missing from the response or hold fewer than ten entries,
# so walk whatever was actually returned instead of indexing 0-9.
for i, item in enumerate(fb_response.get("items") or []):
    link = item.get("formattedUrl") or ""
    if re.search(pattern, link) is None:
        continue

    title = item.get("title") or ""
    # Compared case-insensitively, as find_fb does
    similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

    if similarity >= 0.80:
        facebook = True
        break

if facebook:
    print(str(facebook) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + title + " @ " + link)
else:
    print("No Facebook profile found.")

In [None]:
# Searching for a Twitter profile
pattern = r'https\:\/\/(twitter|x)\.com\/.+'
twitter = False

# "items" may be missing from the response or hold fewer than ten entries,
# so walk whatever was actually returned instead of indexing 0-9.
for i, item in enumerate(twitter_response.get("items") or []):
    link = item.get("formattedUrl") or ""
    if re.search(pattern, link) is None:
        continue

    title = item.get("title") or ""
    # Compared case-insensitively, as find_twitter does
    similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

    if similarity >= 0.80:
        twitter = True
        break

if twitter:
    print(str(twitter) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    # A stray pasted `re.search(...)` call after this print made the cell a SyntaxError; removed.
    print("Link found: " + title + " @ " + link)
else:
    print("No Twitter profile found.")

---

Compiling everything

In [38]:
# Functions
def find_linkedIn(channel_name, query):
    """Search for a LinkedIn profile whose name matches a channel.

    Sends ``query`` through the module-level ``google_resource`` (with
    ``cseKey``) and scans the returned items for a LinkedIn company/personal
    profile URL. The leading words of the item's title are compared with
    ``channel_name`` (case-insensitively) using Jaro similarity.

    Args:
        channel_name: Channel name the result title is compared against.
        query: Search string sent to the search API.

    Returns:
        ``(True, url)`` with the accepted item's ``formattedUrl`` when a
        result is at least 80% similar, otherwise ``(False, None)``. An
        ``HttpError`` is reported as a warning and treated as "not found".
    """
    pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+'  # Used to find specific profile links

    try:
        li_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError as err:
        # Best-effort lookup: keep the run going, but do not hide the failure.
        warnings.warn(f"LinkedIn search failed for {channel_name!r}: {err}")
        return False, None

    # "items" may be missing or hold fewer than ten entries, so iterate over
    # what was actually returned instead of indexing 0-9.
    for item in li_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        # Get profile name from search result
        match = re.search(r'\w+\s(\w+)?', item.get("title") or "")
        if match is None:
            continue
        profile_name = match.group()

        # Get similarity between found profile name and channel name
        # This is to prevent false positives in finding a LinkedIn profile
        similarity = round(jaro_similarity(channel_name.lower(), profile_name.lower()), 2)

        # If n% similar, consider LinkedIn profile as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_wiki(channel_name, query):
    """Search for a Wikipedia page whose title matches a channel.

    Sends ``query`` through the module-level ``google_resource`` (with
    ``cseKey``) and scans the returned items for a ``xx.wikipedia.org/wiki/``
    URL. The part of the item's title before `` - Wikipedia`` is compared
    with ``channel_name`` (case-insensitively) using Jaro similarity.

    Args:
        channel_name: Channel name the page title is compared against.
        query: Search string sent to the search API.

    Returns:
        ``(True, url)`` with the accepted item's ``formattedUrl`` when a
        result is at least 80% similar, otherwise ``(False, None)``. An
        ``HttpError`` is reported as a warning and treated as "not found".
    """
    # The dot after the two-letter language code is escaped so it only matches a literal "."
    pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

    try:
        wiki_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError as err:
        # Best-effort lookup: keep the run going, but do not hide the failure.
        warnings.warn(f"Wiki search failed for {channel_name!r}: {err}")
        return False, None

    # "items" may be missing or hold fewer than ten entries, so iterate over
    # what was actually returned instead of indexing 0-9.
    for item in wiki_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        # Get Wiki page name from search result
        match = re.search(r'.+(?=\s-\sWikipedia)', item.get("title") or "")
        if match is None:
            continue
        page_name = match.group()

        # Get similarity between found Wiki page name and channel name
        # This is to prevent false positives in finding a Wiki page
        similarity = round(jaro_similarity(channel_name.lower(), page_name.lower()), 2)

        # If n% similar, consider Wiki page as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_website(channel_name, query):
    """Search for a page that is most likely the channel's official website.

    Sends ``query`` through the module-level ``google_resource`` (with
    ``cseKey``) and returns the first result whose title contains
    ``channel_name`` (case-insensitively) and whose link is not a YouTube,
    LinkedIn, or Wikipedia page.

    Args:
        channel_name: Channel name that must appear in the result title.
        query: Search string sent to the search API.

    Returns:
        ``(True, url)`` with the accepted item's ``link``, otherwise
        ``(False, None)``. An ``HttpError`` is reported as a warning and
        treated as "not found".
    """
    # RegEx to exclude YouTube, LinkedIn, and Wikipedia pages
    pattern = r'https\:\/\/(\w{2}\.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

    try:
        website_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError as err:
        # Best-effort lookup: keep the run going, but do not hide the failure.
        warnings.warn(f"Website search failed for {channel_name!r}: {err}")
        return False, None

    wanted = channel_name.lower()

    # "items" may be missing or hold fewer than ten entries, so iterate over
    # what was actually returned instead of indexing 0-9.
    for item in website_response.get("items") or []:
        title = item.get("title") or ""
        link = item.get("link") or ""
        if not link or wanted not in title.lower():
            continue
        if re.search(pattern, link) is None:
            # The first result among the filtered at this point is MOST LIKELY the official website
            return True, link

    return False, None


def find_fb(channel_name, query):
    """Search for a Facebook page whose title matches a channel.

    Sends ``query`` through the module-level ``google_resource`` (with
    ``cseKey``) and scans the returned items for a ``www.facebook.com`` page
    URL. The item's title is compared with ``channel_name``
    (case-insensitively) using Jaro similarity.

    Args:
        channel_name: Channel name the result title is compared against.
        query: Search string sent to the search API.

    Returns:
        ``(True, url)`` with the accepted item's ``formattedUrl`` when a
        result is at least 80% similar, otherwise ``(False, None)``. An
        ``HttpError`` is reported as a warning and treated as "not found".
    """
    pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

    try:
        fb_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError as err:
        # Best-effort lookup: keep the run going, but do not hide the failure.
        warnings.warn(f"Facebook search failed for {channel_name!r}: {err}")
        return False, None

    # "items" may be missing or hold fewer than ten entries, so iterate over
    # what was actually returned instead of indexing 0-9.
    for item in fb_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        title = item.get("title") or ""
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def find_twitter(channel_name, query):
    """Search for a Twitter/X profile whose title matches a channel.

    Sends ``query`` through the module-level ``google_resource`` (with
    ``cseKey``) and scans the returned items for a ``twitter.com`` or
    ``x.com`` URL. The item's title is compared with ``channel_name``
    (case-insensitively) using Jaro similarity.

    Args:
        channel_name: Channel name the result title is compared against.
        query: Search string sent to the search API.

    Returns:
        ``(True, url)`` with the accepted item's ``formattedUrl`` when a
        result is at least 80% similar, otherwise ``(False, None)``. An
        ``HttpError`` is reported as a warning and treated as "not found".
    """
    pattern = r'https\:\/\/(twitter|x)\.com\/.+'

    try:
        twitter_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError as err:
        # Best-effort lookup: keep the run going, but do not hide the failure.
        warnings.warn(f"Twitter search failed for {channel_name!r}: {err}")
        return False, None

    # "items" may be missing or hold fewer than ten entries, so iterate over
    # what was actually returned instead of indexing 0-9.
    for item in twitter_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        title = item.get("title") or ""
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def check_desc(channel_name, videos_df, pattern):
    """Look for a link in the descriptions of a channel's first videos.

    Takes the first five rows of ``videos_df`` whose ``channel_name`` equals
    ``channel_name`` and searches each ``description`` with ``pattern``.

    Args:
        channel_name: Value matched against the ``channel_name`` column.
        videos_df: DataFrame with ``channel_name`` and ``description`` columns.
        pattern: Regular expression applied to each description.

    Returns:
        ``(True, matched_text)`` for the first description that matches,
        otherwise ``(False, None)``.
    """
    channel_rows = videos_df[videos_df["channel_name"] == channel_name]
    first_videos = channel_rows.reset_index(drop=True).head()

    for raw_desc in first_videos["description"]:
        # repr() turns real line breaks into the two characters "\n", which
        # are then replaced by spaces so the text is searched as one line.
        flattened = repr(raw_desc).replace("\\n", " ").replace("  ", " ")

        hit = re.search(pattern, flattened)
        if hit is not None:
            return (True, hit.group())

    return (False, None)


def find_sources(channel_names, channel_IDs, main_df):
    """Collect source links and source scores for every channel and save them.

    For each channel, the video descriptions are checked first (via
    ``check_desc``) for LinkedIn, website, Facebook, and Twitter links; any
    link not found there is looked up with the matching ``find_*`` search
    function. The Wiki page is always looked up via ``find_wiki``.

    Scores per channel:
        - ``profiles``: 3 if both LinkedIn and Wiki were found, 2 if only
          LinkedIn, 1 if only Wiki, else 0.
        - ``website``: 2 if a website was found, else 0.
        - ``social_media_presence``: 1 if Facebook or Twitter was found, else 0.
        - ``vs``: left as NaN here.

    Args:
        channel_names: Iterable (with ``len``) of channel names to process.
        channel_IDs: Mapping of channel name to channel ID.
        main_df: Videos DataFrame passed on to ``check_desc``.

    Side effects:
        Writes ``source_scores.csv`` and ``source_links.csv`` to the current
        working directory and prints ``Complete.`` when done.
    """
    ss_cols = [
        "channel_id", "channel_name",
        "profiles", "website", "social_media_presence",
        "vs"
    ]
    sl_cols = [
        "channel_id", "channel_name",
        "LinkedIn", "Wiki", "Website",
        "Twitter", "Facebook"
    ]

    query_tail = [
        " LinkedIn",
        " Wiki",
        " Official Website",
        " Facebook",
        " Twitter"
    ]

    # --- Patterns to search for links within video descriptions
    linkedIn_pattern = r"(?<=(Linked(in|In)\:\s))https:\/\/(www\.)?linkedin\.com\/(company|in)\/(\w|\w[-_])+\/"
    website_pattern = r"(?<=(W|w)ebsite((\:)?\s|\sat\s))https:\/\/\w+(\.(\w|\w[-_])+)?\.\w{3}(\.\w{2})?(\/(\w|\w[-_])+)?"
    fb_pattern = r"(?<=((F|f)acebook\:\s))https:\/\/(www\.)?facebook\.com\/(\w|\w[-_])+"
    twitter_pattern = r"(?<=((T|t)witter\:\s))https:\/\/(www\.)?(twitter|x)\.com\/(\w|\w[-_])+"
    # ---

    source_scores = []
    source_links = []

    # The context manager closes the progress bar even if a lookup raises.
    with tqdm(total=len(channel_names)) as pbar:
        pbar.set_description("Finding sources...")

        for channel_name in channel_names:
            # --- Checking descriptions from channel's videos
            linkedIn_found = check_desc(channel_name, main_df, linkedIn_pattern)
            site_found = check_desc(channel_name, main_df, website_pattern)
            fb_found = check_desc(channel_name, main_df, fb_pattern)
            twitter_found = check_desc(channel_name, main_df, twitter_pattern)
            # ---

            # --- If link not found in descriptions, search via Google
            if not linkedIn_found[0]:
                linkedIn_found = find_linkedIn(channel_name, channel_name + query_tail[0])

            if not site_found[0]:
                site_found = find_website(channel_name, channel_name + query_tail[2])

            if not fb_found[0]:
                fb_found = find_fb(channel_name, channel_name + query_tail[3])

            if not twitter_found[0]:
                twitter_found = find_twitter(channel_name, channel_name + query_tail[4])

            wiki_found = find_wiki(channel_name, channel_name + query_tail[1])
            # ---

            # LinkedIn outweighs Wiki: both -> 3, LinkedIn only -> 2, Wiki only -> 1
            if linkedIn_found[0]:
                profiles = 3 if wiki_found[0] else 2
            elif wiki_found[0]:
                profiles = 1
            else:
                profiles = 0

            website = 2 if site_found[0] else 0
            social_media_presence = 1 if (fb_found[0] or twitter_found[0]) else 0

            channel_id = channel_IDs.get(channel_name)

            # Source scores ---
            source_scores.append([
                channel_id,  # channel_id
                channel_name,  # channel_name
                profiles,  # profiles
                website,  # website
                social_media_presence,  # social_media_presence
                np.nan  # vs
            ])
            # ---

            # Source links ---
            source_links.append([
                channel_id,
                channel_name,
                linkedIn_found[1],
                wiki_found[1],
                site_found[1],
                twitter_found[1] if twitter_found[0] else None,
                fb_found[1] if fb_found[0] else None
            ])
            # ---

            pbar.update(1)

    # Build the frames straight from the record lists: a NumPy round-trip
    # would coerce the mixed str/int/NaN rows to strings and would not accept
    # an empty list together with these column names.
    ss_df = pd.DataFrame(source_scores, columns=ss_cols)
    sl_df = pd.DataFrame(source_links, columns=sl_cols)

    ss_df.to_csv("source_scores.csv")
    sl_df.to_csv("source_links.csv")

    print("Complete.")


def topsis(scores, weights):
    """Rank rows with TOPSIS, treating every criterion as "higher is better".

    Each column named in ``weights`` is divided by its Euclidean norm and
    multiplied by its weight. The per-column maximum is the ideal best and the
    per-column minimum the ideal worst; a row's score is
    ``d_worst / (d_best + d_worst)`` using Euclidean distances to both.

    Args:
        scores: DataFrame holding one numeric column per key of ``weights``
            (other columns are ignored).
        weights: Mapping of column name to weight.

    Returns:
        ``pd.Series`` of scores, one per row of ``scores`` in row order, with
        a fresh ``0..n-1`` index (not ``scores.index``). A row's score is NaN
        when every row is identical on all weighted columns.

    Notes:
        Prints each column's normalisation denominator.
        A column that is entirely zero contributes 0 to every distance
        instead of turning all scores into NaN.
    """
    columns = list(weights.keys())
    matrix = scores[columns].to_numpy(dtype=float)
    weight_vector = np.array([weights.get(column) for column in columns], dtype=float)

    # Euclidean norm of every criterion column
    norms = np.sqrt((matrix ** 2).sum(axis=0))
    for column, denominator in zip(columns, norms):
        print(f"{column}: {float(denominator)}")

    if matrix.shape[0] == 0:
        return pd.Series(np.array([], dtype=float))

    # Normalize, then apply weights; a zero norm means the whole column is
    # zero, so dividing by 1 instead keeps those entries at 0.
    safe_norms = np.where(norms > 0, norms, 1.0)
    weighted = matrix / safe_norms * weight_vector

    ideal_best = weighted.max(axis=0)
    ideal_worst = weighted.min(axis=0)

    # Euclidean distance from ideal best / ideal worst
    dist_from_best = np.sqrt(((weighted - ideal_best) ** 2).sum(axis=1))
    dist_from_worst = np.sqrt(((weighted - ideal_worst) ** 2).sum(axis=1))

    total = dist_from_best + dist_from_worst
    performance_rank = np.full(total.shape, np.nan)
    np.divide(dist_from_worst, total, out=performance_rank, where=total > 0)

    return pd.Series(performance_rank)

---

Testing find_sources (WARNING: Be mindful of daily quota for Custom Search API queries)

In [None]:
# Working copy of the dataset without the leftover "Unnamed: 0" column.
main_df = df.drop(columns="Unnamed: 0")
channel_names = main_df["channel_name"].unique()
# channel name -> first channel_id seen for that name
channel_IDs = main_df.groupby("channel_name")["channel_id"].first().to_dict()

In [None]:
find_sources(channel_names, channel_IDs, main_df)

---

Testing TOPSIS Algorithm - Getting video rank

In [16]:
from math import sqrt

In [42]:
# Load the saved source scores and discard the leftover "Unnamed: 0" column.
ss_df = pd.read_csv("datasets/covid_philippines/source_scores.csv").drop(columns="Unnamed: 0")
ss_df.head()

Unnamed: 0,channel_id,channel_name,profiles,website,social_media_presence,vs
0,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,0,2,1,
1,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,3,2,1,
2,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,3,2,1,
3,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,3,2,1,
4,UC5664f6TkaeHgwBly50DWZQ,Manila Bulletin Online,3,0,1,


In [32]:
ss_df[["profiles", "website", "social_media_presence", "vs"]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
profiles,52.0,0.884615,1.247018,0.0,0.0,0.0,1.25,3.0
website,52.0,1.115385,1.003012,0.0,0.0,2.0,2.0,2.0
social_media_presence,52.0,0.596154,0.495454,0.0,0.0,1.0,1.0,1.0
vs,0.0,,,,,,,


In [33]:
# Working copy of the dataset without the leftover "Unnamed: 0" column.
main_df = df.drop(columns="Unnamed: 0")
main_df.head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_name,channel_dop,sub_count,total_videos
0,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,Subscribe to our YouTube channel for free here...,2020-04-02,324695,3287,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3820000,17017
1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",The World Tonight: The daily average of the Ph...,2023-12-18,2230,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295
2,DWxIvQlpJK8,Metro Manila to be placed on lockdown due to c...,Subscribe: https://www.youtube.com/@Rappler/\n...,2020-03-12,106524,743,19,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,2011-12-02,1980000,50574
3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,Dateline Philippines: Karmina Constantino talk...,2023-12-07,9692,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295
4,lw16DeB6zns,COVID-19 leads to significant job losses in th...,The coronavirus pandemic has led to significan...,2020-09-29,90583,712,87,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,2006-11-23,12800000,111311


In [34]:
# channel_id -> {"profiles": ..., "website": ..., "social_media_presence": ...}
# (if a channel_id appears more than once, the last row wins)
score_columns = ["profiles", "website", "social_media_presence"]
ss_dict = dict(
    zip(ss_df["channel_id"].tolist(), ss_df[score_columns].to_dict("records"))
)

In [35]:
# Give every video the source scores of its channel; channels that have no
# entry in ss_dict get 0.0 for all three criteria.
score_columns = ["profiles", "website", "social_media_presence"]
no_scores = dict.fromkeys(score_columns, 0.0)

main_df = main_df.assign(**{
    column: [
        ss_dict.get(channel_id, no_scores).get(column)
        for channel_id in main_df["channel_id"]
    ]
    for column in score_columns
})
main_df.head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_name,channel_dop,sub_count,total_videos,profiles,website,social_media_presence
0,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,Subscribe to our YouTube channel for free here...,2020-04-02,324695,3287,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3820000,17017,3.0,2.0,1.0
1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",The World Tonight: The daily average of the Ph...,2023-12-18,2230,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295,0.0,2.0,1.0
2,DWxIvQlpJK8,Metro Manila to be placed on lockdown due to c...,Subscribe: https://www.youtube.com/@Rappler/\n...,2020-03-12,106524,743,19,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,2011-12-02,1980000,50574,3.0,2.0,1.0
3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,Dateline Philippines: Karmina Constantino talk...,2023-12-07,9692,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1550000,72295,0.0,2.0,1.0
4,lw16DeB6zns,COVID-19 leads to significant job losses in th...,The coronavirus pandemic has led to significan...,2020-09-29,90583,712,87,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,2006-11-23,12800000,111311,3.0,2.0,1.0


Unnamed: 0,video_id,channel_name,profiles,website,social_media_presence,view_count,like_count,comment_count,sub_count,total_videos
0,aLZ85hb4wjE,South China Morning Post,3.0,2.0,1.0,324695,3287,619,3820000,17017
1,sYI97jv-pZg,ANC 24/7,0.0,2.0,1.0,2230,14,3,1550000,72295
2,DWxIvQlpJK8,Rappler,3.0,2.0,1.0,106524,743,19,1980000,50574
3,3YFpjgIQqEo,ANC 24/7,0.0,2.0,1.0,9692,80,17,1550000,72295
4,lw16DeB6zns,Al Jazeera English,3.0,2.0,1.0,90583,712,87,12800000,111311


In [27]:
weights = {
    "profiles": 0.40,
    "website": 0.25,
    "social_media_presence": 0.10,
    "view_count": 0.05,
    "like_count": 0.05,
    "comment_count": 0.05,
    "sub_count": 0.05,
    "total_videos": 0.05,
}

In [28]:
main_df["rank"] = topsis(main_df, weights)
main_df.sort_values(by="rank", ascending=False).head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_name,channel_dop,sub_count,total_videos,profiles,website,social_media_presence,rank
225,C780y-J3TzY,"WATCH: Robredo on SONA 2020, Philippines&#39; ...",Subscribe: https://www.youtube.com/@Rappler/\n...,2020-07-29,211653,7590,6100,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,2011-12-02,1980000,50574,3.0,2.0,1.0,0.696443
14,MQ5aYS4YFlQ,COVID-19 In Philippines: The Starving Urban Po...,The people in the Philippines are suffering fr...,2020-09-16,1332760,12856,6503,UC_Lnb8ZHqqgLbp-7hltuT9w,CNA Insider,2014-03-12,1630000,4178,0.0,2.0,1.0,0.560492
68,Ji47WRv2tQE,Philippines Cremate Coronavirus Victims,Crematoriums in the Philippines cremate corona...,2020-05-03,1362712,2905,230,UCVSNOxehfALut52NbkfRBaA,Voice of America,2008-03-14,3170000,46718,3.0,2.0,1.0,0.550248
50,3ZXR2eARmuQ,Philippines becomes first country to shut fina...,The Philippines has become the first country t...,2020-03-17,436724,4890,1289,UC83jt4dlz1Gjl58fzQrrKZg,CNA,2006-10-24,2340000,36251,3.0,2.0,1.0,0.549301
79,6DBFwIlT4fg,Coronavirus spreads to India and Philippines |...,The World Health Organization is meeting to de...,2020-01-30,1256285,8801,3144,UCknLrEdhRCp1aegoMqRaCZg,DW News,2007-09-04,5010000,33082,1.0,2.0,1.0,0.532291


---

In [39]:
weights = {
    "profiles": 0.40,
    "website": 0.25,
    "social_media_presence": 0.10,
    "view_count": 0.05,
    "like_count": 0.05,
    "comment_count": 0.05,
    "sub_count": 0.05,
    "total_videos": 0.05,
}

In [40]:
# .copy() makes test_df an independent frame, so adding the "rank" column in
# the next cell does not trigger pandas' SettingWithCopyWarning.
test_df = main_df[[
    "video_id", "channel_name", "profiles", "website",
    "social_media_presence", "view_count", "like_count",
    "comment_count", "sub_count", "total_videos"]].head().copy()
test_df

Unnamed: 0,video_id,channel_name,profiles,website,social_media_presence,view_count,like_count,comment_count,sub_count,total_videos
0,aLZ85hb4wjE,South China Morning Post,3.0,2.0,1.0,324695,3287,619,3820000,17017
1,sYI97jv-pZg,ANC 24/7,0.0,2.0,1.0,2230,14,3,1550000,72295
2,DWxIvQlpJK8,Rappler,3.0,2.0,1.0,106524,743,19,1980000,50574
3,3YFpjgIQqEo,ANC 24/7,0.0,2.0,1.0,9692,80,17,1550000,72295
4,lw16DeB6zns,Al Jazeera English,3.0,2.0,1.0,90583,712,87,12800000,111311


In [41]:
test_df["rank"] = topsis(test_df, weights)
test_df

profiles: 5.196152422706632
website: 4.47213595499958
social_media_presence: 2.23606797749979
view_count: 353664.2380196222
like_count: 3445.2805401011974
comment_count: 625.6109014395449
sub_count: 13680562.853917964
total_videos: 160282.81422535604


Unnamed: 0,video_id,channel_name,profiles,website,social_media_presence,view_count,like_count,comment_count,sub_count,total_videos,rank
0,aLZ85hb4wjE,South China Morning Post,3.0,2.0,1.0,324695,3287,619,3820000,17017,0.847683
1,sYI97jv-pZg,ANC 24/7,0.0,2.0,1.0,2230,14,3,1550000,72295,0.064802
2,DWxIvQlpJK8,Rappler,3.0,2.0,1.0,106524,743,19,1980000,50574,0.741484
3,3YFpjgIQqEo,ANC 24/7,0.0,2.0,1.0,9692,80,17,1550000,72295,0.06528
4,lw16DeB6zns,Al Jazeera English,3.0,2.0,1.0,90583,712,87,12800000,111311,0.783318


In [45]:
weights = {
    "profiles": 0.50,
    "website": 0.35,
    "social_media_presence": 0.15
}

In [47]:
# .copy() makes test_df an independent frame, so assigning the "vs" column
# does not trigger pandas' SettingWithCopyWarning.
test_df = ss_df[["channel_name", "profiles", "website", "social_media_presence"]].head().copy()
test_df["vs"] = topsis(test_df, weights)
test_df

profiles: 6.0
website: 4.0
social_media_presence: 2.23606797749979


Unnamed: 0,channel_name,profiles,website,social_media_presence,vs
0,ANC 24/7,0,2,1,0.411765
1,South China Morning Post,3,2,1,1.0
2,Rappler,3,2,1,1.0
3,Al Jazeera English,3,2,1,1.0
4,Manila Bulletin Online,3,0,1,0.588235


In [49]:
# Load the saved source links and discard the leftover "Unnamed: 0" column.
sl_df = pd.read_csv("datasets/covid_philippines/source_links.csv").drop(columns="Unnamed: 0")
sl_df.head()

Unnamed: 0,channel_id,channel_name,LinkedIn,Wiki,Website,Twitter,Facebook
0,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,,,https://news.abs-cbn.com/anc,https://twitter.com/ancalerts,https://www.facebook.com/ancalerts
1,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,https://www.linkedin.com/company/south-china-m...,https://en.wikipedia.org/wiki/South_China_Morn...,https://scmp.com,https://twitter.com/scmpnews,https://facebook.com/scmp
2,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,https://www.linkedin.com/company/rappler,https://en.wikipedia.org/wiki/Rappler,https://www.rappler.com/,,https://www.facebook.com/rapplerdotcom/
3,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,https://www.linkedin.com/company/aljazeera,https://en.wikipedia.org/wiki/Al_Jazeera_English,https://www.aljazeera.com,https://twitter.com/AJEnglish,https://www.facebook.com/aljazeera
4,UC5664f6TkaeHgwBly50DWZQ,Manila Bulletin Online,https://www.linkedin.com/company/manilabulleti...,https://en.wikipedia.org/wiki/Manila_Bulletin,,,https://www.facebook.com/manilabulletin/


In [53]:
similarity = round(jaro_similarity("ANC 24/7", "ABS-CBN News Channel"), 2)
similarity

0.48

---

Checking video descriptions to look for profile links

In [8]:
# Working copy of the dataset without the leftover "Unnamed: 0" column.
main_df = df.drop(columns="Unnamed: 0")
channel_names = main_df["channel_name"].unique()
# channel name -> first channel_id seen for that name
channel_IDs = main_df.groupby("channel_name")["channel_id"].first().to_dict()

In [29]:
# Each pattern captures the URL that follows a "<label>: " style prefix in a description.
linkedIn_pattern = r"(?<=(Linked(in|In)\:\s))https:\/\/(www\.)?linkedin\.com\/(company|in)\/(\w|\w[-_])+\/"
website_pattern = r"(?<=(W|w)ebsite((\:)?\s|\sat\s))https:\/\/\w+(\.(\w|\w[-_])+)?\.\w{3}(\.\w{2})?(\/(\w|\w[-_])+)?"
fb_pattern = r"(?<=((F|f)acebook\:\s))https:\/\/(www\.)?facebook\.com\/(\w|\w[-_])+"
twitter_pattern = r"(?<=((T|t)witter\:\s))https:\/\/(www\.)?(twitter|x)\.com\/(\w|\w[-_])+"

for channel_name in channel_names:
    linkedIn_found = check_desc(channel_name, main_df, linkedIn_pattern)
    website_found = check_desc(channel_name, main_df, website_pattern)
    fb_found = check_desc(channel_name, main_df, fb_pattern)
    twitter_found = check_desc(channel_name, main_df, twitter_pattern)

    # One report per channel: a header line, one line per link type, then a blank line.
    print(f"[{channel_name}]")
    report_rows = (
        ("Website", website_found),
        ("LinkedIn", linkedIn_found),
        ("Facebook", fb_found),
        ("Twitter", twitter_found),
    )
    for label, (was_found, url) in report_rows:
        print(f"{label}: {was_found} @ {url}")
    print()

[South China Morning Post]
Website: True @ https://scmp.com
LinkedIn: True @ https://www.linkedin.com/company/south-china-morning-post/
Facebook: True @ https://facebook.com/scmp
Twitter: True @ https://twitter.com/scmpnews

[ANC 24/7]
Website: True @ https://news.abs-cbn.com/anc
LinkedIn: False @ None
Facebook: True @ https://www.facebook.com/ancalerts
Twitter: True @ https://twitter.com/ancalerts

[Rappler]
Website: False @ None
LinkedIn: False @ None
Facebook: False @ None
Twitter: False @ None

[Al Jazeera English]
Website: True @ https://www.aljazeera.com
LinkedIn: False @ None
Facebook: True @ https://www.facebook.com/aljazeera
Twitter: True @ https://twitter.com/AJEnglish

[INQUIRER.net]
Website: False @ None
LinkedIn: False @ None
Facebook: True @ https://facebook.com/inquirerdotnet
Twitter: True @ https://twitter.com/inquirerdotnet

[CNA Insider]
Website: True @ https://cna.asi
LinkedIn: False @ None
Facebook: True @ https://www.facebook.com/cnainsider
Twitter: False @ None

[

---