In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import regex as re
from googleapiclient.discovery import build
from jellyfish import jaro_similarity
from sklearn import preprocessing
from tqdm import tqdm

In [2]:
# Name of the dataset; the CSV is expected at datasets/<filename>/<filename>.csv
filename = "covid_philippines"
path = f"datasets/{filename}/{filename}.csv"

df = pd.read_csv(path)

# Convenience views over the raw frame
unique_channels = df["channel_title"].unique()
video_channel = df[["video_id", "video_title", "channel_id", "channel_title"]]
view_like_comment = np.array(df[["view_count", "like_count", "comment_count"]])

# Suffixes appended to a channel name to form one search query per source type
query_tail = [" LinkedIn", " Wiki", " Official Website", " Facebook", " Twitter"]

In [3]:
unique_channels

array(['CNN Philippines', 'ANC 24/7', 'South China Morning Post',
       'Rappler', 'INQUIRER.net', 'CNA Insider', 'Al Jazeera English',
       'CNA', 'ABS-CBN News', 'Manila Bulletin Online',
       'UNTV News and Rescue', 'BBC News', 'Global News', 'WION',
       'The Telegraph', 'UNICEF USA', 'Reuters', 'DW News',
       'UNICEF Philippines', 'GMA Integrated News', 'FRANCE 24 English',
       'Voice of America', 'Bloomberg Quicktake', 'World Bank',
       'The Straits Times', 'The Star', 'Behind Philippines', 'FEATR',
       'Hindustan Times', 'Gulf News', 'Diseases Simplified', 'TVUP',
       'Bongbong Marcos', 'Esco Lifesciences Group', 'Doctor Wessam Atif',
       'Doc Fate Cunanan', 'Asian Development Bank',
       'Adventures in America', 'Philstar News', 'HeyoLeah',
       'MedCram - Medical Lectures Explained CLEARLY', 'FinnSnow',
       'Cold Chain Innovation Hub Philippines'], dtype=object)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_title,channel_dop,sub_count,total_videos
0,0,m3P-bmt3Uqw,JN.1 COVID-19 subvariant causing spike in cases,'An infectious disease expert says the new COV...,2023-12-25,6810,46,19,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903
1,1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",'The World Tonight: The daily average of the P...,2023-12-18,2132,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
2,2,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,'Subscribe to our YouTube channel for free her...,2020-04-02,323944,3285,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3810000,16958
3,3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,'Dateline Philippines: Karmina Constantino tal...,2023-12-07,9464,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
4,4,n-j5rK1XOUc,W.H.O.: COVID-19 remains as health threat | Ne...,'Government agencies are set to convene after ...,2023-05-08,15037,90,25,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903


In [5]:
# One search is issued per (channel name, query type) pair, so the total is
# simply the product of the two counts. Computing it directly also avoids
# leaving loop variables such as `channel_name` behind in the kernel namespace.
num_query = len(unique_channels) * len(query_tail)

print("Total number of queries: " + str(num_query))

Total number of queries: 215


## NOTE
Verifiability score is computed <u>PER CHANNEL</u> <br>
<br>
Ranking is computed <u>PER VIDEO</u>

---

Google resource initialization
- The query and channel name are declared manually to simulate the search process for a single channel name
- The actual loop for searching and verifying across a dataset of videos will be done in the .py file

In [6]:
# The API key is read from the environment so that the secret is never stored
# in (or committed with) the notebook. Export GOOGLE_API_KEY before starting
# the kernel. Any key that was previously pasted into this cell should be
# treated as exposed and rotated.
apiKey = os.environ.get("GOOGLE_API_KEY")
if not apiKey:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Custom Search API key."
    )

# Search engine ID (can be overridden with GOOGLE_CSE_ID)
cseKey = os.environ.get("GOOGLE_CSE_ID", "23c1c70a203ac4852")

google_resource = build("customsearch", "v1", developerKey=apiKey).cse()

In [None]:
channel_name = "CNN Philippines"

---

Finding a LinkedIn Profile

In [None]:
query = channel_name + query_tail[0]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+' # Used to find specific profile links

linkedIn = False

# A response may hold fewer than ten results, or no "items" key at all, so walk
# whatever is actually present instead of indexing a fixed range(0, 10).
items = response.get("items") or []
for i, item in enumerate(items):
    if re.search(pattern, item.get("formattedUrl") or "") is None:
        continue

    # Get profile name from search result; skip titles the name pattern cannot parse
    name_match = re.search(r'\w+\s(\w+)?', item.get("htmlTitle") or "")
    if name_match is None:
        continue
    profile_name = name_match.group()

    # Get similarity between found profile name and channel name
    # This is to prevent false positives in finding a LinkedIn profile
    similarity = round(jaro_similarity(channel_name, profile_name), 2)

    # If n% similar, consider LinkedIn profile as found
    if similarity >= 0.80:
        linkedIn = True
        break

if linkedIn:
    print(str(linkedIn) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + profile_name + " @ " + item.get("link"))
else:
    print("No LinkedIn profile found.")

---

Finding a Wiki page

In [None]:
query = channel_name + query_tail[1]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# The dot after the two-letter language code is escaped so it only matches a literal "."
pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

wiki = False

# A response may hold fewer than ten results, or no "items" key at all, so walk
# whatever is actually present instead of indexing a fixed range(0, 10).
items = response.get("items") or []
for i, item in enumerate(items):
    if re.search(pattern, item.get("formattedUrl") or "") is None:
        continue

    # Get Wiki page name from search result; skip titles without the " - Wikipedia" suffix
    title = item.get("title") or ""
    name_match = re.search(r'.+(?=\s-\sWikipedia)', title)
    if name_match is None:
        continue
    page_name = name_match.group()

    # Get similarity between found Wiki page name and channel name
    # This is to prevent false positives in finding a Wiki page
    similarity = round(jaro_similarity(channel_name, page_name), 2)

    # If n% similar, consider Wiki page as found
    if similarity >= 0.80:
        wiki = True
        break

if wiki:
    print(str(wiki) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: "+ title + " @ " + item.get("link"))
else:
    print("No Wiki page found.")

---

Finding a website

In [None]:
query = channel_name + query_tail[2]

In [None]:
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
website = False

# RegEx to exclude YouTube, LinkedIn, and Wikipedia pages (built once, outside the loop)
pattern = r'https\:\/\/(\w{2}\.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

# A response may hold fewer than ten results, or no "items" key at all, so walk
# whatever is actually present instead of indexing a fixed range(0, 10).
for i, item in enumerate(response.get("items") or []):
    title = item.get("title") or ""
    if channel_name.lower() not in title.lower():
        continue

    link = item.get("link") or ""
    if re.search(pattern, link) is None:
        # The first result among the filtered at this point is MOST LIKELY the official website
        website = True
        break

if website:
    print(str(website) + ", at index [" + str(i) + "]")
    print("Link found: " + title + " @ " + link)
else:
    print("No official website found.")

---

Finding social media presence <br>
Limited to these social media sites: <br>
- Facebook
- Twitter

In [None]:
query = channel_name + query_tail[3]

In [None]:
fb_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
query = channel_name + query_tail[4]

In [None]:
twitter_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# Searching for a Facebook profile
facebook = False
pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

# A response may hold fewer than ten results, or no "items" key at all, so walk
# whatever is actually present instead of indexing a fixed range(0, 10).
for i, item in enumerate(fb_response.get("items") or []):
    link = item.get("formattedUrl") or ""
    if re.search(pattern, link) is None:
        continue

    # Compare the result title with the channel name to filter out unrelated pages
    title = item.get("title") or ""
    similarity = round(jaro_similarity(channel_name, title), 2)

    if similarity >= 0.80:
        facebook = True
        break

if facebook:
    print(str(facebook) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + title + " @ " + link)
else:
    print("No Facebook profile found.")

In [None]:
# Searching for a Twitter profile
twitter = False
pattern = r'https\:\/\/(twitter|x)\.com\/.+'

# A response may hold fewer than ten results, or no "items" key at all, so walk
# whatever is actually present instead of indexing a fixed range(0, 10).
for i, item in enumerate(twitter_response.get("items") or []):
    link = item.get("formattedUrl") or ""
    if re.search(pattern, link) is None:
        continue

    # Compare the result title with the channel name to filter out unrelated accounts
    title = item.get("title") or ""
    similarity = round(jaro_similarity(channel_name, title), 2)

    if similarity >= 0.80:
        twitter = True
        break

if twitter:
    print(str(twitter) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    # The stray expression that used to trail this call made the whole cell a syntax error
    print("Link found: " + title + " @ " + link)
else:
    print("No Twitter profile found.")

---

Compiling everything

In [None]:
def find_linkedIn(channel_name, query):
    """Search for a LinkedIn profile that belongs to ``channel_name``.

    Runs ``query`` through the module-level ``google_resource`` search client
    and scans the returned results for a LinkedIn company/person URL whose
    title is at least 80% Jaro-similar (case-insensitive) to the channel name.

    Args:
        channel_name: Channel title to verify.
        query: Search string to submit (the channel name plus a query tail).

    Returns:
        ``(True, url)`` for the first matching result, otherwise
        ``(False, None)``.
    """
    li_response = google_resource.list(
        q=query,
        cx=cseKey
    ).execute()

    pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+'  # Used to find specific profile links

    # Iterate over the results that are actually present: a response may carry
    # fewer than ten entries, or no "items" key at all, and indexing a fixed
    # range(0, 10) would then raise instead of reporting "not found".
    for item in li_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        # Get profile name from search result
        match = re.search(r'\w+\s(\w+)?', item.get("htmlTitle") or "")
        if match is None:
            continue
        profile_name = match.group()

        # Similarity check guards against false positives (unrelated profiles)
        similarity = round(jaro_similarity(channel_name.lower(), profile_name.lower()), 2)

        # If n% similar, consider LinkedIn profile as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_wiki(channel_name, query):
    """Search for a Wikipedia page that belongs to ``channel_name``.

    Runs ``query`` through the module-level ``google_resource`` search client
    and scans the returned results for a ``<xx>.wikipedia.org/wiki/`` URL whose
    page name (the title text before " - Wikipedia") is at least 80%
    Jaro-similar (case-insensitive) to the channel name.

    Args:
        channel_name: Channel title to verify.
        query: Search string to submit (the channel name plus a query tail).

    Returns:
        ``(True, url)`` for the first matching result, otherwise
        ``(False, None)``.
    """
    wiki_response = google_resource.list(
        q=query,
        cx=cseKey
    ).execute()

    # The dot after the language code is escaped so it matches a literal "." only
    pattern = r'https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+'

    # Iterate over the results that are actually present: a response may carry
    # fewer than ten entries, or no "items" key at all, and indexing a fixed
    # range(0, 10) would then raise instead of reporting "not found".
    for item in wiki_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        # Get Wiki page name from search result
        title = item.get("title") or ""
        match = re.search(r'.+(?=\s-\sWikipedia)', title)
        if match is None:
            continue
        page_name = match.group()

        # Similarity check guards against false positives (unrelated pages)
        similarity = round(jaro_similarity(channel_name.lower(), page_name.lower()), 2)

        # If n% similar, consider Wiki page as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_website(channel_name, query):
    """Search for the official website of ``channel_name``.

    Runs ``query`` through the module-level ``google_resource`` search client
    and returns the first result whose title contains the channel name
    (case-insensitive) and whose link is not a Wikipedia, YouTube or LinkedIn
    page.

    Args:
        channel_name: Channel title to verify.
        query: Search string to submit (the channel name plus a query tail).

    Returns:
        ``(True, url)`` for the first accepted result, otherwise
        ``(False, None)``.
    """
    website_response = google_resource.list(
        q=query,
        cx=cseKey
    ).execute()

    # RegEx to exclude YouTube, LinkedIn, and Wikipedia pages
    # NOTE(review): other social-media domains (e.g. facebook.com, instagram.com)
    # are not excluded, and the saved source_links output shows such links being
    # reported as the website for some channels; consider extending the pattern.
    pattern = r'https\:\/\/(\w{2}\.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

    # Iterate over the results that are actually present: a response may carry
    # fewer than ten entries, or no "items" key at all, and indexing a fixed
    # range(0, 10) would then raise instead of reporting "not found".
    for item in website_response.get("items") or []:
        title = item.get("title") or ""
        link = item.get("link") or ""
        if channel_name.lower() not in title.lower():
            continue
        if re.search(pattern, link) is None:
            # The first result among the filtered at this point is MOST LIKELY the official website
            return True, link

    return False, None


def find_fb(channel_name, query):
    """Search for a Facebook page that belongs to ``channel_name``.

    Runs ``query`` through the module-level ``google_resource`` search client
    and scans the returned results for a ``www.facebook.com/<name>/`` URL whose
    result title is at least 80% Jaro-similar (case-insensitive) to the
    channel name.

    Args:
        channel_name: Channel title to verify.
        query: Search string to submit (the channel name plus a query tail).

    Returns:
        ``(True, url)`` for the first matching result, otherwise
        ``(False, None)``.
    """
    fb_response = google_resource.list(
        q=query,
        cx=cseKey
    ).execute()

    pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

    # Iterate over the results that are actually present: a response may carry
    # fewer than ten entries, or no "items" key at all, and indexing a fixed
    # range(0, 10) would then raise instead of reporting "not found".
    for item in fb_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        title = item.get("title") or ""
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def find_twitter(channel_name, query):
    """Search for a Twitter/X account that belongs to ``channel_name``.

    Runs ``query`` through the module-level ``google_resource`` search client
    and scans the returned results for a ``twitter.com`` or ``x.com`` URL whose
    result title is at least 80% Jaro-similar (case-insensitive) to the
    channel name.

    Args:
        channel_name: Channel title to verify.
        query: Search string to submit (the channel name plus a query tail).

    Returns:
        ``(True, url)`` for the first matching result, otherwise
        ``(False, None)``.
    """
    twitter_response = google_resource.list(
        q=query,
        cx=cseKey
    ).execute()

    pattern = r'https\:\/\/(twitter|x)\.com\/.+'

    # Iterate over the results that are actually present: a response may carry
    # fewer than ten entries, or no "items" key at all, and indexing a fixed
    # range(0, 10) would then raise instead of reporting "not found".
    for item in twitter_response.get("items") or []:
        link = item.get("formattedUrl") or ""
        if re.search(pattern, link) is None:
            continue

        title = item.get("title") or ""
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def find_sources(channel_names, channel_IDs):
    """Look up verifiable sources for every channel and save the results.

    For each channel name the five finder functions (LinkedIn, Wiki, official
    website, Facebook, Twitter) are called, their outcomes are converted into
    source scores, and two CSV files are written to the working directory:

    * ``source_scores.csv`` - one row per channel with the ``profiles``,
      ``website`` and ``social_media_presence`` scores; ``vs`` is left as NaN
      here.
    * ``source_links.csv`` - one row per channel with the link found for each
      source type (empty where nothing was found).

    Args:
        channel_names: Iterable (with ``len``) of channel titles to process.
        channel_IDs: Mapping of channel title -> channel id.

    Returns:
        None. The results are only written to disk.
    """
    ss_cols = [
        "channel_id", "channel_title",
        "profiles", "website", "social_media_presence",
        "vs"
    ]
    sl_cols = [
        "channel_id", "channel_title",
        "LinkedIn", "Wiki", "Website",
        "Twitter", "Facebook"
    ]
    query_tail = [
        " LinkedIn",
        " Wiki",
        " Official Website",
        " Facebook",
        " Twitter"
    ]

    source_scores = []
    source_links = []

    # The context manager closes the progress bar even if a search call raises
    with tqdm(total=len(channel_names)) as pbar:
        pbar.set_description("Finding sources...")

        for channel_name in channel_names:
            linkedIn_found, linkedIn_link = find_linkedIn(channel_name, channel_name + query_tail[0])
            wiki_found, wiki_link = find_wiki(channel_name, channel_name + query_tail[1])
            site_found, site_link = find_website(channel_name, channel_name + query_tail[2])
            fb_found, fb_link = find_fb(channel_name, channel_name + query_tail[3])
            twitter_found, twitter_link = find_twitter(channel_name, channel_name + query_tail[4])

            # LinkedIn is worth 2 points and a Wiki page 1 point, so having
            # both yields 3 and having neither yields 0.
            profiles = 2 * int(bool(linkedIn_found)) + int(bool(wiki_found))
            website = 2 if site_found else 0
            social_media_presence = 1 if (fb_found or twitter_found) else 0

            channel_id = channel_IDs.get(channel_name)

            source_scores.append([
                channel_id,  # channel_id
                channel_name,  # channel_title
                profiles,  # profiles
                website,  # website
                social_media_presence,  # social_media_presence
                np.nan  # vs
            ])

            # Each finder already returns None as its link when nothing was
            # found, so the links can be stored as returned.
            source_links.append([
                channel_id,
                channel_name,
                linkedIn_link,
                wiki_link,
                site_link,
                twitter_link,
                fb_link
            ])

            pbar.update(1)

    # Build the frames straight from the row lists. Going through np.array first
    # coerces the mixed str/int/float rows to a single dtype (turning the scores
    # into strings) and fails on an empty channel list.
    ss_df = pd.DataFrame(source_scores, columns=ss_cols)
    sl_df = pd.DataFrame(source_links, columns=sl_cols)

    ss_df.to_csv("source_scores.csv")
    sl_df.to_csv("source_links.csv")

In [None]:
# Distinct channel titles found in the dataset
channel_names = df["channel_title"].unique()

# Map every channel title to the first channel_id recorded for it
channel_IDs = df.groupby("channel_title")["channel_id"].first().to_dict()

In [None]:
find_sources(channel_names[0:5], channel_IDs)

---

In [17]:
from math import sqrt

In [18]:
# Read the saved per-channel source scores and drop the "Unnamed: 0" column
ss_df = pd.read_csv("datasets/covid_philippines/source_scores.csv").drop(columns="Unnamed: 0")
ss_df

Unnamed: 0,channel_id,channel_title,profiles,website,social_media_presence,vs
0,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,1,2,1,0.456696
1,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,0,2,1,0.310572
2,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,1,2,1,0.456696
3,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,3,2,1,1.0
4,UCvRAX-ujvZ0eTMLGG2vki9w,INQUIRER.net,1,2,1,0.456696
5,UC_Lnb8ZHqqgLbp-7hltuT9w,CNA Insider,0,2,1,0.310572
6,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,3,2,1,1.0
7,UC83jt4dlz1Gjl58fzQrrKZg,CNA,3,2,1,1.0
8,UCE2606prvXQc_noEqKxVJXA,ABS-CBN News,1,2,1,0.456696
9,UC5664f6TkaeHgwBly50DWZQ,Manila Bulletin Online,3,0,1,0.689428


In [19]:
# Read the saved per-channel source links and drop the "Unnamed: 0" column
sl_df = pd.read_csv("datasets/covid_philippines/source_links.csv").drop(columns="Unnamed: 0")
sl_df

Unnamed: 0,channel_id,channel_title,LinkedIn,Wiki,Website,Twitter,Facebook
0,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,,https://en.wikipedia.org/wiki/CNN_Philippines,https://cnnphilippines.com/news,,https://www.facebook.com/CNNPhilippines/
1,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,,,https://www.facebook.com/ANCalerts/,,https://www.facebook.com/ANCalerts/
2,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,,https://en.wikipedia.org/wiki/South_China_Morn...,https://www.scmp.com/,https://twitter.com/SCMPNews,https://www.facebook.com/scmp/
3,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,https://www.linkedin.com/company/rappler,https://en.wikipedia.org/wiki/Rappler,https://www.rappler.com/,,https://www.facebook.com/rapplerdotcom/
4,UCvRAX-ujvZ0eTMLGG2vki9w,INQUIRER.net,,https://en.wikipedia.org/wiki/Inquirer,https://www.inquirer.net/,,https://www.facebook.com/inquirerdotnet/
5,UC_Lnb8ZHqqgLbp-7hltuT9w,CNA Insider,,,https://www.channelnewsasia.com/cna-insider,,https://www.facebook.com/cnainsider/
6,UCNye-wNBqNL5ZzHSJj3l8Bg,Al Jazeera English,https://www.linkedin.com/company/aljazeera,https://en.wikipedia.org/wiki/Al_Jazeera_English,https://www.instagram.com/aljazeeraenglish/?hl=en,https://twitter.com/ajenglish?lang=en,https://www.facebook.com/aljazeera/
7,UC83jt4dlz1Gjl58fzQrrKZg,CNA,https://www.linkedin.com/company/cna_3,https://en.wikipedia.org/wiki/CNA,https://www.cna.com/,,https://www.facebook.com/ChannelNewsAsia/
8,UCE2606prvXQc_noEqKxVJXA,ABS-CBN News,,https://en.wikipedia.org/wiki/ABS-CBN,https://news.abs-cbn.com/,https://twitter.com/abscbnnews?lang=en,https://www.facebook.com/abscbnNEWS/
9,UC5664f6TkaeHgwBly50DWZQ,Manila Bulletin Online,https://www.linkedin.com/company/manilabulleti...,https://en.wikipedia.org/wiki/Manila_Bulletin,,,https://www.facebook.com/manilabulletin/


In [20]:
ss_df[["profiles", "website", "social_media_presence", "vs"]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
profiles,12.0,1.416667,1.240112,0.0,0.75,1.0,3.0,3.0
website,12.0,1.833333,0.57735,0.0,2.0,2.0,2.0,2.0
social_media_presence,12.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
vs,12.0,0.575385,0.275469,0.310572,0.420165,0.456696,0.767071,1.0


In [66]:
main_df = df.drop("Unnamed: 0", axis=1)
main_df.head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_title,channel_dop,sub_count,total_videos
0,m3P-bmt3Uqw,JN.1 COVID-19 subvariant causing spike in cases,'An infectious disease expert says the new COV...,2023-12-25,6810,46,19,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903
1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",'The World Tonight: The daily average of the P...,2023-12-18,2132,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
2,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,'Subscribe to our YouTube channel for free her...,2020-04-02,323944,3285,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3810000,16958
3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,'Dateline Philippines: Karmina Constantino tal...,2023-12-07,9464,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172
4,n-j5rK1XOUc,W.H.O.: COVID-19 remains as health threat | Ne...,'Government agencies are set to convene after ...,2023-05-08,15037,90,25,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903


In [67]:
# channel_id -> {"profiles", "website", "social_media_presence"} lookup table
# built from the per-channel source scores (a later duplicate id overwrites
# an earlier one).
ss_dict = {
    channel_id: {
        "profiles": profiles,
        "website": website,
        "social_media_presence": presence,
    }
    for channel_id, profiles, website, presence in zip(
        ss_df["channel_id"],
        ss_df["profiles"],
        ss_df["website"],
        ss_df["social_media_presence"],
    )
}

In [68]:
# Attach the per-channel source scores to every video row.
# Each score column is looked up by channel_id directly on the frame, which
# avoids converting the whole DataFrame to nested dicts and back and does not
# depend on the index being the default 0..n-1 range.
score_columns = ["profiles", "website", "social_media_presence"]

# Channels without a source-score entry get 0.0 for every score
no_scores = dict.fromkeys(score_columns, 0.0)

main_df = main_df.assign(**{
    column: main_df["channel_id"].map(
        lambda channel_id, column=column: ss_dict.get(channel_id, no_scores).get(column)
    )
    for column in score_columns
})
main_df.head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_title,channel_dop,sub_count,total_videos,profiles,website,social_media_presence
0,m3P-bmt3Uqw,JN.1 COVID-19 subvariant causing spike in cases,'An infectious disease expert says the new COV...,2023-12-25,6810,46,19,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903,1.0,2.0,1.0
1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",'The World Tonight: The daily average of the P...,2023-12-18,2132,14,3,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172,0.0,2.0,1.0
2,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,'Subscribe to our YouTube channel for free her...,2020-04-02,323944,3285,619,UC4SUWizzKc1tptprBkWjX2Q,South China Morning Post,2007-01-18,3810000,16958,1.0,2.0,1.0
3,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,'Dateline Philippines: Karmina Constantino tal...,2023-12-07,9464,80,17,UCvi6hEzLM-Z_unKPSuuzKvg,ANC 24/7,2010-01-29,1520000,71172,0.0,2.0,1.0
4,n-j5rK1XOUc,W.H.O.: COVID-19 remains as health threat | Ne...,'Government agencies are set to convene after ...,2023-05-08,15037,90,25,UCj6spMO3ybZPobE0T5perHA,CNN Philippines,2015-03-16,1400000,22903,1.0,2.0,1.0


In [69]:
# Criterion weights passed to topsis(); each key must be a column of the
# scored frame. The values below sum to 1.0.
weights = {
    # Source-verifiability criteria (per channel)
    "profiles": 0.35,
    "website": 0.20,
    "social_media_presence": 0.10,
    # Video and channel statistics
    "view_count": 0.15,
    "like_count": 0.05,
    "comment_count": 0.05,
    "sub_count": 0.05,
    "total_videos": 0.05,
}

In [72]:
def topsis(scores, weights):
    """Rank the rows of ``scores`` with TOPSIS.

    Every key of ``weights`` names a numeric column of ``scores``. Each such
    column is vector-normalised (divided by the square root of its sum of
    squares) and multiplied by its weight. For every column the maximum is
    taken as the ideal best and the minimum as the ideal worst, i.e. all
    criteria are treated as "higher is better". A row's score is

        distance_to_worst / (distance_to_best + distance_to_worst)

    using Euclidean distances, so it lies between 0 and 1.

    Args:
        scores: DataFrame holding at least the columns named in ``weights``.
        weights: Mapping of column name -> weight.

    Returns:
        pandas Series of scores, one per row, in row order, indexed 0..n-1.
        An empty ``scores`` frame yields an empty Series.

    Notes:
        * A column that is entirely zero cannot be normalised; it is left at
          zero and therefore does not influence the result.
        * When a row is equally far (distance 0) from both ideals - e.g. a
          single row, or all rows identical - the ratio is 0/0; 0.0 is
          reported for it instead of raising.
    """
    criteria = list(weights.keys())
    matrix = scores[criteria].to_numpy(dtype=float)

    if matrix.shape[0] == 0:
        return pd.Series(dtype=float)

    weight_vector = np.array([weights[column] for column in criteria], dtype=float)

    # Normalize scores column-wise, guarding against an all-zero column
    column_norms = np.sqrt((matrix ** 2).sum(axis=0))
    safe_norms = np.where(column_norms == 0, 1.0, column_norms)

    # Apply weight
    weighted = (matrix / safe_norms) * weight_vector

    ideal_best = weighted.max(axis=0)
    ideal_worst = weighted.min(axis=0)

    # Euclidean distance of every row from the ideal best / ideal worst
    dist_from_best = np.sqrt(((weighted - ideal_best) ** 2).sum(axis=1))
    dist_from_worst = np.sqrt(((weighted - ideal_worst) ** 2).sum(axis=1))

    # Relative closeness to the ideal; 0/0 (both distances zero) becomes 0.0
    total_distance = dist_from_best + dist_from_worst
    performance_rank = np.divide(
        dist_from_worst,
        total_distance,
        out=np.zeros_like(total_distance),
        where=total_distance != 0,
    )

    return pd.Series(performance_rank)

In [78]:
main_df["rank"] = topsis(main_df, weights)
main_df.sort_values(by="rank", ascending=False).head()

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,channel_title,channel_dop,sub_count,total_videos,profiles,website,social_media_presence,rank
23,E56W-5xVOss,Banta ng COVID 19 | Rated K,"'""Rated K"" breaks down the COVID-19 cases in t...",2020-03-15,1517054,10483,947,UCE2606prvXQc_noEqKxVJXA,ABS-CBN News,2009-10-22,15700000,220721,1.0,2.0,1.0,0.66261
28,MQ5aYS4YFlQ,COVID-19 In Philippines: The Starving Urban Po...,'The people in the Philippines are suffering f...,2020-09-16,1331557,12849,6504,UC_Lnb8ZHqqgLbp-7hltuT9w,CNA Insider,2014-03-12,1620000,4158,0.0,2.0,1.0,0.650895
98,6DBFwIlT4fg,Coronavirus spreads to India and Philippines |...,'The World Health Organization is meeting to d...,2020-01-30,1256280,8804,3144,UCknLrEdhRCp1aegoMqRaCZg,DW News,2007-09-04,4990000,32944,0.0,0.0,0.0,0.564409
75,Ji47WRv2tQE,Philippines Cremate Coronavirus Victims,"""Crematoriums in the Philippines cremate coron...",2020-05-03,1362652,2907,230,UCVSNOxehfALut52NbkfRBaA,Voice of America,2008-03-14,3150000,46591,0.0,0.0,0.0,0.523426
242,C780y-J3TzY,"WATCH: Robredo on SONA 2020, Philippines&#39; ...","""Subscribe: https://www.youtube.com/@Rappler/\...",2020-07-29,211643,7592,6101,UCdnZdQxYXnbN4uWJg96oGxw,Rappler,2011-12-02,1950000,50434,3.0,2.0,1.0,0.475464
