In [None]:
import os

import numpy as np
import pandas as pd
import regex as re
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from jellyfish import jaro_similarity
from sklearn import preprocessing
from tqdm import tqdm

In [None]:
# Dataset name; later cells use it to build paths under datasets/<filename>/.
filename = "covid_philippines"

In [None]:
# Main dataset: datasets/<filename>/<filename>.csv
path = "datasets/" + filename + "/" + filename + ".csv"
# "Unnamed: 0" is dropped on load (presumably the index column written by an
# earlier to_csv -- TODO confirm).
df = pd.read_csv(path).drop("Unnamed: 0", axis=1)

# Suffixes appended to a channel name to build each search query.
# Other cells index this list by position, so the order must not change:
# 0 = LinkedIn, 1 = Wiki, 2 = Official Website, 3 = Facebook, 4 = Twitter.
query_tail = [
    " LinkedIn",
    " Wiki",
    " Official Website",
    " Facebook",
    " Twitter"
]

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Per-channel dataset: datasets/<filename>/<filename>_channels.csv
path = "datasets/" + filename + "/" + filename + "_channels.csv"
channel_df = pd.read_csv(path).drop("Unnamed: 0", axis=1)
channel_df.head()

In [None]:
unique_channels = channel_df[["channel_id", "channel_name"]]

# One query is issued per (channel, query type) pair, so the total is simply
# the number of channel rows times the number of query suffixes.
num_query = len(unique_channels) * len(query_tail)

print("Total number of queries: " + str(num_query))

## NOTE
Verifiability score is computed <u>PER CHANNEL</u> <br>
<br>
Ranking is computed <u>PER VIDEO</u>

---

Google resource initialization
- The query and channel name are declared manually to simulate the search process for a single channel name
- The actual loop for searching and verifying across a dataset of videos will be done in the .py file

In [None]:
# The API key is read from the environment so that it is never committed with
# the notebook. Export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): a literal key used to live in this cell; it should be treated
# as leaked and revoked.
apiKey = os.environ.get("GOOGLE_API_KEY")
if not apiKey:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable to your Custom Search API key.")

# Search engine ID
cseKey = "23c1c70a203ac4852"

# Custom Search client; .cse() exposes the list() call used by every search below.
google_resource = build("customsearch", "v1", developerKey=apiKey).cse()

In [None]:
# Channel used by the single-channel walkthrough cells that follow.
channel_name = "CNN Philippines"

---

Finding a LinkedIn Profile

In [None]:
# Query string for the LinkedIn lookup (query_tail[0] is " LinkedIn").
query = channel_name + query_tail[0]

In [None]:
# Run the current query through the Custom Search engine and keep the response.
# Each execution is one API request.
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+' # Used to find specific profile links

linkedIn = False
# Walk the results that actually came back (at most ten): the response may
# hold fewer than ten items, or no "items" key at all when nothing matched.
for i, item in enumerate((response.get("items") or [])[:10]):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Get profile name from search result; skip titles the name pattern cannot parse
    name_match = re.search(r'\w+\s(\w+)?', item.get("htmlTitle", ""))
    if name_match is None:
        continue
    profile_name = name_match.group()

    # Get similarity between found profile name and channel name
    # This is to prevent false positives in finding a LinkedIn profile
    similarity = round(jaro_similarity(channel_name, profile_name), 2)

    # If n% similar, consider LinkedIn profile as found
    if similarity >= 0.80:
        linkedIn = True
        break

if linkedIn:
    print(str(linkedIn) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + profile_name + " @ " + item.get("link"))
else:
    print("No LinkedIn profile found.")

---

Finding a Wiki page

In [None]:
# Query string for the Wikipedia lookup (query_tail[1] is " Wiki").
query = channel_name + query_tail[1]

In [None]:
# Run the current query through the Custom Search engine and keep the response.
# Each execution is one API request.
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
pattern = r'https:\/\/\w{2}.wikipedia\.org\/wiki\/.+'

wiki = False
# Walk the results that actually came back (at most ten): the response may
# hold fewer than ten items, or no "items" key at all when nothing matched.
for i, item in enumerate((response.get("items") or [])[:10]):
    if re.search(pattern, item.get("formattedUrl", "")) is None:
        continue

    # Get Wiki page name from search result; skip titles without the " - Wikipedia" suffix
    title = item.get("title", "")
    name_match = re.search(r'.+(?=\s-\sWikipedia)', title)
    if name_match is None:
        continue
    page_name = name_match.group()

    # Get similarity between found Wiki page name and channel name
    # This is to prevent false positives in finding a Wiki page
    similarity = round(jaro_similarity(channel_name, page_name), 2)

    # If n% similar, consider Wiki page as found
    if similarity >= 0.80:
        wiki = True
        break

if wiki:
    print(str(wiki) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: "+ title + " @ " + item.get("link"))
else:
    print("No Wiki page found.")

---

Finding a website

In [None]:
# Query string for the official-website lookup (query_tail[2] is " Official Website").
query = channel_name + query_tail[2]

In [None]:
# Run the current query through the Custom Search engine and keep the response.
# Each execution is one API request.
response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
website = False

# RegEx to exclude YouTube, LinkedIn, and Wikipedia pages (loop-invariant, so defined once)
pattern = r'https\:\/\/(\w{2}.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

# Walk the results that actually came back (at most ten): the response may
# hold fewer than ten items, or no "items" key at all when nothing matched.
for i, item in enumerate((response.get("items") or [])[:10]):
    title = item.get("title", "")
    if channel_name.lower() in title.lower():
        link = item.get("link", "")
        if re.search(pattern, link) is None:
            # The first result among the filtered at this point is MOST LIKELY the official website
            website = True
            break

if website:
    print(str(website) + ", at index [" + str(i) + "]")
    print("Link found: " + title + " @ " + link)
else:
    print("No official website found.")

---

Finding social media presence <br>
Limited to these social media sites: <br>
- Facebook
- Twitter

In [None]:
# Query string for the Facebook lookup (query_tail[3] is " Facebook").
query = channel_name + query_tail[3]

In [None]:
# Run the Facebook query; kept under its own name because the Twitter
# response is fetched separately below.
fb_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# Query string for the Twitter lookup (query_tail[4] is " Twitter").
query = channel_name + query_tail[4]

In [None]:
# Run the Twitter query and keep its response for the Twitter profile check.
twitter_response = google_resource.list(
    q=query,
    cx=cseKey
).execute()

In [None]:
# Searching for a Facebook profile
facebook = False

# Profile URLs only: something after the domain, followed by a slash
pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

# Walk the results that actually came back (at most ten): the response may
# hold fewer than ten items, or no "items" key at all when nothing matched.
for i, item in enumerate((fb_response.get("items") or [])[:10]):
    link = item.get("formattedUrl", "")
    if re.search(pattern, link) is not None:
        title = item.get("title", "")
        # Compare the result title with the channel name to avoid false positives
        similarity = round(jaro_similarity(channel_name, title), 2)

        if similarity >= 0.80:
            facebook = True
            break

if facebook:
    print(str(facebook) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    print("Link found: " + title + " @ " + link)
else:
    print("No Facebook profile found.")

In [None]:
# Searching for a Twitter profile
twitter = False

# Accept both the twitter.com and x.com domains
pattern = r'https\:\/\/(twitter|x)\.com\/.+'

# Walk the results that actually came back (at most ten): the response may
# hold fewer than ten items, or no "items" key at all when nothing matched.
for i, item in enumerate((twitter_response.get("items") or [])[:10]):
    link = item.get("formattedUrl", "")
    if re.search(pattern, link) is not None:
        title = item.get("title", "")
        # Compare the result title with the channel name to avoid false positives
        similarity = round(jaro_similarity(channel_name, title), 2)

        if similarity >= 0.80:
            twitter = True
            break

if twitter:
    print(str(twitter) + ", at index [" + str(i) + "] with " + str(similarity * 100) + "% similarity.")
    # The stray regex call that was pasted after this statement (a syntax error) is removed.
    print("Link found: " + title + " @ " + link)
else:
    print("No Twitter profile found.")

---

Compiling everything

In [None]:
# Functions
def find_linkedIn(channel_name, query):
    """Search Google for a LinkedIn profile whose name resembles ``channel_name``.

    Args:
        channel_name: Channel name the profile name is compared against.
        query: Search string sent to the Custom Search engine.

    Returns:
        ``(True, url)`` with the ``formattedUrl`` of the first accepted
        result, otherwise ``(False, None)``. An ``HttpError`` raised by the
        request is treated as "not found".
    """
    pattern = r'https:\/\/www\.linkedin\.com\/(company|in)\/.+'  # Used to find specific profile links

    try:
        li_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError:
        # Best effort: a failed request counts as "nothing found"
        return False, None

    # The response may carry fewer than ten items, or no "items" key at all
    # when nothing matched, so iterate over what is actually there.
    for item in (li_response.get("items") or [])[:10]:
        link = item.get("formattedUrl", "")
        if re.search(pattern, link) is None:
            continue

        # Get profile name from search result
        match = re.search(r'\w+\s(\w+)?', item.get("title", ""))
        if match is None:
            continue
        profile_name = match.group()

        # Get similarity between found profile name and channel name
        # This is to prevent false positives in finding a LinkedIn profile
        similarity = round(jaro_similarity(channel_name.lower(), profile_name.lower()), 2)

        # If n% similar, consider LinkedIn profile as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_wiki(channel_name, query):
    """Search Google for a Wikipedia page whose title resembles ``channel_name``.

    Args:
        channel_name: Channel name the page name is compared against.
        query: Search string sent to the Custom Search engine.

    Returns:
        ``(True, url)`` with the ``formattedUrl`` of the first accepted
        result, otherwise ``(False, None)``. An ``HttpError`` raised by the
        request is treated as "not found".
    """
    pattern = r'https:\/\/\w{2}.wikipedia\.org\/wiki\/.+'

    try:
        wiki_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError:
        # Best effort: a failed request counts as "nothing found"
        return False, None

    # The response may carry fewer than ten items, or no "items" key at all
    # when nothing matched, so iterate over what is actually there.
    for item in (wiki_response.get("items") or [])[:10]:
        link = item.get("formattedUrl", "")
        if re.search(pattern, link) is None:
            continue

        # Get Wiki page name from search result (the part before " - Wikipedia")
        title = item.get("title", "")
        match = re.search(r'.+(?=\s-\sWikipedia)', title)
        if match is None:
            continue
        page_name = match.group()

        # Get similarity between found Wiki page name and channel name
        # This is to prevent false positives in finding a Wiki page
        similarity = round(jaro_similarity(channel_name.lower(), page_name.lower()), 2)

        # If n% similar, consider Wiki page as found
        if similarity >= 0.80:
            return True, link

    return False, None


def find_website(channel_name, query):
    """Search Google for a page that is most likely the channel's official website.

    A result is accepted when its title contains ``channel_name``
    (case-insensitive) and its link is not a YouTube, LinkedIn or Wikipedia
    page.

    Args:
        channel_name: Channel name expected to appear in the result title.
        query: Search string sent to the Custom Search engine.

    Returns:
        ``(True, url)`` with the ``link`` of the first accepted result,
        otherwise ``(False, None)``. An ``HttpError`` raised by the request
        is treated as "not found".
    """
    # RegEx to exclude YouTube, LinkedIn, and Wikipedia pages
    pattern = r'https\:\/\/(\w{2}.wikipedia\.org\/wiki\/.+|www\.(youtube\.com.+|linkedin\.com.+))'

    try:
        website_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError:
        # Best effort: a failed request counts as "nothing found"
        return False, None

    wanted = channel_name.lower()
    # The response may carry fewer than ten items, or no "items" key at all
    # when nothing matched, so iterate over what is actually there.
    for item in (website_response.get("items") or [])[:10]:
        title = item.get("title", "")
        link = item.get("link", "")
        if wanted in title.lower() and re.search(pattern, link) is None:
            # The first result among the filtered at this point is MOST LIKELY the official website
            return True, link

    return False, None


def find_fb(channel_name, query):
    """Search Google for a Facebook profile whose title resembles ``channel_name``.

    Args:
        channel_name: Channel name the result title is compared against.
        query: Search string sent to the Custom Search engine.

    Returns:
        ``(True, url)`` with the ``formattedUrl`` of the first accepted
        result, otherwise ``(False, None)``. An ``HttpError`` raised by the
        request is treated as "not found".
    """
    pattern = r'^https\:\/\/www\.facebook\.com\/.+\/'

    try:
        fb_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError:
        # Best effort: a failed request counts as "nothing found"
        return False, None

    # The response may carry fewer than ten items, or no "items" key at all
    # when nothing matched, so iterate over what is actually there.
    for item in (fb_response.get("items") or [])[:10]:
        link = item.get("formattedUrl", "")
        if re.search(pattern, link) is None:
            continue

        # Compare the result title with the channel name to avoid false positives
        title = item.get("title", "")
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def find_twitter(channel_name, query):
    """Search Google for a Twitter/X profile whose title resembles ``channel_name``.

    Args:
        channel_name: Channel name the result title is compared against.
        query: Search string sent to the Custom Search engine.

    Returns:
        ``(True, url)`` with the ``formattedUrl`` of the first accepted
        result, otherwise ``(False, None)``. An ``HttpError`` raised by the
        request is treated as "not found".
    """
    pattern = r'https\:\/\/(twitter|x)\.com\/.+'

    try:
        twitter_response = google_resource.list(
            q=query,
            cx=cseKey
        ).execute()
    except HttpError:
        # Best effort: a failed request counts as "nothing found"
        return False, None

    # The response may carry fewer than ten items, or no "items" key at all
    # when nothing matched, so iterate over what is actually there.
    for item in (twitter_response.get("items") or [])[:10]:
        link = item.get("formattedUrl", "")
        if re.search(pattern, link) is None:
            continue

        # Compare the result title with the channel name to avoid false positives
        title = item.get("title", "")
        similarity = round(jaro_similarity(channel_name.lower(), title.lower()), 2)

        if similarity >= 0.80:
            return True, link

    return False, None


def check_desc(channel_name, videos_df, pattern):
    """Look for ``pattern`` in the descriptions of a channel's first five videos.

    Args:
        channel_name: Value matched against the ``channel_name`` column.
        videos_df: DataFrame with ``channel_name`` and ``description`` columns.
        pattern: Regular expression searched for in each description.

    Returns:
        ``(True, matched_text)`` for the first description that matches,
        otherwise ``(False, None)``.
    """
    # Rows of this channel, renumbered from zero, capped at five by head()
    is_channel = videos_df["channel_name"] == channel_name
    candidates = videos_df.loc[is_channel].reset_index().drop("index", axis=1).head()

    for row in range(candidates.shape[0]):
        raw_description = candidates.iloc[row]["description"]
        # repr() renders real newlines as the two characters "\n"; those and
        # double spaces are then replaced by single spaces
        flattened = repr(raw_description).replace("\\n", " ").replace("  ", " ")

        hit = re.search(pattern, flattened)
        if hit is not None:
            return (True, hit.group())

    return (False, None)


def find_sources(channel_names, channel_IDs, main_df):
    """Score each channel on how verifiable its online presence is and save the results.

    For every channel the first five video descriptions are scanned for
    LinkedIn, website, Facebook and Twitter links (``check_desc``). Any link
    not found there is looked up through Google (``find_linkedIn``,
    ``find_website``, ``find_fb``, ``find_twitter``). The Wikipedia page is
    always looked up through Google (``find_wiki``).

    Scores per channel:
        profiles: 3 = LinkedIn and Wiki, 2 = LinkedIn only, 1 = Wiki only, 0 = neither.
        website: 2 if a website was found, else 0.
        social_media_presence: 1 if Facebook or Twitter was found, else 0.
        vs: left as NaN here.

    Args:
        channel_names: Sized iterable of channel names to process.
        channel_IDs: Mapping of channel name to channel id (read with ``.get``).
        main_df: Video DataFrame handed to ``check_desc``.

    Returns:
        None. Writes ``source_scores.csv`` and ``source_links.csv`` to the
        current working directory and prints ``Complete.``.
    """
    ss_cols = [
        "channel_id", "channel_name",
        "profiles", "website", "social_media_presence",
        "vs"
    ]
    sl_cols = [
        "channel_id", "channel_name",
        "LinkedIn", "Wiki", "Website",
        "Twitter", "Facebook"
    ]

    query_tail = [
        " LinkedIn",
        " Wiki",
        " Official Website",
        " Facebook",
        " Twitter"
    ]

    # --- Patterns to search for links within video descriptions
    linkedIn_pattern = r"(?<=(Linked(in|In)\:\s))https:\/\/(www\.)?linkedin\.com\/(company|in)\/(\w|\w[-_])+\/"
    website_pattern = r"(?<=(W|w)ebsite((\:)?\s|\sat\s))https:\/\/\w+(\.(\w|\w[-_])+)?\.\w{3}(\.\w{2})?(\/(\w|\w[-_])+)?"
    fb_pattern = r"(?<=((F|f)acebook\:\s))https:\/\/(www\.)?facebook\.com\/(\w|\w[-_])+"
    twitter_pattern = r"(?<=((T|t)witter\:\s))https:\/\/(www\.)?(twitter|x)\.com\/(\w|\w[-_])+"
    # ---

    source_scores = []
    source_links = []

    # The context manager closes the progress bar even if a lookup raises
    with tqdm(total=len(channel_names)) as pbar:
        pbar.set_description("Finding sources...")

        for channel_name in channel_names:
            # --- Checking descriptions from channel's videos
            linkedIn_found = check_desc(channel_name, main_df, linkedIn_pattern)
            site_found = check_desc(channel_name, main_df, website_pattern)
            fb_found = check_desc(channel_name, main_df, fb_pattern)
            twitter_found = check_desc(channel_name, main_df, twitter_pattern)
            # ---

            # --- If link not found in descriptions, search via Google
            if not linkedIn_found[0]:
                linkedIn_found = find_linkedIn(channel_name, channel_name + query_tail[0])

            if not site_found[0]:
                site_found = find_website(channel_name, channel_name + query_tail[2])

            if not fb_found[0]:
                fb_found = find_fb(channel_name, channel_name + query_tail[3])

            if not twitter_found[0]:
                twitter_found = find_twitter(channel_name, channel_name + query_tail[4])

            # Wiki pages are only ever looked up through Google
            wiki_found = find_wiki(channel_name, channel_name + query_tail[1])
            # ---

            if linkedIn_found[0]:
                profiles = 3 if wiki_found[0] else 2
            elif wiki_found[0]:
                profiles = 1
            else:
                profiles = 0

            website = 2 if site_found[0] else 0
            social_media_presence = 1 if (fb_found[0] or twitter_found[0]) else 0

            channel_id = channel_IDs.get(channel_name)

            # Source scores ---
            source_scores.append([
                channel_id,  # channel_id
                channel_name,  # channel_name
                profiles,  # profiles
                website,  # website
                social_media_presence,  # social_media_presence
                np.nan  # vs
            ])
            # ---

            # Source links ---
            source_links.append([
                channel_id,
                channel_name,
                linkedIn_found[1],
                wiki_found[1],
                site_found[1],
                twitter_found[1] if twitter_found[0] else None,
                fb_found[1] if fb_found[0] else None
            ])
            # ---

            pbar.update(1)

    # Build the frames straight from the record lists. Going through
    # np.array would coerce the numeric scores to strings and fails when
    # there are no records at all.
    ss_df = pd.DataFrame(source_scores, columns=ss_cols)
    sl_df = pd.DataFrame(source_links, columns=sl_cols)

    ss_df.to_csv("source_scores.csv")
    sl_df.to_csv("source_links.csv")

    print("Complete.")


def topsis(scores, weights):
    """Rank rows with TOPSIS (closeness to the ideal best alternative).

    Each column named in ``weights`` is divided by its Euclidean norm and
    multiplied by its weight. The ideal best/worst are the per-column
    maximum/minimum of the weighted values, and each row is scored as
    ``d_worst / (d_best + d_worst)``.

    Args:
        scores: DataFrame holding a numeric column for every key of ``weights``.
        weights: Mapping of column name to weight.

    Returns:
        ``pd.Series`` of scores in row order, indexed 0..n-1. A column whose
        values are all zero contributes zeros instead of NaN, and a row with
        zero distance to both ideals (all rows identical) scores 0.0.
    """
    columns = list(weights.keys())
    matrix = scores[columns].to_numpy(dtype=float)  # shape (rows, criteria)

    # Per-column Euclidean norm, used as the normalisation denominator
    norms = np.sqrt((matrix ** 2).sum(axis=0))
    for column, norm in zip(columns, norms):
        print(f"{column}: {norm}")

    if matrix.shape[0] == 0:
        return pd.Series(np.array([], dtype=float))

    # An all-zero column has norm 0; divide by 1 instead so it stays all zeros
    safe_norms = np.where(norms == 0, 1.0, norms)
    weight_row = np.array([weights[column] for column in columns], dtype=float)

    # Normalize scores, then apply weight
    weighted = matrix / safe_norms * weight_row

    ideal_best = weighted.max(axis=0)
    ideal_worst = weighted.min(axis=0)

    # Euclidean distance from ideal best / ideal worst
    dist_from_best = np.sqrt(((weighted - ideal_best) ** 2).sum(axis=1))
    dist_from_worst = np.sqrt(((weighted - ideal_worst) ** 2).sum(axis=1))

    # Guard the 0/0 case (row coincides with both ideals)
    total = dist_from_best + dist_from_worst
    performance_rank = np.divide(
        dist_from_worst, total, out=np.zeros_like(total), where=total != 0
    )

    return pd.Series(performance_rank)

---

Testing find_sources (WARNING: Be mindful of the daily quota for Custom Search API queries)

In [None]:
# df already had "Unnamed: 0" dropped when it was loaded, so a plain drop
# would raise KeyError on a fresh run; errors="ignore" makes this cell safe
# either way and still returns a new frame.
main_df = df.drop("Unnamed: 0", axis=1, errors="ignore")
channel_names = main_df["channel_name"].unique()
# channel name -> channel id (first id seen for each name)
channel_IDs = main_df[["channel_id", "channel_name"]].groupby("channel_name").first().to_dict().get("channel_id")

In [None]:
# Runs the full lookup for every channel; the find_* helpers it calls each
# issue a Custom Search request. Writes source_scores.csv and source_links.csv.
find_sources(channel_names, channel_IDs, main_df)

---

Testing TOPSIS Algorithm - Getting video rank

In [None]:
from math import sqrt

In [None]:
# Load the saved per-channel source scores, dropping the "Unnamed: 0" column in the same chain.
ss_df = pd.read_csv("datasets/covid_philippines/source_scores.csv").drop("Unnamed: 0", axis=1)
ss_df.head()

In [None]:
# Summary statistics of the score columns, transposed so each score is a row.
ss_df[["profiles", "website", "social_media_presence", "vs"]].describe().T

In [None]:
# df already had "Unnamed: 0" dropped when it was loaded, so a plain drop
# would raise KeyError on a fresh run; errors="ignore" makes this cell safe
# either way and still returns a new frame.
main_df = df.drop("Unnamed: 0", axis=1, errors="ignore")
main_df.head()

In [None]:
# channel_id -> {"profiles": ..., "website": ..., "social_media_presence": ...}
# Built by pairing each channel id with its score record, so it does not
# depend on the frame's index labels. A repeated channel_id keeps its last row.
score_columns = ["profiles", "website", "social_media_presence"]
ss_dict = dict(zip(
    ss_df["channel_id"].tolist(),
    ss_df[score_columns].to_dict(orient="records"),
))

In [None]:
# Attach each video's channel scores as new columns. Channels without an
# entry in ss_dict get 0.0 for every score.
score_columns = ["profiles", "website", "social_media_presence"]
channel_scores = [ss_dict.get(channel_id) for channel_id in main_df["channel_id"]]

# assign() returns a new frame, so re-running this cell gives the same result
main_df = main_df.assign(**{
    column: [0.0 if entry is None else entry.get(column) for entry in channel_scores]
    for column in score_columns
})
main_df.head()

In [None]:
# TOPSIS weights per criterion for ranking videos; the values sum to 1.0.
weights = {
    "profiles": 0.40,
    "website": 0.25,
    "social_media_presence": 0.10,
    "view_count": 0.05,
    "like_count": 0.05,
    "comment_count": 0.05,
    "sub_count": 0.05,
    "total_videos": 0.05,
}

In [None]:
# Score every video with topsis() using `weights`, then show the five highest-scoring rows.
main_df["rank"] = topsis(main_df, weights)
main_df.sort_values(by="rank", ascending=False).head()

---

In [None]:
# TOPSIS weights per criterion for the small test below; the values sum to 1.0.
weights = {
    "profiles": 0.40,
    "website": 0.25,
    "social_media_presence": 0.10,
    "view_count": 0.05,
    "like_count": 0.05,
    "comment_count": 0.05,
    "sub_count": 0.05,
    "total_videos": 0.05,
}

In [None]:
# Small sample (first five videos) of the columns TOPSIS needs.
# .copy() makes test_df an independent frame so a column can be added to it
# later without writing into a slice of main_df (SettingWithCopyWarning).
test_df = main_df[[
    "video_id", "channel_name", "profiles", "website",
    "social_media_presence", "view_count", "like_count",
    "comment_count", "sub_count", "total_videos"]].head().copy()
test_df

In [None]:
# Score the sample rows with topsis() and display them with the new "rank" column.
test_df["rank"] = topsis(test_df, weights)
test_df

In [None]:
# Weights restricted to the three source criteria; the values sum to 1.0.
weights = {
    "profiles": 0.50,
    "website": 0.35,
    "social_media_presence": 0.15
}

In [None]:
# First five channels with their source scores. .copy() makes test_df an
# independent frame, so adding "vs" does not write into a slice of ss_df
# (SettingWithCopyWarning).
test_df = ss_df[["channel_name", "profiles", "website", "social_media_presence"]].head().copy()
test_df["vs"] = topsis(test_df, weights)
test_df

In [None]:
# Load the saved per-channel source links, dropping the "Unnamed: 0" column in the same chain.
sl_df = pd.read_csv("datasets/covid_philippines/source_links.csv").drop("Unnamed: 0", axis=1)
sl_df.head()

In [None]:
# Spot check: Jaro similarity between two name strings, rounded to two decimals.
similarity = round(jaro_similarity("ANC 24/7", "ABS-CBN News Channel"), 2)
similarity

---

Checking video descriptions to look for profile links

In [None]:
# df already had "Unnamed: 0" dropped when it was loaded, so a plain drop
# would raise KeyError on a fresh run; errors="ignore" makes this cell safe
# either way and still returns a new frame.
main_df = df.drop("Unnamed: 0", axis=1, errors="ignore")
channel_names = main_df["channel_name"].unique()
# channel name -> channel id (first id seen for each name)
channel_IDs = main_df[["channel_id", "channel_name"]].groupby("channel_name").first().to_dict().get("channel_id")

In [None]:
# Patterns for links in video descriptions. Each one uses a lookbehind, so the
# URL is only matched when it follows a label such as "Twitter: ".
linkedIn_pattern = r"(?<=(Linked(in|In)\:\s))https:\/\/(www\.)?linkedin\.com\/(company|in)\/(\w|\w[-_])+\/"
website_pattern = r"(?<=(W|w)ebsite((\:)?\s|\sat\s))https:\/\/\w+(\.(\w|\w[-_])+)?\.\w{3}(\.\w{2})?(\/(\w|\w[-_])+)?"
fb_pattern = r"(?<=((F|f)acebook\:\s))https:\/\/(www\.)?facebook\.com\/(\w|\w[-_])+"
twitter_pattern = r"(?<=((T|t)witter\:\s))https:\/\/(www\.)?(twitter|x)\.com\/(\w|\w[-_])+"

# For every channel, report which link types were found in its video
# descriptions (check_desc returns a (found, link) pair).
for channel_name in channel_names:
    print(f"[{channel_name}]")
    
    linkedIn_found = check_desc(channel_name, main_df, linkedIn_pattern)
    website_found = check_desc(channel_name, main_df, website_pattern)
    fb_found = check_desc(channel_name, main_df, fb_pattern)
    twitter_found = check_desc(channel_name, main_df, twitter_pattern)
    
    print(f"Website: {str(website_found[0])} @ {website_found[1]}")
    print(f"LinkedIn: {str(linkedIn_found[0])} @ {linkedIn_found[1]}")
    print(f"Facebook: {str(fb_found[0])} @ {fb_found[1]}")
    print(f"Twitter: {str(twitter_found[0])} @ {twitter_found[1]}")
    print()

---

In [None]:
from bs4 import BeautifulSoup
import requests
import json

In [None]:
def check_about_links(pattern, links):
    """Find the first link whose URL matches ``pattern`` and remove it from ``links``.

    Args:
        pattern: Regular expression searched for in each entry's URL.
        links: List of ``[title, url]`` entries; modified in place on a match.

    Returns:
        ``((True, matched_text), links)`` with the matching entry removed, or
        ``((False, None), links)`` when no URL matches.
    """
    for position, entry in enumerate(links):
        hit = re.search(pattern, entry[1])
        if hit is None:
            continue

        # Drop the matched entry so later checks do not see it again
        links.pop(position)
        return (True, hit.group()), links

    return (False, None), links

In [None]:
# --- Patterns to search for links within About sections in channel pages
about_website_pattern = r"(W|w)ebsite"
about_fb_pattern = r"facebook\.com\/.+"
about_linkedIn_pattern = r"linkedin\.com\/(company|in)\/.+"
about_twitter_pattern = r"twitter\.com\/.+"
# ---

# For every channel: fetch its About page, pull the external links out of the
# embedded ytInitialData JSON, and classify them.
for row in range(0, unique_channels.shape[0]):
    channel_id = channel_df.iloc[row]["channel_id"]
    channel_name = channel_df.iloc[row]["channel_name"]

    about_page = requests.get(f'https://www.youtube.com/channel/{channel_id}/about')
    soup = BeautifulSoup(about_page.content, 'html.parser')
    script_tags = soup.find_all("script")

    for script in script_tags:
        results = re.search(r"var ytInitialData = {.*}", script.text)
        if results is None:
            continue

        # Strip the JavaScript assignment so only the JSON object remains
        initial_data = results.group(0).replace("var ytInitialData = ", "")
        try:
            link_information = (json.loads(initial_data)['onResponseReceivedEndpoints'][0]
                                ['showEngagementPanelEndpoint']
                                ['engagementPanel']
                                ['engagementPanelSectionListRenderer']
                                ['content']
                                ['sectionListRenderer']
                                ['contents'][0]
                                ['itemSectionRenderer']
                                ['contents'][0]
                                ['aboutChannelRenderer']
                                ['metadata']
                                ['aboutChannelViewModel']
                                ['links']
                                )
        except (KeyError, IndexError, TypeError, ValueError):
            # Only the "path not present" / "not valid JSON" failures are
            # treated as "no links"; anything else should surface.
            print("No links provided for", channel_name)
        else:
            # Collect [title, url] pairs for all links from the about modal
            links = []
            for link in link_information:
                link_title = link['channelExternalLinkViewModel']['title']['content']
                url = link['channelExternalLinkViewModel']['link']['content']
                links.append([link_title, url])

            print(links)

            # Each check removes the link it matched from `links`
            fb_found, links = check_about_links(about_fb_pattern, links)
            twitter_found, links = check_about_links(about_twitter_pattern, links)
            linkedIn_found, links = check_about_links(about_linkedIn_pattern, links)

            site_found = (False, None)

            # First pass: a remaining link whose title mentions "website"
            for link_title, url in links:
                if re.search(about_website_pattern, link_title) is not None:
                    site_found = (True, url)
                    break

            # Second pass: a non-YouTube link whose title resembles the channel
            # name. A hit here replaces the first-pass result.
            for link_title, url in links:
                similarity = round(jaro_similarity(channel_name, link_title), 2)
                if similarity >= 0.60:
                    if re.search(r"youtube\.com\/.+", url) is None:
                        site_found = (True, url)
                        break

            print(fb_found)
            print(twitter_found)
            print(linkedIn_found)
            print(site_found)

        print("---------------------------------------")

---

In [None]:
# Load datasets/<filename>/source_check.csv and drop the "Unnamed: 0" column.
path = "datasets/" + filename + "/source_check.csv"
sc_df = pd.read_csv(path).drop("Unnamed: 0", axis=1)
sc_df.head()

In [None]:
# Load datasets/<filename>/source_links.csv and drop the "Unnamed: 0" column.
path = "datasets/" + filename + "/source_links.csv"
sl_df = pd.read_csv(path).drop("Unnamed: 0", axis=1)
sl_df.head()

In [None]:
# Inspect the source_check rows for one specific channel.
sc_df.loc[sc_df["channel_name"] == "The Straits Times"]