In [85]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from typing import Sequence

from transformers import BertTokenizer, BertForSequenceClassification
import torch

from scraping import (
    parse_podcats
)
from nlp_utils import (
    hash_str
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Podcast index page; it is downloaded and handed to parse_podcats further down.
url_podcasts = "https://lexfridman.com/podcast"

Retrieve the podcasts.

In [4]:
# Download the podcast index page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() prevents an HTTP error page from
# being parsed as if it were the episode list.
res = requests.get(url_podcasts, timeout=30)
res.raise_for_status()
html_content = res.content
soup_podcasts = BeautifulSoup(html_content, 'html.parser')

# parse_podcats comes from the local `scraping` module; the items it returns
# are only used through their `title` attribute here.
podcasts = parse_podcats(soup_podcasts)
titles = [p.title for p in podcasts]

First attempt at classifying the podcasts based on their title.

In [5]:
# Checkpoint used for this first attempt (the alternative below is kept for reference).
model_name = "bert-base-uncased" 
# model_name = "distilbert-base-uncased" 

tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: the load warning printed under this cell reports that classifier.weight
# and classifier.bias are newly initialized, i.e. the classification head has
# not been trained at this point.
model = BertForSequenceClassification.from_pretrained(model_name)
# Position i in this list is used as the display name of predicted class i.
labels = ['related to AI', 'not related to AI']

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Classify the first ten titles, one title per forward pass.
# This is inference only, so the loop runs under torch.no_grad() to avoid
# building an autograd graph for every forward pass.
with torch.no_grad():
    for title in titles[:10]:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        logits = outputs.logits

        # Turn the logits into class probabilities and keep the most likely class.
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

        label = labels[predicted_class]

        print(f'{title}\n{label}')

Politics, Family, Real Estate, Fashion, Music, and Life
not related to AI
Focus, Controversy, Politics, and Relationships
not related to AI
Perplexity CEO on Future of AI, Search & the Internet
not related to AI
Physics of Life, Time, Complexity, and Aliens
not related to AI
Power, Controversy, Betrayal, Truth & Love in Film and Life
not related to AI
Dangers of Superintelligent AI
not related to AI
Human Memory, Imagination, Deja Vu, and False Memories
not related to AI
Jungle, Apex Predators, Aliens, Uncontacted Tribes, and God
not related to AI
General Relativity, Quantum Mechanics, Black Holes & Aliens
not related to AI
Judo, Olympics, Winning, Losing, and the Champion Mindset
not related to AI


**Plan of action:**

- Fine-tune DistilBERT.
    - Create a training set.
        - Use the guest to label past titles.
        - Use translation and synonyms for data augmentation.
        - Scrape some additional data.
    - Do the fine-tuning.
- Test it on a validation set.  

#### Training dataset creation

In [68]:
# One row per item returned by parse_podcats.
df_podcast = pd.DataFrame(podcasts)

# Text columns that get normalised: ASCII transliteration, then lower case.
# The guest and keyword lists further down are normalised the same way so that
# plain string comparisons line up.
cols_to_pp = ["guest", "title"]

for col in cols_to_pp:
    # Series.map applies unidecode element-wise; no np.vectorize/lambda wrapper needed.
    df_podcast[col] = df_podcast[col].map(unidecode).str.lower()

# The id is derived from the row content (guest + title) through hash_str
# rather than from the row position.
# NOTE(review): presumably hash_str is stable across runs, since ids are
# hard-coded later for manual labelling; confirm in nlp_utils.
hash_str_vec = np.vectorize(hash_str)
df_podcast.insert(0, 'id', hash_str_vec(df_podcast.guest + df_podcast.title))

# Every row must end up with its own id, otherwise id-based labelling is ambiguous.
assert df_podcast.id.nunique() == len(df_podcast), "colision in the hashing to create unique id!"

The positive titles will be:
- titles of podcasts whose guest is an AI/ML/DS person I am interested in.

or

- titles with ML terms that I will validate by hand.

In [73]:
# Guests whose episodes count as positives whatever the title says.
# Each name is listed once ("Max Tegmark" used to appear twice).
data_science_guests = [
    "Aravind Srinivas",
    "Andrej Karpathy",
    "Sam Altman",
    "Yann LeCun",
    "Joscha Bach",
    "Max Tegmark",
    "Noam Brown",
    "Rana el Kaliouby",
    "Ray Kurzweil",
    "Oriol Vinyals",
    "Demis Hassabis",
    "Travis Oliphant",
    "Jay McClelland",
    "Douglas Lenat",
    "Wojciech Zaremba",
    "Ishan Misra",
    "Risto Miikkulainen",
    "Dan Kokotov",
    "Michael Littman",
    "Charles Isbell",
    "François Chollet",
    "Dileep George",
    "Jitendra Malik",
    "Sergey Levine",
    "Matt Botvinick",
    "Ben Goertzel",
    "Dawn Song",
    "Ilya Sutskever",
    "Daphne Koller",
    "David Silver",
    "Marcus Hutter",
    "Michael I. Jordan",
    "Andrew Ng",
    "Gary Marcus",
    "Peter Norvig",
    "Regina Barzilay",
    "Jeremy Howard",
    "Rajat Monga",
    "Ian Goodfellow",
    "Greg Brockman",
    "Tomaso Poggio",
    "Juergen Schmidhuber",
    "Pieter Abbeel",
    "Stuart Russell",
    "Yoshua Bengio",
    "Vladimir Vapnik",
]
# Same normalisation as the `guest` column: ASCII transliteration + lower case.
data_science_guests_pp = [unidecode(guest).lower() for guest in data_science_guests]

# Terms that flag a title as a candidate for manual validation.
words_to_check = [
    "Neural Nets",
    "neural networks",
    "Deep Learning",
    "Machine learning",
    "Reinforcement Learning",
    "Data science",
    "AI",
    "AGI",
    "artificial intelligence"
]
# Same normalisation as the `title` column.
words_to_check_pp = [unidecode(word).lower() for word in words_to_check]

Manual validation based on regex.

In [140]:
def is_guest_relevant(guest_candidate: str, data_science_guests: Sequence[str]) -> bool:
    """
    Tell whether `guest_candidate` mentions at least one known data science guest.

    The comparison is a plain substring test rather than an equality check,
    because a single `guest_candidate` string may list several guests.
    Returns False when `data_science_guests` is empty.
    """
    for known_guest in data_science_guests:
        if known_guest in guest_candidate:
            return True
    return False

# Element-wise wrapper around is_guest_relevant.
# The guest list must be passed whole to every call instead of being broadcast
# against the guest column. It is therefore excluded both by position (1) and
# by keyword name, so the wrapper behaves the same for either call style;
# excluding the name alone has no effect on a positional call.
# `otypes` fixes the output dtype up front, which also lets the wrapper accept
# an empty input.
is_guest_relevant_vec = np.vectorize(
    is_guest_relevant,
    otypes=[bool],
    excluded=[1, 'data_science_guests'],
)
df_podcast["has_relevant_guest"] = is_guest_relevant_vec(df_podcast.guest, data_science_guests_pp).astype(int)

# A keyword only counts when it is not glued to other letters: it must be
# preceded by a non-letter or the start of the title, and followed by a
# non-letter or the end of the title (so "ai" does not match inside "brain").
re_not_character_or_beginning = r'(?:[^a-z]|^)'
re_not_character_or_end = r'(?:[^a-z]|$)'

# One delimited alternative per keyword, OR-ed together.
pattern = "|".join(
    f"{re_not_character_or_beginning}{word}{re_not_character_or_end}"
    for word in words_to_check_pp
)
df_podcast["has_ml_word"] = df_podcast.title.str.contains(pattern, case=False, regex=True).astype(int)

In [141]:
# Page through the candidates `increment` rows at a time; bump `batch` and
# re-run the cell to see the next page.
batch = 0
increment = 9

# Candidates for manual review: the title contains an ML term but the guest is
# not in the curated guest list.
df_to_check = df_podcast.loc[
    (df_podcast.has_relevant_guest == 0) & (df_podcast.has_ml_word == 1)
]

batch_rows = df_to_check.loc[:, ["id", "guest", "title"]].iloc[batch * increment:(batch + 1) * increment]

# The loop variable is named `podcast_id` so that the builtin `id` is not
# overwritten in the kernel namespace.
for podcast_id, guest, title in batch_rows.itertuples(index=False, name=None):
    print(f"{podcast_id}: {guest}\n{title}\n")

98195452: roman yampolskiy
dangers of superintelligent ai

86440844: guillaume verdon
beff jezos, e/acc movement, physics, computation & agi

68071720: elon musk
war, ai, aliens, politics, physics, video games, and humanity

37124594: george hotz
tiny corp, twitter, ai safety, self-driving, and god

60879511: marc andreessen
future of the internet, technology, and ai

99072863: mark zuckerberg
future of ai at meta, facebook, instagram, and whatsapp

41763735: chris lattner
future of programming and ai

68276965: manolis kellis
evolution of human civilization and superintelligent ai

62663336: eliezer yudkowsky
dangers of ai and the end of human civilization



In [144]:
# Ids of the podcasts accepted as positives by hand.
manual_positive_id = [
    98195452, 41763735, 68276965,
    59789751, 60449962, 93271390,
    79695739, 31945141, 58779362,
]

# 1 when the row id is one of the hand-picked ids, 0 otherwise.
df_podcast["manual_label"] = df_podcast["id"].isin(manual_positive_id).astype(int)

# Final label: positive as soon as either the curated-guest flag or the manual
# flag is set.
df_podcast["label"] = (
    df_podcast["has_relevant_guest"] | df_podcast["manual_label"]
).astype(int)

In [145]:
# Spot-check ten labelled rows; the fixed seed keeps the displayed sample
# identical across re-runs of the notebook.
df_podcast.sample(10, random_state=0)

Unnamed: 0,id,guest,title,url_transcript,has_ml_word,has_relevant_guest,manual_label,label
381,26277046,ray dalio,principles,,0,0,0,0
281,12126081,max tegmark,ai and physics,,1,1,0,1
118,73511281,nick lane,"origin of life, evolution, aliens, and biology",,0,0,0,0
335,80058926,joscha bach,artificial consciousness,,0,1,0,1
211,29278313,jeffrey shainline,optoelectronic intelligence,,0,0,0,0
44,76164777,joscha bach,"life, intelligence, consciousness, ai & the fu...",https://lexfridman.com/joscha-bach-3-transcript,1,1,0,1
279,39097670,natalya bailey,rocket engines and electric spacecraft propulsion,,0,0,0,0
301,99900770,charles isbell,"computing, ai & race in america",,1,1,0,1
392,13010577,gary marcus,hybrid of deep learning and symbolic ai,,1,1,0,1
388,94642723,sean carroll,quantum mechanics and many-worlds,,0,0,0,0
