In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import requests
from itertools import product
from html import unescape

from unidecode import unidecode
from typing import Sequence

import translators as ts
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from bs4 import BeautifulSoup

from scraping import (
    parse_podcats
)
from nlp_utils import (
    hash_str
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
!pip install translators

In [2]:
# Page fetched below to build the list of podcasts.
url_podcasts = "https://lexfridman.com/podcast"

Retrieve the podcasts.

In [3]:
# Fetch the podcast page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the podcast listing.
res = requests.get(url_podcasts, timeout=30)
res.raise_for_status()
html_content = res.content
soup_podcasts = BeautifulSoup(html_content, 'html.parser')

# parse_podcats is the project helper imported from `scraping`; the code below
# relies on every returned item exposing a `title` attribute.
podcasts = parse_podcats(soup_podcasts)
titles = [p.title for p in podcasts]

First attempt at classifying the podcasts based on their title.

In [5]:
# Checkpoint used for both the tokenizer and the sequence-classification model.
model_name = "bert-base-uncased" 
# model_name = "distilbert-base-uncased" 

tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: as the warning printed by this cell states, `classifier.weight` and
# `classifier.bias` are not in the checkpoint and are newly initialized, so
# predictions are not meaningful until the model is trained on a downstream task.
model = BertForSequenceClassification.from_pretrained(model_name)
# Human-readable names, indexed by the predicted class id in the next cell.
labels = ['related to AI', 'not related to AI']

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Classify the first ten titles one by one and print the predicted label.
# This is pure inference: eval() makes the inference mode explicit and
# torch.no_grad() avoids building an autograd graph on every forward pass.
model.eval()

with torch.no_grad():
    for title in titles[:10]:

        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        logits = outputs.logits
        # Shape is (1, num_labels) here, so the softmax runs over the label axis.
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

        label = labels[predicted_class]

        print(f'{title}\n{label}')

Politics, Family, Real Estate, Fashion, Music, and Life
not related to AI
Focus, Controversy, Politics, and Relationships
not related to AI
Perplexity CEO on Future of AI, Search & the Internet
not related to AI
Physics of Life, Time, Complexity, and Aliens
not related to AI
Power, Controversy, Betrayal, Truth & Love in Film and Life
not related to AI
Dangers of Superintelligent AI
not related to AI
Human Memory, Imagination, Deja Vu, and False Memories
not related to AI
Jungle, Apex Predators, Aliens, Uncontacted Tribes, and God
not related to AI
General Relativity, Quantum Mechanics, Black Holes & Aliens
not related to AI
Judo, Olympics, Winning, Losing, and the Champion Mindset
not related to AI


**Plan of action:**

- Fine-tune a distilled BERT.
    - Create a training set.
        - Use the guest to label past titles.
        - Use translation and synonyms to do data augmentation.
        - Scrape some data.
    - Do the fine-tuning.
- Test it on a validation set.  

#### Training dataset creation

In [4]:
df_podcast = pd.DataFrame(podcasts)

# Text columns that get normalized: transliterated to ASCII, then lower-cased.
cols_to_pp = ["guest", "title"]

for col in cols_to_pp:
    ascii_values = df_podcast[col].map(unidecode)
    df_podcast[col] = ascii_values.str.lower()

# The id of a podcast is the hash of its normalized guest + title; it is
# inserted as the first column.
hash_str_vec = np.vectorize(hash_str)
podcast_ids = hash_str_vec(df_podcast.guest + df_podcast.title)
df_podcast.insert(0, 'id', podcast_ids)

# Two podcasts sharing an id would make the manual labelling below ambiguous.
assert df_podcast.id.nunique() == len(df_podcast), "colision in the hashing to create unique id!"

The positive titles will be:
- titles of podcasts whose guest is an AI/ML/DS person I am interested in.

or

- titles with ML terms that I will validate by hand.

In [5]:
# Guests whose podcasts are labelled as positive. Each name appears once
# ("Max Tegmark" used to be listed twice).
data_science_guests = [
    "Aravind Srinivas",
    "Andrej Karpathy",
    "Sam Altman",
    "Yann LeCun",
    "Joscha Bach",
    "Max Tegmark",
    "Noam Brown",
    "Rana el Kaliouby",
    "Ray Kurzweil",
    "Oriol Vinyals",
    "Demis Hassabis",
    "Travis Oliphant",
    "Jay McClelland",
    "Douglas Lenat",
    "Wojciech Zaremba",
    "Ishan Misra",
    "Risto Miikkulainen",
    "Dan Kokotov",
    "Michael Littman",
    "Charles Isbell",
    "François Chollet",
    "Dileep George",
    "Jitendra Malik",
    "Sergey Levine",
    "Matt Botvinick",
    "Ben Goertzel",
    "Dawn Song",
    "Ilya Sutskever",
    "Daphne Koller",
    "David Silver",
    "Marcus Hutter",
    "Michael I. Jordan",
    "Andrew Ng",
    "Gary Marcus",
    "Peter Norvig",
    "Regina Barzilay",
    "Jeremy Howard",
    "Rajat Monga",
    "Ian Goodfellow",
    "Greg Brockman",
    "Tomaso Poggio",
    "Juergen Schmidhuber",
    "Pieter Abbeel",
    "Stuart Russell",
    "Yoshua Bengio",
    "Vladimir Vapnik",
]
# Same normalization as the dataframe columns (ASCII transliteration + lower
# case) so that the names can be matched against `df_podcast.guest`.
data_science_guests_pp = [unidecode(guest).lower() for guest in data_science_guests]

# ML-related terms looked for in the titles; matches are validated by hand.
words_to_check = [
    "Neural Nets",
    "neural networks",
    "Deep Learning",
    "Machine learning",
    "Reinforcement Learning",
    "Data science",
    "AI",
    "AGI",
    "artificial intelligence"
]
words_to_check_pp = [unidecode(word).lower() for word in words_to_check]

Manual validation based on regex.

In [6]:
def is_guest_relevant(guest_candidate: str, data_science_guests: Sequence[str]) -> bool:
    """
    Tell whether `guest_candidate` mentions at least one of `data_science_guests`.

    Matching is done by substring rather than by equality because a single
    `guest_candidate` string can contain several guests.
    """
    for known_guest in data_science_guests:
        if known_guest in guest_candidate:
            return True
    return False

# Vectorized wrapper so the check can be applied to the whole `guest` column.
# The signature '(),(n)->()' declares a scalar first argument, a 1-D second
# argument, and a scalar result.
# NOTE(review): the guest list is passed positionally below, so the name-based
# `excluded` entry does not appear to take effect; the signature seems to be
# what keeps the list from being broadcast element-wise. Confirm against the
# numpy.vectorize documentation.
is_guest_relevant_vec = np.vectorize(
    is_guest_relevant,
    excluded=['data_science_guests'],
    signature='(),(n)->()'
)
# Flag stored as 0/1: 1 when the row's guest string contains a listed guest.
df_podcast["has_relevant_guest"] = is_guest_relevant_vec(df_podcast.guest, data_science_guests_pp).astype(int)

# Boundary guards: a keyword only counts when it is preceded by a non-letter
# (or the start of the title) and followed by a non-letter (or the end), so
# that e.g. "ai" is not found inside a longer word.
re_not_character_or_beginning = r'(?:[^a-z]|^)'
re_not_character_or_end = r'(?:[^a-z]|$)'

# Closing guard of one keyword, alternation, opening guard of the next one.
keyword_separator = f'{re_not_character_or_end}|{re_not_character_or_beginning}'
guarded_keywords = keyword_separator.join(words_to_check_pp)
pattern = f'{re_not_character_or_beginning}{guarded_keywords}{re_not_character_or_end}'

# Flag stored as 0/1: 1 when the title contains at least one guarded keyword.
title_has_ml_word = df_podcast["title"].str.contains(pattern, case=False, regex=True)
df_podcast["has_ml_word"] = title_has_ml_word.astype(int)

In [141]:
# Page through the titles to validate by hand: `increment` rows per batch;
# bump `batch` and re-run the cell to see the next page.
batch = 0
increment = 9

# Only titles that contain an ML term but whose guest is not already in the
# relevant-guest list need a manual decision.
df_to_check = df_podcast.loc[
    (df_podcast.has_relevant_guest == 0) & (df_podcast.has_ml_word == 1)
]

batch_start = batch * increment
batch_rows = df_to_check[["id", "guest", "title"]].iloc[batch_start: batch_start + increment]

# The loop variable is `podcast_id`, not `id`: at notebook scope `id` would
# shadow the builtin for every later cell.
for podcast_id, guest, title in batch_rows.itertuples(index=False, name=None):
    print(f"{podcast_id}: {guest}\n{title}\n")

98195452: roman yampolskiy
dangers of superintelligent ai

86440844: guillaume verdon
beff jezos, e/acc movement, physics, computation & agi

68071720: elon musk
war, ai, aliens, politics, physics, video games, and humanity

37124594: george hotz
tiny corp, twitter, ai safety, self-driving, and god

60879511: marc andreessen
future of the internet, technology, and ai

99072863: mark zuckerberg
future of ai at meta, facebook, instagram, and whatsapp

41763735: chris lattner
future of programming and ai

68276965: manolis kellis
evolution of human civilization and superintelligent ai

62663336: eliezer yudkowsky
dangers of ai and the end of human civilization



In [9]:
# Ids of the podcasts marked as positive during the manual validation.
manual_positive_id = [
    98195452,
    41763735,
    68276965,
    59789751,
    60449962,
    93271390,
    79695739,
    31945141,
    58779362,
]

is_manual_positive = df_podcast["id"].isin(manual_positive_id)
df_podcast["manual_label"] = is_manual_positive.astype(int)

# Final label: positive when picked by hand OR when the guest is a listed one.
combined_label = df_podcast["manual_label"] | df_podcast["has_relevant_guest"]
df_podcast["label"] = combined_label.astype(int)

In [148]:
# Fixed random_state so the same ten rows are shown on every run.
df_podcast.sample(10, random_state=0)

Unnamed: 0,id,guest,title,url_transcript,has_ml_word,has_relevant_guest,manual_label,label
207,86694947,richard wrangham,"violence, sex, and fire in human evolution",,0,0,0,0
397,11647340,francois chollet,"keras, deep learning, and the progress of ai",,1,1,0,1
229,46475578,chris duffin,powerlifting and the engineering of strength,,0,0,0,0
284,48181633,dan gable,olympic wrestling,,0,0,0,0
29,86440844,guillaume verdon,"beff jezos, e/acc movement, physics, computati...",https://lexfridman.com/guillaume-verdon-transc...,1,0,0,0
288,59789751,charles isbell and michael littman,machine learning and education,,1,1,1,1
322,46097646,russ tedrake,underactuated robotics,,0,0,0,0
362,39332756,andrew ng,"deep learning, education, and real-world ai",,1,1,0,1
50,60879511,marc andreessen,"future of the internet, technology, and ai",,1,0,0,0
198,34629302,francis collins,national institutes of health,,0,0,0,0


#### Data augmentation

In [1]:
import nltk
# Download the WordNet and stopwords corpora.
nltk.download('wordnet')
nltk.download('stopwords')

# WordNet corpus reader, available under the alias `wn`.
from nltk.corpus import wordnet as wn


[nltk_data] Downloading package wordnet to /home/pierre/nltk_data...
[nltk_data] Downloading package stopwords to /home/pierre/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [34]:
# get_synonyms is defined in a later cell of this notebook; that cell has to be run first.
get_synonyms("ape")

['copycat', 'caricature', 'emulator', 'anthropoid', 'imitator', 'aper']

In [33]:
# Show, for every WordNet synset of `word`, its lemmas and their names.
word = "ape"

syns = wn.synsets(word)

for syn in syns:
    lemmas = syn.lemmas()
    lemma_names = [lemma.name() for lemma in lemmas]

    print(f"syn: {syn}")
    print(f"lemmas: {lemmas}")
    print(f"lemma name: {lemma_names}")
    print('\r')

syn: Synset('ape.n.01')
lemmas: [Lemma('ape.n.01.ape')]
lemma name: ['ape']

syn: Synset('copycat.n.01')
lemmas: [Lemma('copycat.n.01.copycat'), Lemma('copycat.n.01.imitator'), Lemma('copycat.n.01.emulator'), Lemma('copycat.n.01.ape'), Lemma('copycat.n.01.aper')]
lemma name: ['copycat', 'imitator', 'emulator', 'ape', 'aper']

syn: Synset('anthropoid.n.01')
lemmas: [Lemma('anthropoid.n.01.anthropoid'), Lemma('anthropoid.n.01.ape')]
lemma name: ['anthropoid', 'ape']

syn: Synset('ape.v.01')
lemmas: [Lemma('ape.v.01.ape')]
lemma name: ['ape']

syn: Synset('caricature.v.01')
lemmas: [Lemma('caricature.v.01.caricature'), Lemma('caricature.v.01.ape')]
lemma name: ['caricature', 'ape']



In [3]:
def get_synonyms(word):
    """
    Return the WordNet synonyms of `word`.

    Collects the lemma names of every synset WordNet returns for `word`,
    removes duplicates and drops `word` itself.

    Returns a list of lemma names, in no particular order.
    """
    # `wn` is the alias under which this notebook imports the WordNet reader;
    # the bare name `wordnet` is never imported and raised a NameError on a
    # fresh kernel.
    synonyms = {lemma.name() for syn in wn.synsets(word) for lemma in syn.lemmas()}
    synonyms.discard(word)
    return list(synonyms)

In [10]:
# Multi-word lemma names come back joined with underscores (e.g. 'cable_car' in the output below).
get_synonyms('car')

['machine',
 'cable_car',
 'gondola',
 'motorcar',
 'railcar',
 'railroad_car',
 'railway_car',
 'elevator_car',
 'automobile',
 'auto']

In [None]:
def synonym_replacement(words, n, stop_words=None):
    """
    Build a sentence from `words` in which up to `n` distinct words are
    replaced by a randomly chosen synonym.

    Parameters
    ----------
    words : sequence of str
        Tokens of the sentence. The sequence itself is not modified.
    n : int
        Maximum number of distinct words to replace. Every occurrence of a
        selected word is replaced by the same synonym.
    stop_words : collection of str, optional
        Words that are never replaced. Defaults to NLTK's English stop words.

    Returns
    -------
    str
        The resulting tokens joined by single spaces.
    """
    if stop_words is None:
        # Loaded here because no `stop_words` name is defined at notebook
        # level; the stopwords corpus is downloaded by an earlier cell.
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))

    new_words = list(words)

    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # seeded np.random yields the same shuffle on every run (the iteration
    # order of a set of strings can change between interpreter sessions).
    candidates = list(dict.fromkeys(word for word in words if word not in stop_words))
    np.random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        # Checked before replacing so that n <= 0 really replaces nothing.
        if num_replaced >= n:
            break

        # Sorted for the same reproducibility reason as above.
        synonyms = sorted(get_synonyms(candidate))
        if not synonyms:
            continue

        # np.random is used for the pick as well, so that a single seed
        # controls the whole function (`random` is not imported here).
        synonym = synonyms[np.random.randint(len(synonyms))]
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1

    return ' '.join(new_words)


In [152]:
# The WordNet reader is imported under the alias `wn` in this notebook.
wn

<WordNetCorpusReader in '.../corpora/wordnet' (not loaded yet)>

##### Back translation

In [13]:
# List the translation back-ends exposed by the `translators` package.
print(ts.translators_pool)

['alibaba', 'apertium', 'argos', 'baidu', 'bing', 'caiyun', 'cloudTranslation', 'deepl', 'elia', 'google', 'hujiang', 'iciba', 'iflytek', 'iflyrec', 'itranslate', 'judic', 'languageWire', 'lingvanex', 'niutrans', 'mglip', 'mirai', 'modernMt', 'myMemory', 'papago', 'qqFanyi', 'qqTranSmart', 'reverso', 'sogou', 'sysTran', 'tilde', 'translateCom', 'translateMe', 'utibet', 'volcEngine', 'yandex', 'yeekit', 'youdao']


In [33]:
# One round trip English -> intermediate language -> English on a toy sentence.
txt = "Hello mummy, you are the prettiest! I am so glad I am your son."
_translator = "google"
intermediate_language = "fr"

forward_options = dict(
    translator=_translator,
    from_language="en",
    to_language=intermediate_language,
)
backward_options = dict(
    translator=_translator,
    from_language=intermediate_language,
    to_language="en",
)

txt_translated = ts.translate_text(txt, **forward_options)
txt_back_translated = ts.translate_text(txt_translated, **backward_options)

# Original, translation and back-translation, one per line.
print(txt, txt_translated, txt_back_translated, sep="\n")

Hello mummy, you are the prettiest! I am so glad I am your son.
Bonjour maman, tu es la plus jolie! Je suis tellement content d'être ton fils.
Hello mom, you are the prettiest! I'm so happy to be your son.


In [53]:
def back_translate(
    txt: str,
    translator: str,
    intermediate_language: str, 
    original_language: str="en"
    ) -> str:
    """
    Translate `txt` from `original_language` to `intermediate_language` and
    back again with the given `translator`, and return the resulting text.
    """
    # The two legs of the round trip, as (source language, target language).
    legs = (
        (original_language, intermediate_language),
        (intermediate_language, original_language),
    )

    current_txt = txt
    for source_language, target_language in legs:
        current_txt = ts.translate_text(
            current_txt,
            translator=translator,
            from_language=source_language,
            to_language=target_language,
        )

    return current_txt

def augment_with_backtranslation(
    txt:str,
    translators: Sequence[str],
    intermediate_languages: Sequence[str],
    )-> Sequence[str]:
    """
    Augment `txt` by back-translating it through every combination of the
    given translators and intermediate languages (cross product of the two).

    Each result is HTML-unescaped and lower-cased. Duplicates are removed, and
    so is any result equal to the lower-cased input. Results are returned in
    the order of the (translator, language) pairs that first produced them.
    """
    # Candidates are lower-cased, so they must be compared with the lower-cased
    # input: comparing with the raw `txt` never filtered an unchanged
    # back-translation when `txt` contained capital letters.
    original_lowered = txt.lower()

    candidates = (
        unescape(back_translate(txt, translator, language)).lower()
        for translator, language in product(translators, intermediate_languages)
    )

    # dict.fromkeys de-duplicates while keeping a stable order; a set would
    # return the texts in an arbitrary order.
    return [
        candidate
        for candidate in dict.fromkeys(candidates)
        if candidate != original_lowered
    ]

In [26]:
# Translation back-ends used for the back-translation.
translators = [
    "google",
    "alibaba",
]

# Languages each text is translated to before being translated back.
intermediate_languages = [
    "ja", # Japanese
    "fr", # French
    "es", # Spanish
    "zh", # Chinese
    "de", # German
    "no" # Norwegian
]


In [54]:
# Back-translate one title through every (translator, language) pair.
# NOTE(review): the hard-coded index depends on when the page was scraped.
# The output of titles[3] in this notebook is the "Perplexity CEO" title,
# while the earlier classification output lists that title third.
new_txts = augment_with_backtranslation(
    titles[3],
    translators,
    intermediate_languages,
)

In [50]:
# The title that is passed to augment_with_backtranslation.
titles[3]

'Perplexity CEO on Future of AI, Search & the Internet'

In [55]:
# Back-translated variants returned by augment_with_backtranslation.
new_txts

['confusion -ceo for future of ai, search & the internet',
 'ceo of perplexity about the future of ai, the search and internet',
 'perplecomplexity ceo on the future of ai, research and the internet',
 'the ceo of perplexity on the future of ai, search and the internet',
 'ceo of perplexity on the future of ai, search & internet',
 'regarding ai, search and the future of the internet confusing ceo',
 'ceo at a loss about the future of ai, search & internet',
 'ai, search, internet future perplexityceo',
 'prison ceo for the future of ai, search and internet',
 'ceo in puzzlement over ai, search and internet future',
 'future managing director of ai, search & internet',
 'ceo confusion on artificial intelligence, search and the future of the internet']

In [None]:
def augment_back_translation(
    translators=["alibaba", "google"],
    translators=["alibaba", "google"],
)