# Blendle Topic & Event Extraction NL

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re
import pickle
import json
import os

import umap
import hdbscan

from tqdm import tqdm
from summarizer import Summarizer

from top2vec import Top2Vec

from nltk.corpus import stopwords
import spacy

## Importing and preprocessing data

The following functions are used to load the JSON data in as a dataframe and preprocess it

In [2]:
def import_json_data(path):
    """
    import_json_data imports JSON data and keeps only the id, date,
    headline and content of an article.

    The file is read as one JSON object per line. Every object must provide
    the keys 'id', 'date' and 'body'; 'body' is iterated element by element.
    Elements with type 'hl1' supply the headline (the last one seen wins),
    elements with type 'p' supply the paragraphs, which are joined with a
    single space to form the content.

    :path: path to raw JSON data
    :return: list of dicts where each dict is a article. The 'headline' key
             is absent when no headline element was found.
    :raises KeyError: when an article has no 'id', 'date' or 'body' key.
    """
    # Indexing a body element that is not a dict, or that lacks the
    # 'type'/'content' key, raises one of these.
    malformed_element = (KeyError, TypeError)

    # The context manager guarantees the file handle is closed again.
    with open(path, 'r') as handle:
        data = [json.loads(line) for line in handle]

    articles = []
    for d in data:
        article = {
            'id' : d['id'],
            'date' : d['date'],
        }

        # Headline search stops at the first malformed element. This mirrors
        # the original behaviour on purpose: it decides which rows later
        # survive dropna(), and therefore the row numbering of the dataframe.
        for i in d['body']:
            try:
                if i['type'] == 'hl1':
                    article['headline'] = i['content']
            except malformed_element:
                break

        # Paragraph collection skips malformed elements instead of stopping.
        content = []
        for i in d['body']:
            try:
                if i['type'] == 'p':
                    content.append(i['content'])
            except malformed_element:
                continue

        article['content'] = ' '.join(content)
        articles.append(article)

    return articles

In [3]:
def preprocess_data(data):
    """
    Build a cleaned Pandas DataFrame from a list of article dicts.

    The dict keys become the columns. Rows whose content duplicates another
    row are dropped (the last occurrence is kept), HTML tags and "&nbsp;"
    entities are removed from the content, an 'article' column is built as
    "<headline> <content>", the date column is parsed to datetimes, and
    finally rows with missing values and fully identical rows are dropped.

    :data: list of dicts of articles.

    :return: dataframe.
    """
    tag_pattern = re.compile(r'<[^>]+>')

    def strip_markup(text):
        # tags are removed first, then the non-breaking-space entity
        return tag_pattern.sub('', text).replace("&nbsp;", " ")

    # de-duplicate on the raw content, i.e. before any markup is stripped
    frame = pd.DataFrame(data).drop_duplicates(subset='content', keep="last")

    frame = frame.assign(content=frame['content'].apply(strip_markup))
    frame = frame.assign(
        # headline and cleaned content joined by a single space
        article=frame['headline'] + ' ' + frame['content'],
        date=pd.to_datetime(frame['date']),
    )

    without_missing = frame.dropna()
    return without_missing.drop_duplicates()

In [4]:
def create_df(path):
    """
    Loops over directory and calls import_json_data and preprocess_data on
    every file in it to return one combined dataframe.

    Files are processed in sorted name order so that the row order, and
    with it the 'index' column, is the same on every run and machine
    (os.listdir itself returns entries in arbitrary order). Entries that are
    not regular files (e.g. sub-directories) are skipped.

    :path: path to the directory with JSON files, with or without a
           trailing separator.

    :return: dataframe with a fresh 0..n-1 index and an 'index' column
             holding that same number.
    :raises ValueError: raised by pd.concat when the directory holds no files.
    """
    file_paths = [
        os.path.join(path, filename) for filename in sorted(os.listdir(path))
    ]
    file_paths = [file_path for file_path in file_paths if os.path.isfile(file_path)]

    frames = []
    for file_path in tqdm(file_paths):
        data = import_json_data(file_path)
        frames.append(preprocess_data(data))

    df = pd.concat(frames).reset_index(drop=True)

    # add index for topics: add_topic later maps this column through
    # get_topic to look up each article in the Top2Vec model
    df['index'] = df.index

    return df

In [5]:
# Directory that holds the raw JSON article files; every file in it is loaded
# and preprocessed by create_df and the results are concatenated.
PATH = 'data_nl/'
df = create_df(PATH)
# Last expression of the cell: displays a truncated view of the full frame.
df

100%|██████████| 2/2 [01:51<00:00, 55.51s/it]


Unnamed: 0,id,date,headline,content,article,index
0,bnl-kijk-20200101-5_1,2020-01-01 00:00:00+00:00,KESSELER,ETENSCHAP Vanaf dit nummer neemt Eric de Kruij...,KESSELER ETENSCHAP Vanaf dit nummer neemt Eric...,0
1,bnl-kijk-20200101-8_2,2020-01-01 00:00:00+00:00,GESTROOMLIJND,Terwijl mensen iedere dag honderden kilometers...,GESTROOMLIJND Terwijl mensen iedere dag honder...,1
2,bnl-kijk-20200101-6_2,2020-01-01 00:00:00+00:00,GENERATIEKLOOF,In 2016 kwam in een rapport naar buiten dat he...,GENERATIEKLOOF In 2016 kwam in een rapport naa...,2
3,bnl-kijk-20200101-79_1,2020-01-01 00:00:00+00:00,VELDHUIZEN WINTERDIP,Zucht. Kreun. De donkere dagen zijn er weer. M...,VELDHUIZEN WINTERDIP Zucht. Kreun. De donkere ...,3
4,bnl-kijk-20200101-68_2,2020-01-01 00:00:00+00:00,Minderfoon,KICKSTART! Een minimalistische telefoon. Dat i...,Minderfoon KICKSTART! Een minimalistische tele...,4
...,...,...,...,...,...,...
790029,bnl-nieuwerevu-20190710-605334bb3a9,2019-07-10 00:00:00+00:00,Julio Iglesias,Hij werd geboren op 23 september 1943 als zoon...,Julio Iglesias Hij werd geboren op 23 septembe...,790029
790030,bnl-womenshealth-20190314-785d6d95f51,2019-03-14 00:00:00+00:00,De babyvraag,‘Wanneer gaan jullie eindelijk eens aan kinder...,De babyvraag ‘Wanneer gaan jullie eindelijk ee...,790030
790031,bnl-lhomo-20190416-dbb7d713c58,2019-04-16 00:00:00+00:00,MICHAEL,“In de operawereld geldt: geen gezichtsbeharin...,MICHAEL “In de operawereld geldt: geen gezicht...,790031
790032,bnl-knack-20190109-d4ae02635a9,2019-01-09 00:00:00+00:00,"Leidt duaal leren tot jobs, jobs, jobs?",We weten het al langer: er zijn in ons land te...,"Leidt duaal leren tot jobs, jobs, jobs? We wet...",790032


Let's check out an article.

In [85]:
df['article'][0]

'KESSELER ETENSCHAP Vanaf dit nummer neemt Eric de Kruijk ons maandelijks mee de keuken in. Niet om allerlei fijne gerechten klaar te maken - want dat zou niet zo bij dit blad passen - maar om te koken op de KIJK-manier. Nou heb ik op zich niet zoveel met eten bereiden. Het is niet mijn hobby en het helpt zeker niet als ik een of andere chef-kok op tv hoor praten over ‘een zuurtje’ in een gerecht. Eentje die het niet over lekkere, maar over een ‘mooie’ wijn heeft. Of zo’n witmuts die niet alleen het woord ‘zuurtje’ in de mond neemt, maar alles wat hij of zij vastpakt in verkleinwoordjes beschrijft. “Dan pakken we er een paprikaatje bij, we snipperen met ons mesje een uitje en dat doen we dan in het pannetje…” Ik heb dan ineens geen trek meer. En breek me de bek niet open over mensen die zonder een spoor van cynisme beweren: “Ja, ik hou heel erg van lekker eten.” Oh, really…!? Ik ben ook, moet ik eerlijk bekennen, niet zo’n keukenprins. Mijn vrouw kan met een halve tomaat, een scheut yo

Now we need to remove numbers, punctuation and other non-alphabetical items. Furthermore, we set everything to lowercase and remove the stop words and single letters. The author of *Top2Vec* actually says this isn't necessary, as the HDBSCAN clustering algorithm will automatically declare these words as outliers. But we thought it wouldn't hurt to try.

Following code is from: https://github.com/Tempelman45286

In [86]:
# Extra Dutch stop words. In the cleaning cell below this list is appended to
# NLTK's Dutch stop-word list before stop words are removed from the articles.
# All entries are lowercase, matching the lowercased article text they are
# compared against.
stop_words_extra = ['aan','aangaande','aangezien','al','aldaar','aldus','alhoewel',
 'alias','alle','allebei','alsnog','altijd','altoos','ander','andere','anders',
 'anderszins','behalve','behoudens','beide','beiden','ben','beneden','bent','bepaald','betreffende',
 'bij','binnenin','boven','bovenal','bovendien','bovengenoemd','bovenstaand','bovenvermeld',
 'daar','daarheen','daarin','daarna','daarnet','daarom','daarop','daarvanlangs','dan',
 'dat','de','die','dikwijls','dit','door','doorgaand','dus','echter','eerdat',
 'eerlang','elk','elke','en','enig','enigszins','er','erdoor','even','eveneens',
 'evenwel','gauw','gedurende','gehad','gekund','geleden','gemoeten','gemogen',
 'geweest','gewoon','gewoonweg','haar','had','hadden','hare','heb','hebben','hebt','heeft','hem',
 'hen','het','hierbeneden','hierboven','hij','hoe','hoewel','hun','hunne','ik','ikzelf','in',
 'inmiddels','inzake','is','jezelf','jij','jijzelf','jou','jouw','jouwe','juist','jullie','kan',
 'klaar','kon','konden','krachtens','kunnen','kunt','later','liever','maar','mag','meer','met',
 'mezelf','mij','mijn','mijnent','mijner','mijzelf','misschien','mocht','mochten','moest','moesten',
 'moet','moeten','mogen', 'n', 'na','naar','nadat','net','niet','noch','nog','nogal','nu','of','ofschoon',
 'om','omdat','omstreeks','omtrent','omver','onder','ondertussen','ongeveer','ons',
 'onszelf','onze','ook','op','opnieuw','opzij','over','overigens','pas','precies','reeds',
 'rondom','sedert','sinds','s', 'sindsdien','sommige','spoedig','steeds','tamelijk','tenzij',
 'terwijl','thans','tijdens','toch','toen','toenmaals','toenmalig','tot','totdat','tussen','uit',
 'vaak','van','vandaan','vanuit','vanwege','veeleer','verder','vervolgens','vol',
 'volgens','voor','vooraf','vooral','vooralsnog','voorbij','voordat','voorheen',
 'vrij','vroeg','waar','waarom','wanneer','want','waren','was','wat',
 'wegens','wel', 'we', 'weldra','welk','welke','wie','wiens','wier','wij','wijzelf','zal',
 'ze','zelfs','zichzelf','zij','zijn','zijne','zo','zodra','zonder','zou','zouden','zowat','zulke',
 'zullen','zult', 'zegt']

In [87]:
# remove numbers, punctuation and other non-alphabetical items
# remove numbers, punctuation and other non-alphabetical items
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text untouched.
# (The former second replace of quote characters was dropped: after this
# substitution no quote character can be left in the text.)
# NOTE(review): the ASCII-only character class also replaces accented letters
# with a space (the sample output shows "ingredi nten"); left unchanged so the
# cleaned text stays the same as before - confirm whether that is intended.
cleaned_articles = df['article'].str.replace("[^a-zA-Z#]", " ", regex=True)

# normalize to lowercase
cleaned_articles = cleaned_articles.str.lower()

# stop word removal
stop_words = stopwords.words('dutch')
stop_words = stop_words + stop_words_extra
# set membership is O(1) per word; the list above is kept under its old name
stop_word_set = set(stop_words)

# drop stop words and single letters in one pass over each article
df['article'] = cleaned_articles.apply(
    lambda text: ' '.join(
        word for word in text.split()
        if len(word) > 1 and word not in stop_word_set
    )
)

Let's look at the same article again after preprocessing.

In [88]:
df['article'][0]

'kesseler etenschap vanaf nummer neemt eric kruijk maandelijks mee keuken allerlei fijne gerechten maken blad passen koken kijk manier nou zoveel eten bereiden hobby helpt zeker chef kok tv hoor praten zuurtje gerecht eentje lekkere mooie wijn witmuts alleen woord zuurtje mond neemt vastpakt verkleinwoordjes beschrijft pakken paprikaatje snipperen mesje uitje pannetje ineens trek breek bek open mensen spoor cynisme beweren hou heel erg lekker eten oh really eerlijk bekennen keukenprins vrouw halve tomaat scheut yoghurt overgebleven ham verloren paprika volledige maaltijd tafel toveren ga koken alleen gedetailleerd aanvalsplan welhaast militaire precisie uitgevoerd komt boodschappenlijst gooi supermarktkar nodig eenmaal thuis verdeel zowel kookgerei ingredi nten strategisch inzetgebied niks peper zout smaak toevoegen geef maten hoeveelheden harde cijfers kortom koken doe gevoel eric kookidee kwam desondanks volledige aandacht laat kennismaken wetenschappelijk koken eierpan gebruiken vaa

In [91]:
df.head()

Unnamed: 0,id,date,headline,content,article,index
0,bnl-kijk-20200101-5_1,2020-01-01 00:00:00+00:00,KESSELER,ETENSCHAP Vanaf dit nummer neemt Eric de Kruij...,kesseler etenschap vanaf nummer neemt eric kru...,0
1,bnl-kijk-20200101-8_2,2020-01-01 00:00:00+00:00,GESTROOMLIJND,Terwijl mensen iedere dag honderden kilometers...,gestroomlijnd mensen iedere dag honderden kilo...,1
2,bnl-kijk-20200101-6_2,2020-01-01 00:00:00+00:00,GENERATIEKLOOF,In 2016 kwam in een rapport naar buiten dat he...,generatiekloof kwam rapport buiten amerikaanse...,2
3,bnl-kijk-20200101-79_1,2020-01-01 00:00:00+00:00,VELDHUIZEN WINTERDIP,Zucht. Kreun. De donkere dagen zijn er weer. M...,veldhuizen winterdip zucht kreun donkere dagen...,3
4,bnl-kijk-20200101-68_2,2020-01-01 00:00:00+00:00,Minderfoon,KICKSTART! Een minimalistische telefoon. Dat i...,minderfoon kickstart minimalistische telefoon ...,4


Now we convert the article column to a list for training.

In [92]:
# create list of articles to cluster model
train = df.article.tolist()
print(f'There are {len(train)} articles.')

There are 790034 articles.


## Top2Vec

For more information see:
https://github.com/ddangelov/Top2Vec

Basically what this does is:

1. Create jointly embedded document and word vectors using Doc2Vec.
2. Create lower dimensional embedding of document vectors using UMAP.
3. Find dense areas of documents using HDBSCAN.
4. For each dense area calculate the centroid of document vectors in original dimension, this is the topic vector.
5. Find n-closest word vectors to the resulting topic vector.

The embedding part took quite long (10+ hours) so we provided the embeddings for you already.

### Do not run the following cell unless you want to run the embeddings yourself

In [19]:
# Trains Top2Vec on the full article list with default settings. The log below
# runs from 11:04 to 00:17 the next day (roughly 13 hours), most of it spent in
# the joint document/word embedding step.
model = Top2Vec(train)

2021-01-26 11:04:09,620 - top2vec - INFO - Pre-processing documents for training
2021-01-26 11:10:09,821 - top2vec - INFO - Creating joint document/word embedding
2021-01-26 22:16:02,787 - top2vec - INFO - Creating lower dimension embedding of documents
2021-01-27 00:16:41,074 - top2vec - INFO - Finding dense areas of documents
2021-01-27 00:17:40,481 - top2vec - INFO - Finding topics


### Use the following cell to download and load embeddings

In [209]:
# Fetch the pre-trained model archive and unpack it into the working directory.
# NOTE(review): the next cell loads a file named "modeltop2vec", so the archive
# is presumably expected to extract to that name - confirm after unzipping.
!wget https://www.dropbox.com/s/n6g1uega1jj9456/modeltop2vec.zip
!unzip modeltop2vec.zip

In [None]:
# Load the saved Top2Vec model from the working directory; every later cell
# that takes or uses `model` refers to this object.
model = Top2Vec.load("modeltop2vec")

## Event extraction

Now that everything is clustered, we add the topic, topic score, topic words, and word scores to our dataframe.

In [212]:
def get_topic(index, top2vec_model=None):
    """
    Returns the topic number, topic score, topic words, and
    word scores.

    :index: the index of the article
    :top2vec_model: Top2Vec model to query. When omitted, the notebook-level
                    variable `model` is used, which keeps existing
                    single-argument calls working.

    returns: topic_num, topic_score, topic_words, word_scores
    """
    if top2vec_model is None:
        # backward-compatible fallback to the global model
        top2vec_model = model

    # the model is queried with a one-element list, so each returned
    # sequence holds exactly one entry for this article
    topic_num, topic_score, topic_words, word_scores = top2vec_model.get_documents_topics([index])
    return topic_num[0], topic_score[0], topic_words[0], word_scores[0]

def add_topic(dataframe, model):
    """
    Uses get_topic to add the return values to the dataframe.

    The input dataframe is not modified; a copy with the four extra columns
    'topic', 'topic_score', 'topic_words' and 'word_scores' is returned.

    :dataframe: dataframe with index column
    :model: Top2Vec model that is queried for every article

    :return: copy of the dataframe with the topic columns added
    """
    result = dataframe.copy()
    # registers progress_map on pandas objects
    tqdm.pandas()

    # pass the model argument through explicitly instead of relying on
    # whichever global `model` happens to exist
    per_article = result['index'].progress_map(
        lambda doc_index: get_topic(doc_index, model)
    )
    # transpose the per-article 4-tuples into four column-wise sequences
    result['topic'], result['topic_score'], result['topic_words'], result['word_scores'] = zip(*per_article)

    return result

In [129]:
# Look up the topic information for every article (one model query per row;
# the progress bar below shows about five minutes for 790,034 rows).
clean_df = add_topic(df, model)
clean_df.head()

100%|██████████| 790034/790034 [04:54<00:00, 2680.96it/s]


Unnamed: 0,id,date,headline,content,article,index,topic,topic_score,topic_words,word_scores
0,bnl-kijk-20200101-5_1,2020-01-01 00:00:00+00:00,KESSELER,ETENSCHAP Vanaf dit nummer neemt Eric de Kruij...,kesseler etenschap vanaf nummer neemt eric kru...,0,256,0.483014,"(kookboek, kookboeken, recepten, ottolenghi, c...","(0.88991475, 0.8513851, 0.8509706, 0.81747836,..."
1,bnl-kijk-20200101-8_2,2020-01-01 00:00:00+00:00,GESTROOMLIJND,Terwijl mensen iedere dag honderden kilometers...,gestroomlijnd mensen iedere dag honderden kilo...,1,194,0.578721,"(biologen, soortgenoten, prooien, diertjes, di...","(0.7165615, 0.71246326, 0.70897686, 0.7057693,..."
2,bnl-kijk-20200101-6_2,2020-01-01 00:00:00+00:00,GENERATIEKLOOF,In 2016 kwam in een rapport naar buiten dat he...,generatiekloof kwam rapport buiten amerikaanse...,2,106,0.439223,"(draadloos, draadloze, bluetooth, batterijduur...","(0.82850146, 0.8162906, 0.81580406, 0.80249894..."
3,bnl-kijk-20200101-79_1,2020-01-01 00:00:00+00:00,VELDHUIZEN WINTERDIP,Zucht. Kreun. De donkere dagen zijn er weer. M...,veldhuizen winterdip zucht kreun donkere dagen...,3,294,0.301987,"(nachtrust, melatonine, slaap, slaapritme, sla...","(0.83320475, 0.8173072, 0.8153957, 0.8125104, ..."
4,bnl-kijk-20200101-68_2,2020-01-01 00:00:00+00:00,Minderfoon,KICKSTART! Een minimalistische telefoon. Dat i...,minderfoon kickstart minimalistische telefoon ...,4,106,0.56064,"(draadloos, draadloze, bluetooth, batterijduur...","(0.82850146, 0.8162906, 0.81580406, 0.80249894..."


We can use the following method to search for words that are similar to a given keyword.

In [206]:
# Ask the model for the 20 words it ranks as most similar to 'blm' (no negative
# keywords) and print each word next to its score.
words, word_scores = model.similar_words(keywords=["blm"], keywords_neg=[], num_words=20)
for word, score in zip(words, word_scores):
    print(f"{word} {score}")

lives 0.8804712409869928
matter 0.8528234634574865
antiracisme 0.7437705344080188
politiegeweld 0.7354127967521926
institutioneel 0.6994888348163792
racisme 0.6793980254152143
antiracistische 0.675056658168099
activisme 0.6426319480951161
raciale 0.6401922077735156
racismedebat 0.6260608625020267
floyd 0.62501968649201
minneapolis 0.6223313075288885
antifa 0.5962253562344414
systemisch 0.5953122551738332
racisten 0.5933446765651947
woke 0.5910441653407361
racist 0.5814439985771073
zwarten 0.5814288005758323
black 0.5814079241371566
floyds 0.5807496749544157


The following functions help us get the topic number for a keyword.

In [213]:
def get_topic_num(model, keyword, num_topics=5):
    """
    Returns the topic words and number of a specific keyword

    :model: Top2Vec model
    :keyword: keyword (str) of topic you want to find
    :num_topics: max number of topics returned

    returns: (topic_words, topic_nums), or None when the search fails
             (e.g. the model cannot search for the keyword); in that case
             a message is printed.
    """
    try:
        # forward the caller's num_topics instead of a hard-coded 5
        topic_words, _, _, topic_nums = model.search_topics(
            keywords=[keyword], num_topics=num_topics
        )
    except Exception:
        # best-effort lookup: report and signal failure with None, but let
        # KeyboardInterrupt / SystemExit propagate
        print('No topic found for keyword!')
        return None

    return topic_words, topic_nums

In [249]:
get_topic_num(model, 'blm')

([array(['minneapolis', 'politiegeweld', 'floyds', 'kenosha', 'ongewapende',
         'breonna', 'chauvin', 'floyd', 'politieoptreden', 'louisville',
         'lives', 'rittenhouse', 'politiekogels', 'plunderaars',
         'plunderingen', 'matter', 'blm', 'traangas', 'rubberkogels',
         'betoger', 'rellen', 'ordediensten', 'ordehandhavers', 'trayvon',
         'demonstrant', 'arrestant', 'politieagent', 'protestmarsen',
         'rassenrellen', 'politieagenten', 'demonstranten', 'betogers',
         'straatgeweld', 'ordetroepen', 'betogingen', 'relschoppers',
         'nekklem', 'vreedzaam', 'hardhandig', 'oproerpolitie', 'tomy',
         'ongewapend', 'protestgolf', 'manifestanten', 'breathe', 'antifa',
         'portland', 'vreedzame', 'charlottesville', 'politiekorps'],
        dtype='<U15'),
  array(['racisme', 'huidskleur', 'institutioneel', 'antiracisme',
         'antiracistische', 'racistisch', 'antiracisten', 'gediscrimineerd',
         'raciale', 'discriminatie', 'racis

As you can see, we get multiple topics per keyword. **The lower the topic number, the bigger the cluster.**
Some topic clusters are about a specific event inside a bigger story (e.g. the destruction of statues as part of the wider BLM story), so it can be useful to combine some topics when creating the timeline.

The function below returns true or false depending on how similar the topic words are. Using this function we can combine topics about the same general story.

In [225]:
def should_combine(topic_words1, topic_words2, similar_entries = 5):
    """
    Returns boolean stating whether 1 and 2 have more than 'similar_entries'
    of the same words

    The shared words are counted directly, so the two inputs may have
    different lengths and may be plain lists as well as numpy arrays.
    Repeated words inside one input are counted once.

    :topic_words1: 1st list of topic words
    :topic_words2: 2nd list of topic words
    :similar_entries: number of shared words that must be exceeded

    :returns: boolean
    """
    # intersect1d returns the unique values present in both inputs
    shared_words = np.intersect1d(topic_words1, topic_words2)

    return bool(shared_words.size > similar_entries)

In [247]:
def combine_topics(model, dataframe, topic_words, topic_nums):
    """
    Returns combined dataframe of multiple topics

    The first entry of topic_words is taken as the reference topic. Every
    topic whose words overlap enough with the reference (as decided by
    should_combine) contributes its articles; the per-topic frames are
    concatenated in the order of topic_nums, keeping their index labels.

    :model: Top2Vec model (not used by this function; kept so existing
            calls keep working)
    :dataframe: dataframe with articles with topic column
    :topic_words: list of lists of topic words
    :topic_nums: list of topic numbers

    :returns: dataframe; an empty frame with the input's columns when no
              topic qualifies
    """
    # the topic_words list is sorted from most relevant to least relevant using
    # cosine similarity
    best_topic_words = topic_words[0]

    # Rows are selected here directly (rather than through get_topic_df,
    # which is only defined further down in the notebook) and collected in a
    # list so that a single concat replaces the removed DataFrame.append.
    frames = [
        dataframe.loc[dataframe['topic'] == topic_num]
        for words, topic_num in zip(topic_words, topic_nums)
        if should_combine(best_topic_words, words)
    ]

    if not frames:
        return dataframe.iloc[0:0]

    return pd.concat(frames)

In [250]:
# Find the topics for the keyword 'blm', then collect the articles of all
# returned topics whose words overlap enough with the first (best) topic.
blm_words, blm_nums = get_topic_num(model, 'blm')
blm_df = combine_topics(model, clean_df, blm_words, blm_nums)
blm_df.head()

Unnamed: 0,id,date,headline,content,article,index,topic,topic_score,topic_words,word_scores
5725,bnl-destandaard-20200224-b83ff06e_5661_11ea_b9...,2020-02-24 00:00:00+00:00,43 doden bij rellen na honderden gifaanvallen,Al enige tijd worden in Zambia mensen in hun h...,doden rellen honderden gifaanvallen enige tijd...,5725,296,0.36791,"(minneapolis, politiegeweld, floyds, kenosha, ...","(0.85683495, 0.84303415, 0.77328396, 0.7662617..."
17623,bnl-vkn-20200425-11976273,2020-04-25 00:00:00+00:00,Een rechte rug zegt niet alles,Deze week: de rug recht Er zijn politici die l...,rechte rug week rug recht politici langzaam gr...,17623,296,0.354175,"(minneapolis, politiegeweld, floyds, kenosha, ...","(0.85683495, 0.84303415, 0.77328396, 0.7662617..."
28012,bnl-par-20200530-12049459,2020-05-30 00:00:00+00:00,George Floyd,Wekenlang werden Amerikaanse overheidsgebouwen...,george floyd wekenlang werden amerikaanse over...,28012,296,0.485351,"(minneapolis, politiegeweld, floyds, kenosha, ...","(0.85683495, 0.84303415, 0.77328396, 0.7662617..."
28097,bnl-destandaard-20200602-9e4d50c0_a404_11ea_82...,2020-06-02 00:00:00+00:00,Het is genoeg geweest,"Zwart zijn zou geen doodstraf mogen zijn’, zei...",genoeg zwart doodstraf zei jacob frey burgemee...,28097,296,0.600254,"(minneapolis, politiegeweld, floyds, kenosha, ...","(0.85683495, 0.84303415, 0.77328396, 0.7662617..."
28210,bnl-detijd-20200530-94855,2020-05-30 00:00:00+00:00,Raciaal protest ontaardt in Amerikaanse binnen...,V oor de derde dag op rij is de Amerikaanse st...,raciaal protest ontaardt amerikaanse binnenste...,28210,296,0.737295,"(minneapolis, politiegeweld, floyds, kenosha, ...","(0.85683495, 0.84303415, 0.77328396, 0.7662617..."


We get the most important event by grouping all articles of a specific topic by date and, when a day has more than one article, choosing the article with the highest topic score.

In [318]:
def get_event(dataframe, n=1):
    """
    Extracts key event per day of dataframe of topic

    Articles are grouped per calendar day. A day with more than n articles
    contributes the single article with the highest topic_score. In
    addition, when n == 1, a day with exactly one article contributes that
    article. All other days contribute nothing.

    :dataframe: Pandas DataFrame of specific topic number; needs a datetime
                'date' column and a 'topic_score' column
    :n: number of articles a day must exceed to be classified as an
        event (int)

    :return: new dataframe indexed by article date, in chronological order.
             The 'date' column is kept as well, so the columns equal those
             of the input. Empty (same columns) when no day qualifies.
    """
    # Index by date for the daily grouping; drop=False keeps the real dates
    # in the 'date' column, and the axis name is cleared so the index label
    # does not clash with that column.
    articles = dataframe.set_index("date", drop=False).rename_axis(None)

    # Collect the selected rows and concatenate once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0).
    selected = []
    for _, group in articles.groupby(pd.Grouper(freq="D")):
        count = len(group.index)
        if count > n:
            best_position = group.topic_score.argmax()
            # double brackets keep the row as a one-row DataFrame
            selected.append(group.iloc[[best_position]])
        elif count > 0 and n == 1:
            # only reachable with exactly one article on that day
            selected.append(group)

    if not selected:
        return articles.iloc[0:0]

    return pd.concat(selected)


In [319]:
def get_topic_df(dataframe, topic):
    """
    Selects the rows of a dataframe that belong to one topic.

    :dataframe: Pandas DataFrame with a 'topic' column
    :topic: number (int)

    :return: new dataframe holding only the rows whose topic equals `topic`
    """
    belongs_to_topic = dataframe['topic'].eq(topic)
    return dataframe[belongs_to_topic]


In [317]:
def summerize_article(dataframe, n_sentences=3):
    """
    Creates new dataframe column with summarize of articles

    Every value of the 'content' column is passed to a Summarizer instance
    and the result is stored in a 'summary' column. The input dataframe is
    left untouched: a new dataframe is returned, which also avoids writing
    into a slice of another dataframe.

    :dataframe: Pandas DataFrame with a 'content' column
    :n_sentences: number of sentences the summary should be

    :return: new dataframe with the added 'summary' column
    """
    # A new Summarizer is created on every call.
    summarizer = Summarizer()

    # registers progress_apply on pandas objects
    tqdm.pandas()
    summaries = dataframe["content"].progress_apply(
        lambda text: summarizer(text, num_sentences=n_sentences)
    )

    # assign() returns a new frame instead of mutating the argument
    return dataframe.assign(summary=summaries)


In [315]:
def create_timeline(
    dataframe,
    topic,
    filename,
    model,
    start_date=False,
    end_date=False,
    summarize=True,
    n=1,
):
    """
    Saves timeline about a topic as text file to drive.

    :dataframe: Pandas DataFrame containing articles.
    :topic: topic number (int).
    :filename: output text file name.
    :model: Top2Vec model (not used by this function; kept so existing
            calls keep working).
    :start_date: start date of event. Default is no lower bound.
    :end_date: end date of event. Default is no upper bound.
    :summarize: set to false if you don't want a
                summarization of the content of an article.
    :n: number of articles a day must exceed to be classified as an
        event (int); see get_event

    :return: None
    """
    # use the dataframe that was passed in, not the notebook-level clean_df
    events = get_event(get_topic_df(dataframe, topic), n)

    # Either bound may be given on its own; an unset bound (False) becomes
    # None, i.e. an open end of the slice.
    if start_date or end_date:
        events = events.loc[(start_date or None):(end_date or None)]

    if summarize:
        events = summerize_article(events)

    # explicit encoding: headlines and summaries contain non-ASCII characters
    with open(filename, "w", encoding="utf-8") as f:
        for article in tqdm(events.itertuples()):
            entry = f"Date: {article.Index} \nHeadline: {article.headline} \n"
            if summarize:
                entry += f"Summary: {article.summary} \n"
            # blank line between timeline entries
            f.write(entry + "\n")

In [316]:
def create_timeline_from_keyword(
    dataframe,
    keyword,
    filename,
    model,
    start_date=False,
    end_date=False,
    summarize=True,
    n=1,
):
    """
    Saves timeline about a keyword as text file to drive.

    The topics found for the keyword are combined with combine_topics
    before the events are extracted.

    :dataframe: Pandas DataFrame containing articles.
    :keyword: keyword of topic (e.g. 'coronavirus')
    :filename: output text file name.
    :model: Top2Vec model.
    :start_date: start date of event. Default is no lower bound.
    :end_date: end date of event. Default is no upper bound.
    :summarize: set to false if you don't want a
                summarization of the content of an article.
    :n: number of articles a day must exceed to be classified as an
        event (int); see get_event

    :return: None
    :raises ValueError: when no topic is found for the keyword.
    """
    found = get_topic_num(model, keyword)
    if found is None:
        # get_topic_num signals a failed search with None; fail with a clear
        # message instead of an unpacking error
        raise ValueError(f"No topic found for keyword {keyword!r}; no timeline written.")
    words, topics = found

    events = get_event(combine_topics(model, dataframe, words, topics), n)

    # Either bound may be given on its own; an unset bound (False) becomes
    # None, i.e. an open end of the slice.
    if start_date or end_date:
        events = events.loc[(start_date or None):(end_date or None)]

    if summarize:
        events = summerize_article(events)

    # explicit encoding: headlines and summaries contain non-ASCII characters
    with open(filename, "w", encoding="utf-8") as f:
        for article in tqdm(events.itertuples()):
            entry = f"Date: {article.Index} \nHeadline: {article.headline} \n"
            if summarize:
                entry += f"Summary: {article.summary} \n"
            # blank line between timeline entries
            f.write(entry + "\n")


## Example: BLM

In the first example we look at a timeline of the BLM events, starting with the death of George Floyd. We pass the article dataframe, the keyword of our topic, the output file, our Top2Vec model, and the start and end date. We also left the summary toggle at its default, so besides the headline and date we get a 3-sentence summary of every article.

In [281]:
create_timeline_from_keyword(clean_df, 'blm', 'blm_timeline.txt', model, start_date='2020-05-24', end_date='2020-12-31')

  from pandas import Panel
100%|██████████| 163/163 [07:17<00:00,  2.69s/it]
163it [00:00, 58679.22it/s]


In [285]:
create_timeline(clean_df, 296, 'blm_timeline2.txt', model, start_date='2020-05-24', end_date='2020-12-31')

  from pandas import Panel
100%|██████████| 133/133 [06:59<00:00,  3.16s/it]
133it [00:00, 82716.85it/s]


## Example: Coronavirus

The second example is about the Coronavirus. Here we added 'n=3' to the function input. This will make sure that only days where more than 3 articles are about the same topic are classified as an event.

In [321]:
create_timeline(clean_df, 347, 'covid_timeline.txt', model, start_date='2019-10-01', end_date='2020-12-31', n=3)

  from pandas import Panel
100%|██████████| 47/47 [02:21<00:00,  3.01s/it]
47it [00:00, 17060.35it/s]


## Example: F1

In our F1 example, we set the summarize to False, so we only get the date and headlines

In [301]:
create_timeline(clean_df, 633, 'f1.txt', model, summarize=False)

145it [00:00, 129124.01it/s]


## Example: Brexit

Finally, we did the same for Brexit

In [304]:
create_timeline(clean_df, 5, 'brexit.txt', model, summarize=False)

540it [00:00, 116132.09it/s]


As you can see there are many options to tweak this to fit to your liking. Thanks for reading!