In [1]:
import pandas as pd
import numpy as np
import re as re
import transliterate
from transliterate import translit, get_available_language_codes


In [3]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / size)     
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

## Sputnik 

In [121]:
sputnik = pd.read_csv("sputnik.csv", sep = "\t")

In [122]:
titles = sputnik['title'].tolist()
texts = sputnik['text'].tolist()
lead_texts = sputnik['lead_text'].tolist()
categories = sputnik['category'].tolist()

## Edit and take cyrillic characters to Latin

In [123]:
titles = [re.search('name">(.+?)</h1>', title).group(1) for title in titles]
lead_texts = [re.sub('[<p>|</p>]', "", lead_text) for lead_text in lead_texts]
texts = [''.join(re.findall('p>(.+?)</p', text)) for text in texts]
texts = [re.sub(r"\xa0|<p>|\u200b",  " ", text) for text in texts]
categories = [re.search('>(.+?)</', category).group(1) for category in categories]
whole_text = [m+n for m,n in zip(lead_texts,texts)]
whole_text = [re.sub(".]", ".", text) for text in whole_text]
whole_text = [whole[1:] for whole in whole_text]
whole_text = [re.sub('\\.,', ".", text) for text in whole_text]
sputnik['date'] = pd.to_datetime(sputnik['date'], format='%Y-%m-%d')
whole_text = [translit(text, 'sr', reversed = True) for text in whole_text]
titles = [translit(text, 'sr', reversed = True) for text in titles]
categories = [translit(text, 'sr', reversed = True) for text in categories]

In [18]:
sputnik_clean = pd.DataFrame({
    'title' : titles,
    'text' : whole_text,
    'category' : categories
})
sputnik_clean['date'] = sputnik['date']
sputnik_clean['source'] = 'Sputnik-RU'
sputnik_clean.sort_values(by=["date"], inplace= True)
start_date = '2018-11-01'
end_date = '2019-02-28'
mask = (sputnik_clean['date'] > start_date) & (sputnik_clean['date'] <= end_date)
sputnik_clean = sputnik_clean.loc[mask]

We check that all data processing went as expected. 

In [19]:
sputnik_clean['category'].value_counts()

Politika                 1146
Svet                      732
Kultura                   129
Ekonomija                  98
Naoružanje                 34
Rusija                     12
Evropa                      2
Komentari i Analitika       1
Region                      1
Analize i mišljenja         1
Intervjui                   1
Name: category, dtype: int64

In [20]:
sputnik_clean.isna().sum()

title       0
text        0
category    0
date        0
source      0
dtype: int64

In [22]:
sputnik_clean.nunique()

title       2157
text        2157
category      11
date        2150
source         1
dtype: int64

In [21]:
len(sputnik_clean)

2157

For now we keep everything until we see the rest of the data. I need to translate the categories. 

In [22]:
sputnik_clean.to_csv('sputnik_clean.csv', sep=',', encoding='utf-8')

## Vostok

In [124]:
vostok = pd.read_csv('vostok.csv', sep = '\t')

In [125]:
titles = vostok['title'].tolist()
texts_and_dates = vostok['texts_and_dates'].tolist()
categories = vostok['categories'].tolist()

In [126]:
titles_clean = []
for title in titles:
    try:
        title = re.search('heading">(.+?)</', title).group(1)
        titles_clean.append(title)
    except:
        titles_clean.append('no_title')
dates = [re.search('Објављено:(.+?).год.', date).group(1) for date in texts_and_dates]
dates = [date[2:] for date in dates]
texts = [re.sub('<.+?>', "", text) for text in texts_and_dates]
texts = [text.partition("idnovost=")[2] for text in texts]
texts = [re.sub('\\r|\\n|\\xa0|Фото дана|Читајте више на| „|, ,', "", text) for text in texts]
texts = [text[20:] for text in texts]
texts = [text[:-1] for text in texts]
texts = [re.sub('\\.,', ".", text) for text in texts]
categories = [re.search('\d+', category).group(0) for category in categories]
texts = [translit(text, 'sr', reversed = True) for text in texts]
titles_clean = [translit(text, 'sr', reversed = True) for text in titles_clean]

In [127]:
vostok_clean = pd.DataFrame({
    'title' : titles_clean,
    'text' : texts,
    'category' : categories,
    'date' : dates, 
})
vostok_clean['source'] = 'Vostok-RU'
vostok_clean = vostok_clean[vostok_clean.title != 'no_title']
vostok_clean['date'] = pd.to_datetime(vostok_clean['date'], format = '%d.%m.%Y')
vostok_clean.sort_values(by=["date"], inplace= True)
start_date = '2018-11-01'
end_date = '2019-02-28'
mask = (vostok_clean['date'] > start_date) & (vostok_clean['date'] <= end_date)
vostok_clean = vostok_clean.loc[mask]
def f(row):
    if row['category'] == '1':
        val = 'Rusija'
    elif row['category'] == '2':
        val = 'Politika'
    elif row['category'] == '7':
        val = 'Bezbednost'
    elif row['category'] == '6':
        val = 'Ekonomijа'
    elif row['category'] == '3':
        val = 'Društvo'
    elif row['category'] == '17':
        val = 'Region'
    elif row['category'] == '8':
        val = 'Analize'
    else: 
        val = 'Kultura'
    return val
vostok_clean['category'] = vostok_clean.apply(f, axis=1)

We get a sense of the category distribution and publishing by Vostok. 

In [128]:
vostok_clean['category'].value_counts()

Politika      726
Rusija        703
Bezbednost    556
Ekonomijа     216
Društvo       187
Region        154
Analize        69
Kultura        45
Name: category, dtype: int64

We know that some data collection did not work out as intended so we drop all rows where the text is shorter than 60 characters. This way we remove 256 articles. 

In [129]:
vostok_clean = vostok_clean[~(vostok_clean.text.str.len() < 60)]

In [130]:
vostok_clean['category'].value_counts()

Politika      649
Rusija        630
Bezbednost    504
Ekonomijа     188
Društvo       178
Region        150
Analize        63
Kultura        42
Name: category, dtype: int64

In [131]:
vostok_clean.isna().sum()

title       0
text        0
category    0
date        0
source      0
dtype: int64

In [132]:
len(vostok_clean)

2404

In [133]:
vostok_clean.nunique()

title       2317
text        2317
category       8
date         119
source         1
dtype: int64

In [134]:
print(any(vostok_clean.duplicated()))

False


No duplicate rows. That means articles are either re-used on different days or re-used across categories.

In [135]:
print(any(vostok_clean['text'].duplicated()))

True


Examining this data shows that duplicates arise from articles being used across several categories. So we will drop all rows where text and title are repeated. 

In [136]:
vostok_clean = vostok_clean.drop_duplicates('text')
vostok_clean = vostok_clean.drop_duplicates('title')

In [137]:
len(vostok_clean)

2313

In [138]:
vostok_clean.nunique()

title       2313
text        2313
category       8
date         119
source         1
dtype: int64

In [139]:
vostok_clean['category'].value_counts()

Politika      632
Rusija        609
Bezbednost    475
Ekonomijа     179
Društvo       175
Region        141
Analize        62
Kultura        40
Name: category, dtype: int64

In [140]:
vostok_clean.isna().sum()

title       0
text        0
category    0
date        0
source      0
dtype: int64

In [141]:
vostok_clean.head(10)

Unnamed: 0,title,text,category,date,source
3601,Beogradski dani slobode u Lugansku,"PovodomDana slobode“, kojim se od ove godine u...",Društvo,2018-11-02,Vostok-RU
3573,Peskov: Sankcije Ukrajini iznuđena recipročna ...,Uvodeći ograničenja Rusija je recipročno odgov...,Rusija,2018-11-02,Vostok-RU
729,„Rosatom“ pustio u rad reaktor prve plutajuće ...,"""Rosatom"" je uspešno pustio u rad reaktor prve...",Ekonomijа,2018-11-02,Vostok-RU
940,Mandiću i Kneževiću opet oduzimaju pasoše,Crnogorsko pravosuđe nastavilo je progon lider...,Region,2018-11-02,Vostok-RU
1289,U Hagu donete dve suprotne odluke: Ne zna se k...,Sudije u Hagu potpuno su se podelile u slučaju...,Društvo,2018-11-02,Vostok-RU
2480,„Zabrana ulaska u Crnu Goru Bećkoviću osim što...,"Zabrana ulaska u Crnu Goru Matiji Bećkoviću, Č...",Region,2018-11-02,Vostok-RU
2199,Putin: Danas se odlučuje kakav će biti svet u ...,Danas se odlučuje kakav će biti svet u naredni...,Rusija,2018-11-02,Vostok-RU
501,Konte se nada nastavku kontakata sa Putinom,Premijer Italije Đuzepe Konte kaže da se nada ...,Politika,2018-11-02,Vostok-RU
2363,Savčenko: Teroristi isporučili dva kontejnera ...,Teroristi su isporučili dva kontejnera sa hlor...,Bezbednost,2018-11-02,Vostok-RU
3532,Zaharova: Izjava Hanta o Rusiji oštra retorika...,Najnovija izjava britanskog ministra spoljnih ...,Rusija,2018-11-02,Vostok-RU


In [142]:
vostok_clean.to_csv('vostok_clean.csv', sep=',', encoding='utf-8')

## N1

In [143]:
n1 = pd.read_csv("n1.csv", sep = ",")

In [144]:
dates = n1['date'].tolist()

In [145]:
dates = [re.search('date">(.+?).<', date).group(1) for date in dates]
n1['date'] = pd.DataFrame(dates)
n1['date'] = pd.to_datetime(n1['date'])
n1.sort_values(by=["date"], inplace= True)
start_date = '2018.11.01'
end_date = '2019.02.28'
mask = (n1['date'] > start_date) & (n1['date'] <= end_date)
n1 = n1.loc[mask]

In [146]:
len(n1)

3765

In [147]:
titles = n1['title'].tolist()
texts = n1['text'].tolist()
categories = n1['category'].tolist()

titles_clean = []
for title in titles:
    try:
        title = re.search('title">(.+?)</', title).group(1)
        titles_clean.append(title)
    except:
        titles_clean.append('no_title')
categories_clean = []
for category in categories:
    try:
        category = re.search('.com/(.+?)">', category).group(1)
        categories_clean.append(category)
    except:
        categories_clean.append('no_category')
texts = [re.sub('\\r|\\n|<.*?>|<.*?>,|</.*?>,|</.*?>| ,|Tagovi:|\\\xa0', "", text) for text in texts]
texts = [re.sub('\\.,', ".", text) for text in texts]

date = n1['date'].tolist()

In [148]:
n1_clean = pd.DataFrame({
    'title' : titles_clean,
    'text' : texts,
    'category': categories_clean, 
    'date' : date
})
n1_clean = n1_clean[n1_clean.title != 'no_title']
n1_clean = n1_clean[n1_clean.category !='no_category']
n1_clean['source'] = 'N1-USA'
n1_clean.head(10)

Unnamed: 0,title,text,category,date,source
0,Đukanović i Tači o dijalogu Beograda i Prištine,[Predsednik Crne Gore Milo Đukanović razgovara...,Region,2018-11-09,N1-USA
1,Kolinda Grabar Kitarović udomila psa,[Povodom izmeštanja predsedničke kancelarije u...,Region,2018-11-10,N1-USA
2,Podignuta optužnica protiv Atifa Dudakovića,[Tužilaštvo Bosne i Hercegovine podiglo je opt...,Region,2018-11-10,N1-USA
3,Počelo suđenje gradonačelniku Zagreba i saradn...,[Pred zagrebačkim Županijskim sudom počelo je ...,Region,2018-11-10,N1-USA
4,Bivši pripadnik Vojske RS optužen za ratni zlo...,"[Zoran Adamović, nekadašnji pripadnik Vojske R...",Region,2018-11-10,N1-USA
5,Hrvatska poslanica šokirala Sabor: Vezali su m...,[Poslanica Ivana Ninčević Lesandrić je u vrlo ...,Region,2018-11-10,N1-USA
6,Vinarija iz BiH prodaje vino sa slikom Ante Pa...,[Jedna vinarija iz Širog Brijega u Hercegovini...,Region,2018-11-10,N1-USA
7,Na protestu protiv Komšića u Mostaru oko 10.00...,[Oko 10.000 Hrvata učestvovalo je na protestu ...,Region,2018-11-10,N1-USA
8,Pet godina zatvora komandantu Zvorničke brigad...,"[Ostoja Stanišić, komandant Šestog bataljona Z...",Region,2018-11-10,N1-USA
9,Vlasti Mjanmara spremne za povratak izbeglica ...,[Vlasti Mjanmara izrazile su spremnost za povr...,Svet,2018-11-11,N1-USA


In [149]:
n1_clean.isna().sum()

title       0
text        0
category    0
date        0
source      0
dtype: int64

In [151]:
len(n1_clean)

3764

In [152]:
n1_clean.nunique()

title       3763
text        3764
category       4
date          86
source         1
dtype: int64

In [153]:
n1_clean = n1_clean.drop_duplicates('title')
len(n1_clean)

3763

In [154]:
n1_clean.nunique()

title       3763
text        3763
category       4
date          86
source         1
dtype: int64

In [155]:
n1_clean['category'].value_counts()

Vesti      1655
Svet       1411
Region      569
Kultura     128
Name: category, dtype: int64

In [156]:
n1_clean.to_csv('n1_clean.csv', sep=',', encoding='utf-8')

## RFE

In [157]:
rfe = pd.read_csv('se.csv', sep = '\t')

In [158]:
rfe.head(10)

Unnamed: 0.1,Unnamed: 0,title,date,text
0,0,problem,"[<time datetime=""2019-01-03T17:50:00+01:00"">\r...",problem
1,1,"[<h1 class=""pg-title"">\r\nPočela Generalna sku...","[<time datetime=""2018-11-18T10:13:15+01:00"">\r...",[<p>Generalna skupština Interpola počela je u ...
2,2,"[<h1 class=""pg-title"">\r\nPompeo uveren da će ...","[<time datetime=""2019-01-12T15:52:35+01:00"">\r...",[<p>Američke vlasti uverene su u mogućnost da ...
3,3,"[<h1 class=""pg-title"">\r\nCIK potvrdio rezulta...","[<time datetime=""2018-11-06T17:41:20+01:00"">\r...",[<p>Centralna izborna komisija (CIK) BiH danas...
4,4,"[<h1 class=""pg-title"">\r\nMalinari iz Užica: T...","[<time datetime=""2018-12-21T12:20:05+01:00"">\r...",[<p>Predstavnici udruženja malinara iz užičkog...
5,5,"[<h1 class=""pg-title"">\r\nNove kosovske sudije...","[<time datetime=""2019-01-21T16:29:47+01:00"">\r...",[<p>Predsednik Kosova Hašim Tači potpisao je u...
6,6,"[<h1 class=""pg-title"">\r\nAmbasada SAD: Opozic...","[<time datetime=""2019-02-25T14:13:33+01:00"">\r...",[<p>Pitanje eventualnog izbornog i bojkota u S...
7,7,"[<h1 class=""pg-title"">\r\nProtest 'Jedan od pe...","[<time datetime=""2019-02-22T20:40:48+01:00"">\r...","[<p>Protesti pod nazivom ""Jedan od pet miliona..."
8,8,"[<h1 class=""pg-title"">\r\nMećave zahvatile Evr...","[<time datetime=""2019-01-08T20:46:50+01:00"">\r...",[<p>Hladan talas sa mećavama i danas pogađa Ev...
9,9,"[<h1 class=""pg-title"">\r\nIstraživanje: Evrops...","[<time datetime=""2018-12-10T20:56:41+01:00"">\r...",[<p>Mnogi Jevreji u Evropi kriju svoju religij...


In [159]:
titles = rfe['title'].tolist()
dates = rfe['date'].tolist()
texts = rfe['text'].tolist()

In [160]:
titles_clean = []
for title in titles:
    try:
        title = re.search('title">\\r\\n(.+?)\\r\\n</h1>', title).group(1)
        titles_clean.append(title)
    except:
        titles_clean.append('no_title')
titles_clean[:10]

dates_clean = []
for date in dates:
    try:
        date = re.search('datetime="(.+?)">', date).group(1)
        dates_clean.append(date)
    except:
        dates_clean.append('no_date')
dates_clean[:10]
texts = [re.sub('\\r|\\n|<.*?>|<.*?>,|</.*?>,|</.*?>', "", text) for text in texts]
texts = [re.sub('\\.,', ".", text) for text in texts]
texts = [text[1:] for text in texts]
texts = [text[:-1] for text in texts]

In [161]:
rfe = pd.DataFrame({
    'title' : titles_clean,
    'text' : texts,
    'date' : dates_clean, 
})
rfe = rfe[rfe.title != 'no_title']
rfe = rfe[rfe.date != 'no_date']
rfe['date'] = pd.to_datetime(rfe['date'], format = '%Y-%m-%d')
rfe.sort_values(by=["date"], inplace= True)
start_date = '2018-11-01'
end_date = '2019-02-28'
mask = (rfe['date'] > start_date) & (rfe['date'] <= end_date)
rfe = rfe.loc[mask]
rfe['source'] = 'RFE-USA'

In [162]:
len(rfe)

1695

In [163]:
rfe.isna().sum()

title     0
text      0
date      0
source    0
dtype: int64

In [165]:
rfe.nunique()

title     1695
text      1695
date      1695
source       1
dtype: int64

In [166]:
rfe.head(10)

Unnamed: 0,title,text,date,source
1313,Pančić: Društvo uglednih osuđenika,"Piše: Teofil Pančić, (Mišljenja izrečena u kom...",2018-11-01 08:42:26,RFE-USA
1773,Narodni ili partijski poslanik?,U Srbiji je u toku kampanja Građanske inicijat...,2018-11-01 12:15:46,RFE-USA
1506,Porošenko: Ruske sankcije kao vrsta 'nagrade',Ukrajinski predsjednik Petro Porošenko uspored...,2018-11-01 14:05:09,RFE-USA
1238,SAD: Ubojica iz sinagage se ne 'osjeća krivim',Vozač kamiona koji je ubio 11 ljudi u sinagogi...,2018-11-01 14:28:59,RFE-USA
71,Brnabić: Nema konkretnog plana oko Kosova,Premijerka Srbije Ana Brnabić izjavila je dana...,2018-11-01 14:53:20,RFE-USA
894,Dodik: Srbi neće raditi na štetu Bošnjaka i Hr...,Predsjednik Republike Srpske i novoizabrani sr...,2018-11-01 15:05:41,RFE-USA
1491,Vučić: Srbija će reagovati na crnogorsku zabranu,Predsjednik Srbije Aleksandar Vučić potvrdio j...,2018-11-01 15:27:00,RFE-USA
883,Šesti BUNT festival u Beogradu,Šesti muzički festival kamerne muzike BUNT (Be...,2018-11-01 15:29:13,RFE-USA
273,Vučićeva savetnica Suzana Vasiljević podnela o...,Savetnica za medije predsednika Srbije Suzana ...,2018-11-01 15:52:20,RFE-USA
1005,Sud u Skoplju blokirao imovinu VMRO DPMNE,Krivični sud u Skoplju potvrdio je da je privr...,2018-11-01 16:44:26,RFE-USA


In [90]:
rfe.to_csv('rfe_clean.csv', sep=',', encoding='utf-8')

## VOA

In [167]:
voa = pd.read_csv('voa.csv', sep = '\t')

In [168]:
voa.head(10)

Unnamed: 0.1,Unnamed: 0,title,date,text,category
0,0,"[<h1 class=""pg-title"">\r\nPosmatrači: Velike a...","[<time datetime=""2018-11-23T10:33:02+01:00"">\r...",[<p>Veliki vojni konvoj iz arapskih zemalja ra...,"<div class=""category"">\r\n<a class="""" href=""/z..."
1,1,"[<h1 class=""pg-title"">\r\nSrbiji solidna trojk...","[<time datetime=""2018-12-20T21:19:06+01:00"">\r...",[<p><strong>Pristup Srbije reformama u domenim...,"<div class=""category"">\r\n<a class="""" href=""/z..."
2,2,"[<h1 class=""pg-title"">\r\nEK: U Srbiji 42 odst...","[<time datetime=""2018-12-26T21:38:38+01:00"">\r...",[<p><strong>Poslednja anketa koju je sprovela ...,"<div class=""category"">\r\n<a class="""" href=""/z..."
3,3,"[<h1 class=""pg-title"">\r\nRešena misterija nes...","[<time datetime=""2017-07-06T19:35:26+01:00"">\r...","[<p>Na crnoj beloj fotografiji vidi se žena, l...","<div class=""category"">\r\n<a class="""" href=""/z..."
4,4,"[<h1 class=""pg-title"">\r\nIzbori u SAD 2018.\r...","[<time datetime=""2018-11-05T21:29:00+01:00"">\r...",problem,"<div class=""category"">\r\n<a href=""/z/2086"">Am..."
5,5,"[<h1 class=""pg-title"">\r\nMej traži od parlame...","[<time datetime=""2019-02-10T15:18:01+01:00"">\r...",[<p>Britanska vlada zatražila je u nedelju od ...,"<div class=""category"">\r\n<a class="""" href=""/z..."
6,6,"[<h1 class=""pg-title"">\r\nBeli božur šefu misi...","[<time datetime=""2019-01-18T18:04:29+01:00"">\r...",[<p><strong>Udruženje novinara Srbije na Kosov...,"<div class=""category"">\r\n<a class="""" href=""/z..."
7,7,"[<h1 class=""pg-title"">\r\nNovi ministar finans...","[<time datetime=""2018-05-10T04:44:33+01:00"">\r...",[<p><strong>Kako će biti realizovan i kako će ...,"<div class=""category"">\r\n<a class="""" href=""/z..."
8,8,"[<h1 class=""pg-title"">\r\nTramp: Obustaviti br...","[<time datetime=""2018-11-10T20:06:43+01:00"">\r...",[<p><strong>Brojanje glasova na Floridi nastav...,"<div class=""category"">\r\n<a class="""" href=""/z..."
9,9,"[<h1 class=""pg-title"">\r\nTužioci o Trampovoj ...","[<time datetime=""2018-12-08T18:22:08+01:00"">\r...",[<p>Tužioci iz Njujorka i specijalni tužilac R...,"<div class=""category"">\r\n<a class="""" href=""/z..."


In [169]:
titles = voa['title'].tolist()
dates = voa['date'].tolist()
texts = voa['text'].tolist()
categories = voa['category'].tolist()

In [170]:
titles_clean = []
for title in titles:
    try:
        title = re.search('title">\\r\\n(.+?)\\r\\n</h1>', title).group(1)
        titles_clean.append(title)
    except:
        titles_clean.append('no_title')
titles_clean[:10]

dates_clean = []
for date in dates:
    try:
        date = re.search('datetime="(.+?)">', date).group(1)
        dates_clean.append(date)
    except:
        dates_clean.append('no_date')
dates_clean[:10]

texts = [re.sub('\\r|\\n|<.*?>|<.*?>,|</.*?>,|</.*?>| ,|', "", text) for text in texts]
texts = [re.sub('\\.,', ".", text) for text in texts]
texts = [text[1:] for text in texts]
texts = [text[:-1] for text in texts]

categories_clean = []
for category in categories:
    try:
        category = re.search('href="/z/(.+?)</a>', category).group(1)
        categories_clean.append(category)
    except:
        categories_clean.append('no_category')
categories_clean = [category[6:] for category in categories_clean]

In [171]:
voa = pd.DataFrame({
    'title' : titles_clean,
    'text' : texts,
    'date' : dates_clean, 
    'category' : categories_clean,
})
voa = voa[voa.title != 'no_title']
voa = voa[voa.date != 'no_date']
voa['date'] = pd.to_datetime(voa['date'], format = '%Y-%m-%d')
voa.sort_values(by=["date"], inplace= True)
start_date = '2018-11-01'
end_date = '2019-02-28'
mask = (voa['date'] > start_date) & (voa['date'] <= end_date)
voa = voa.loc[mask]
voa['source'] = 'VOA-USA'

In [172]:
voa.isna().sum()

title       0
text        0
date        0
category    0
source      0
dtype: int64

In [173]:
voa['category'].value_counts()

Aktuelno                 543
Balkan                    57
Amerika                   42
Globalne teme             17
Društvo                    6
Kultura                    3
Nauka i tehnologija        2
Ekonomija i finansije      1
Intervju                   1
Name: category, dtype: int64

In [174]:
len(voa)

672

In [175]:
voa.nunique()

title       672
text        671
date        672
category      9
source        1
dtype: int64

In [176]:
voa = voa.drop_duplicates('text')
len(voa)

671

In [178]:
voa.isna().sum()

title       0
text        0
date        0
category    0
source      0
dtype: int64

In [179]:
voa['category'].value_counts()

Aktuelno                 543
Balkan                    57
Amerika                   41
Globalne teme             17
Društvo                    6
Kultura                    3
Nauka i tehnologija        2
Ekonomija i finansije      1
Intervju                   1
Name: category, dtype: int64

In [180]:
voa.head(10)

Unnamed: 0,title,text,date,category,source
509,"Bolton: Sankcije zamišljene da naštete Iranu, ...","Savetnik za nacionalnu bezbednost SAD, Džon Bo...",2018-11-01 10:45:36,Aktuelno,VOA-USA
942,Botovi lažnim vestima menjaju politički diskurs,Najveću pretnju po Sjedinjene Države predstavl...,2018-11-01 11:31:04,Aktuelno,VOA-USA
655,Radnici Gugla protestuju zbog zataškavanja sek...,Stotine zaposlenih u Guglu u Aziji izašlo je s...,2018-11-01 14:34:18,Aktuelno,VOA-USA
748,"Sloboda interneta u 2017: Kontrola, manipulaci...",Vlade širom svijeta pooštravaju kontrolu nad u...,2018-11-01 14:54:17,Aktuelno,VOA-USA
926,Pronađena crna kutija indonežanskog aviona,Ronioci u Indoneziji su pronašli jednu od crni...,2018-11-01 15:02:58,Aktuelno,VOA-USA
68,Optuženi za masovno ubistvo u sinagogi negira ...,"Robert Bauers, optužen za ubistvo 11 ljudi u s...",2018-11-01 16:20:27,Aktuelno,VOA-USA
579,Vlada Kosova planira kampanju za zapošljavanje...,Ministarstvo bezbednosnih snaga Kosova uskoro ...,2018-11-01 16:21:08,Balkan,VOA-USA
311,Tramp tvrdi da su demokrate pustile u zemlju o...,"Šest dana pred kongresne izbore, predsednik SA...",2018-11-01 18:19:24,Aktuelno,VOA-USA
954,"""Azil u SAD će moći da traže samo imigranti ko...",Predsednik Donald Tramp najavio je da njegova ...,2018-11-01 20:51:28,Aktuelno,VOA-USA
492,Pompeo: SAD bi uskoro mogle da imaju dovoljno ...,"Državni sekretar SAD, Majk Pompeo, izjavio je ...",2018-11-02 08:44:39,Aktuelno,VOA-USA


In [181]:
voa.to_csv('voa_clean.csv', sep=',', encoding='utf-8')