In [1]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import re
import requests
import seaborn as sns

from db import create_connection
from db import (
    articles,
    services,
    urls
)

In [2]:
# Create the connection object (helper imported from `db`) that is handed to
# the article query in the next cell.
connection = create_connection()

# Get data

In [3]:
# Fetch the article records through db.articles. The result is passed to
# pd.DataFrame with nine column names in the next cell, so it is presumably a
# sequence of 9-field rows — TODO confirm against db.articles.
data = articles.get_articles(connection)

In [4]:
# Column labels applied, in order, to the fields of each fetched record.
ARTICLE_COLUMNS = [
    'publication_date',
    'author',
    'title',
    'url',
    'koronawirus_in_text',
    'koronawirus_in_title',
    'question_mark',
    'exclamation_mark',
    'all_words',
]

# Let long titles/URLs show up to 100 characters before being truncated.
pd.set_option('max_colwidth', 100)

df = pd.DataFrame(data, columns=ARTICLE_COLUMNS)

In [5]:
# Eyeball a random 0.2% of the rows. No random_state is set, so a different
# sample is shown on every run.
df.sample(frac = 0.002)

Unnamed: 0,publication_date,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
3187,NaT,,,https://www.se.pl/lublin/zagraja-dla-3-letniej-marysi-szlazak-dziewczynka-walczy-o-zdrowie-we-wl...,,,,,
5651,NaT,,,https://www.se.pl/lublin/zwloki-w-bystrzycy-cialo-znalazl-przechodzien-1-aa-Tkz1-BUGX-H8ZW.html,,,,,
5417,NaT,,,https://www.se.pl/lublin/smiertelny-wypadek-w-lukowie-nie-zyje-33-letni-kierowca-aa-qdAr-mRUg-1c...,,,,,
2366,NaT,,,https://www.se.pl/lublin/kuchenne-rewolucje-lublin-debowka-karczma-zapiecek-pyszny-zajazd-sezon-...,,,,,
93,2020-08-18 18:17:00,Marek Targoński,Biała Podlaska: SPALIŁ samochód swojej dziewczyny. Powód? Wyrzucili go z imprezy [ZDJĘCIA],https://lublin.se.pl/biala-podlaska-spalil-samochod-swojej-dziewczyny-powod-wyrzucili-go-z-impre...,0.0,0.0,1.0,0.0,238.0
565,2020-06-01 12:17:00,MTA,Koronawirus. Lubelskie: NOWE PRZYPADKI zakażeń. Blisko 500 od początku epidemii. Co dalej?,https://www.se.pl/lublin/koronawirus-lubelskie-nowe-przypadki-zakazen-blisko-500-od-poczatku-epi...,19.0,1.0,4.0,3.0,219.0
6969,NaT,,,https://www.se.pl/lublin/bezplatne-porady-ekspertow-w-lublinie-skorzystaj-z-wiedzy-urzednikow-te...,,,,,
6861,NaT,,,https://www.se.pl/lublin/trasy-rowerowe-po-roztoczu-szalenstwa-na-dwoch-kolkach-dla-fanow-zielen...,,,,,
8104,2020-07-14 06:00:00,Artykuł sponsorowany,KINO PLENEROWE na dachu VIVO! Lublin. „Taras pełen atrakcji” [HARMONOGRAM],https://lublin.se.pl/kino-plenerowe-na-dachu-vivo-lublin-taras-pelen-atrakcji-harmonogram-aa-rfS...,2.0,0.0,3.0,9.0,431.0
2511,NaT,,,https://www.se.pl/lublin/studencie-znajdz-prace-staz-lub-praktyki-zblizaja-sie-dni-kariery-2019-...,,,,,


# Clean data

### Initial data check, organization and cleaning

In [6]:
# Per-column non-null counts and dtypes; memory_usage='deep' asks pandas to
# measure object columns by their actual contents instead of estimating.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8549 entries, 0 to 8548
Data columns (total 9 columns):
publication_date        1651 non-null datetime64[ns]
author                  1651 non-null object
title                   1651 non-null object
url                     8549 non-null object
koronawirus_in_text     1651 non-null float64
koronawirus_in_title    1651 non-null float64
question_mark           1651 non-null float64
exclamation_mark        1651 non-null float64
all_words               1651 non-null float64
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 2.6 MB


In [7]:
# Number of distinct non-null values in each column.
df.nunique()

publication_date        1641
author                   126
title                   1648
url                     8549
koronawirus_in_text       35
koronawirus_in_title       2
question_mark             18
exclamation_mark          15
all_words                383
dtype: int64

In [8]:
# Drop every row that has a missing value in any column, mutating df in place.
# In the recorded run this keeps 1651 of the 8549 rows.
df.dropna(inplace=True)
# (rows, columns) remaining after the drop.
df.shape

(1651, 9)

In [9]:
# Numeric columns that were reported as float64 by the earlier df.info() call;
# with the incomplete rows gone they can be stored as integers.
integer_columns = [
    'koronawirus_in_text',
    'koronawirus_in_title',
    'question_mark',
    'exclamation_mark',
    'all_words',
]

# Author bylines repeat heavily, so store them as a categorical.
df['author'] = df['author'].astype('category')

for integer_column in integer_columns:
    df[integer_column] = df[integer_column].astype('int')

# Re-check dtypes and deep memory usage after the casts.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 0 to 8548
Data columns (total 9 columns):
publication_date        1651 non-null datetime64[ns]
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 765.4 KB


In [10]:
# Non-null cell count per column (sanity check after the dropna/astype steps).
df.count()

publication_date        1651
author                  1651
title                   1651
url                     1651
koronawirus_in_text     1651
koronawirus_in_title    1651
question_mark           1651
exclamation_mark        1651
all_words               1651
dtype: int64

### column: publication_day

In [11]:
# Publication day rendered as a 'YYYY-MM-DD' string.
publication_day_values = df['publication_date'].dt.strftime('%Y-%m-%d')

# DataFrame.insert raises ValueError when the column already exists, so a plain
# insert makes this cell fail on a second run in the same kernel. Insert the
# column at position 1 the first time and just refresh its values afterwards,
# which keeps the cell safe to re-run.
if 'publication_day' in df.columns:
    df['publication_day'] = publication_day_values
else:
    df.insert(loc=1, column='publication_day', value=publication_day_values)

# Spot-check a random 0.2% of the rows (unseeded, differs per run).
df.sample(frac = 0.002)

Unnamed: 0,publication_date,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
786,2020-03-19 10:06:00,2020-03-19,mt,Koronawirus. Lubelskie: TRZY NOWE przypadki zachorowań! W całym kraju ponad 300 [19.03.20],https://www.se.pl/lublin/koronawirus-lubelskie-trzy-nowe-przypadki-zachorowan-w-calym-kraju-pona...,18,1,2,1,221
51,2020-08-25 12:05:00,2020-08-25,Karolina Januszek,Wielkie cięcia w budżetówce. Co piąty urzędnik może stracić pracę?,https://lublin.se.pl/wielkie-ciecia-w-budzetowce-co-piaty-urzednik-moze-stracic-prace-ak-adR7-dL...,2,0,1,0,181
8112,2020-07-12 12:54:00,2020-07-12,Magdalena Jaśkiewicz-Stawowczyk,"Lublin: Jak przebiega głosowanie, są kolejki do lokali? Kiedy głosować, żeby nie czekać? [WYBORY...",https://lublin.se.pl/lublin-jak-przebiega-glosowanie-sa-kolejki-do-lokali-kiedy-glosowac-zeby-ni...,0,0,0,2,578


In [12]:
# The five days with the most articles (value_counts sorts by frequency,
# most frequent first).
df.publication_day.value_counts().head()

2020-06-29    28
2020-07-08    26
2020-07-01    25
2020-07-03    22
2020-08-25    22
Name: publication_day, dtype: int64

In [13]:
# Number of distinct publication days present in the data.
df['publication_day'].nunique()

233

In [14]:
# Category conversion of publication_day is left disabled.
# NOTE(review): the reason is not recorded here — confirm before re-enabling.
#df['publication_day'] = df['publication_day'].astype('category')
# Re-check dtypes and deep memory usage now that publication_day exists.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 0 to 8548
Data columns (total 10 columns):
publication_date        1651 non-null datetime64[ns]
publication_day         1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 873.4 KB


### column: publication_month

In [15]:
# Publication month rendered as a 'YYYY-MM' string.
publication_month_values = df['publication_date'].dt.strftime('%Y-%m')

# DataFrame.insert raises ValueError when the column already exists, so a plain
# insert makes this cell fail on a second run in the same kernel. Insert the
# column at position 1 the first time and just refresh its values afterwards,
# which keeps the cell safe to re-run.
if 'publication_month' in df.columns:
    df['publication_month'] = publication_month_values
else:
    df.insert(loc=1, column='publication_month', value=publication_month_values)

# Spot-check a random 0.1% of the rows (unseeded, differs per run).
df.sample(frac = 0.001)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
8085,2020-07-16 15:19:00,2020-07,2020-07-16,AC,"Wyniki Lotto. Sprawdź wyniki Lotto z 16.07.2020 godz. 14 (Aktualne wygrane, wyniki losowania Mul...",https://lublin.se.pl/wyniki-lotto-sprawdz-wyniki-lotto-z-16-07-2020-godz-14-aktualne-wygrane-wyn...,0,0,1,1,180
8039,2020-07-26 13:22:00,2020-07,2020-07-26,Mariusz Mucha,Czarcia Łapa w fontannie. W Lublinie mają nowy pokaz na Placu Litewskim [ZDJĘCIA],https://lublin.se.pl/czarcia-lapa-w-fontannie-w-lublinie-maja-nowy-pokaz-aa-u1DC-KCaD-oAeD.html,0,0,0,1,205


In [16]:
# Articles per month, most frequent first (at most 8 rows shown).
df.publication_month.value_counts().head(8)

2020-07    439
2020-08    301
2020-06    214
2020-01    154
2020-05    153
2020-03    147
2020-04    138
2020-02    105
Name: publication_month, dtype: int64

In [17]:
# Number of distinct publication months present in the data.
df['publication_month'].nunique()

8

In [18]:
# Category conversion of publication_month is left disabled.
# NOTE(review): the reason is not recorded here — confirm before re-enabling.
#df['publication_month'] = df['publication_month'].astype('category')
# Re-check dtypes and deep memory usage now that publication_month exists.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 0 to 8548
Data columns (total 11 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
publication_day         1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(4)
memory usage: 976.6 KB


### column: author

In [19]:
# The 60 most frequent author bylines. The listing shows several spellings or
# initials that look like the same person (e.g. 'Mucha' / 'Mariusz Mucha'),
# which the following cells start to merge.
df['author'].value_counts()[:60]

mt                                 335
Marek Targoński                    183
Mucha                              147
Karolina Januszek                  131
Mariusz Mucha                      111
MTA                                 80
Mucha, mta                          55
AP                                  36
KM                                  33
OM                                  32
Bartłomiej Ważny                    23
Monika Kowalewicz                   22
AC                                  21
Michał Michalak                     19
EIB                                 19
Emilia Białecka                     16
MK                                  16
Mateusz Kasiak                      16
Agnieszka Niećko                    15
Jacek Werner                        13
gk                                  13
Olka Mazur                          13
Artykuł sponsorowany                13
Sylwia Sitka-Czerniak               13
Grzegorz Kluczyński                 13
maal                     

In [20]:
# Lowercase every byline so case variants compare equal. The .str accessor
# returns a non-categorical Series, which is why author is re-cast to
# category later in the notebook.
df['author'] = df['author'].str.lower()

# Matches any byline that contains "mt" anywhere in it.
regex_pattern = re.compile(r'.*mt.*', re.I)
contains_mt = df['author'].str.contains(regex_pattern)

# Per-column count of the rows whose byline matches.
df[contains_mt].count()

publication_date        514
publication_month       514
publication_day         514
author                  514
title                   514
url                     514
koronawirus_in_text     514
koronawirus_in_title    514
question_mark           514
exclamation_mark        514
all_words               514
dtype: int64

In [21]:
# Canonical (lowercase) byline that the aliases below are merged into.
canonical_author = 'marek targoński'

# Exact-match aliases. The author column is lowercased in the previous cell, so
# every alias has to be lowercase as well: a mixed-case
# 'mt; wideo: Tygodnik Zamojski' can never equal a value in the column.
# NOTE(review): 'gał' is assumed to be an alias of the same author — confirm.
df['author'] = df['author'].replace(['mt', 'mt; wideo: tygodnik zamojski', 'gał'], canonical_author)
# df['author'] = df['author'].replace(['Mucha', 'mucha'],'Mariusz Mucha')
# df['author'] = df['author'].replace(['ŁT'],'Łukasz Trybulski')
# df['author'] = df['author'].replace(['Mateusz Kasiak (Radio Eska)'],'Mateusz Kasiak')
# df['author'] = df['author'].replace(['KaJa'],'Karolina Januszek')
# df['author'] = df['author'].replace(['RS'],'Rafał Strzelec')
# df['author'] = df['author'].replace(['gk'],'Grzegorz Kluczyński')
# df['author'] = df['author'].replace([''],'Nieznany')

# regex_pattern = re.compile(r'(.*,.*|.*/.*|Redakcja ESKA INFO|Nieznany|Akcja partnerska)', re.I)
# df['author'] = df['author'].replace(regex_pattern,'Więcej autorów lub nieznany')

# Catch-all: any byline that still contains "mt" is mapped to the same author
# (a compiled pattern is treated as a regex by Series.replace).
# NOTE(review): the pattern matches "mt" anywhere, so joint bylines such as
# 'mucha, mta' (55 rows in the earlier value_counts output) end up attributed
# solely to this author — confirm that this is intended.
regex_pattern = re.compile(r'.*mt.*', re.I)
df['author'] = df['author'].replace(regex_pattern, canonical_author)

# The 50 most frequent bylines after merging.
df['author'].value_counts()[:50]

marek targoński                        699
mucha                                  148
karolina januszek                      131
mariusz mucha                          111
ap                                      36
km                                      33
om                                      32
bartłomiej ważny                        23
monika kowalewicz                       22
ac                                      21
michał michalak                         19
eib                                     19
maal                                    17
emilia białecka                         16
mateusz kasiak                          16
mk                                      16
agnieszka niećko                        15
gk                                      13
grzegorz kluczyński                     13
jacek werner                            13
olka mazur                              13
sylwia sitka-czerniak                   13
artykuł sponsorowany                    13
kaja       

In [22]:
# Store author as a categorical again; the string clean-up in the preceding
# cells reassigned the column from .str/.replace results.
df['author'] = df['author'].astype('category')
# Re-check dtypes and deep memory usage after the author clean-up.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 0 to 8548
Data columns (total 11 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
publication_day         1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(4)
memory usage: 971.2 KB


### column: koronawirus_in_text

In [23]:
# df.loc[[919, 1979, 1965], 'koronawirus_in_text'] = 0

### column: koronawirus_anywhere (sum of text and title)

In [24]:
# Sum of the koronawirus_in_text and koronawirus_in_title columns, per article.
# No zero pre-initialisation is needed: the assignment creates the column.
df['koronawirus_anywhere'] = df['koronawirus_in_text'] + df['koronawirus_in_title']
# Disabled alternative that would collapse the sum into a 0/1 flag:
# df.loc[df.koronawirus_anywhere > 0, 'koronawirus_anywhere'] = 1
df.head(1)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere
0,2020-08-31 14:40:00,2020-08,2020-08-31,karolina januszek,Wielkie promocje dla rodzin w Lidlu,https://lublin.se.pl/wielkie-promocje-dla-rodzin-w-lidlu-ak-ACGH-uaXC-Bt3y.html,0,0,0,2,249,0


### column: koronawirus_anywhere_count

In [25]:
# 0/1 flag per article: 1 when koronawirus_in_text + koronawirus_in_title is
# non-zero, 0 otherwise.
# The flag is computed with an explicit comparison instead of dividing the sum
# by itself and patching the 0 / 0 -> NaN results with fillna(0); the values
# and the integer dtype are the same, without depending on NaN arithmetic.
koronawirus_total = df['koronawirus_in_text'] + df['koronawirus_in_title']
df['koronawirus_anywhere_count'] = koronawirus_total.ne(0).astype('int')
df.head(3)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere,koronawirus_anywhere_count
0,2020-08-31 14:40:00,2020-08,2020-08-31,karolina januszek,Wielkie promocje dla rodzin w Lidlu,https://lublin.se.pl/wielkie-promocje-dla-rodzin-w-lidlu-ak-ACGH-uaXC-Bt3y.html,0,0,0,2,249,0,0
1,2020-08-31 11:31:00,2020-08,2020-08-31,karolina januszek,Polacy tracą pracę! Najgorsze dopiero nadejdzie,https://lublin.se.pl/polacy-traca-prace-najgorsze-dopiero-nadejdzie-ak-vtoS-p5BX-TkBx.html,3,0,0,0,217,3,1
2,2020-08-31 10:49:00,2020-08,2020-08-31,mariusz mucha,Na kradzionym rowerze przyjechał zameldować się na komendzie,https://lublin.se.pl/na-kradzionym-rowerze-przyjechal-zameldowac-sie-na-komendzie-aa-pdqC-pyhi-e...,0,0,0,0,259,0,0


### Export data

In [26]:
# Write the cleaned frame to se_lublin.csv (path relative to the working
# directory). The row index is written as the first column, as index=False is
# not passed.
df.to_csv('se_lublin.csv')