In [92]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import re
import requests
import seaborn as sns

from db import create_connection
from db import (
    articles,
    services,
    urls
)

In [93]:
# Open the database connection through the project's `create_connection` helper.
# NOTE(review): no matching close/cleanup is visible in this section — confirm the
# connection is released elsewhere.
connection = create_connection()

# Get data

In [94]:
# Fetch the article data over the open connection; it is wrapped in a DataFrame below.
data = articles.get_articles(connection)

In [95]:
# Let long text columns (titles, URLs) show up to 100 characters before truncation.
pd.set_option('max_colwidth', 100)

# Wrap the fetched article data in a DataFrame with explicit column labels.
df = pd.DataFrame(
    data,
    columns=[
        'publication_date',
        'author',
        'title',
        'url',
        'koronawirus_in_text',
        'koronawirus_in_title',
        'question_mark',
        'exclamation_mark',
        'all_words',
    ],
)

In [96]:
# Preview a random 0.2% of the rows as a quick sanity check of the loaded data.
df.sample(frac = 0.002)

Unnamed: 0,publication_date,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
947,NaT,,,https://www.se.pl/lublin/stroza-kolonia-smiertelne-potracenie-rowerzysty-sa-utrudnienia-w-ruchu-...,,,,,
850,2020-01-22 10:15:00,mt,Lubelskie: GIGANTYCZNE OPÓŹNIENIA na kolei. Uszkodzona trakcja [AKTUALIZACJA],https://www.se.pl/lublin/lubelskie-gigantyczne-opoznienia-na-kolei-dramatyczna-sytuacja-opoznien...,0.0,0.0,0.0,4.0,242.0
1716,2020-02-18 12:20:00,mt,SILNY WIATR na Lubelszczyźnie! IMGW ostrzega przed WICHURAMI [PROGNOZA POGODY],https://www.se.pl/lublin/silny-wiatr-na-lubelszczyznie-imgw-ostrzega-przed-wichurami-prognoza-po...,0.0,0.0,0.0,3.0,141.0
92,2020-07-31 20:58:00,Mariusz Mucha,Pijany szalał meleksem po Puławach. Spieszył się na...rosół,https://lublin.se.pl/pijany-szalal-meleksem-po-pulawach-spieszyl-sie-na-rosol-aa-tAgy-csyQ-UeWY....,0.0,0.0,0.0,1.0,210.0


# Clean data

### Initial data check, organization and cleaning

In [97]:
# Column dtypes, non-null counts and the deep (object-inclusive) memory footprint.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 9 columns):
publication_date        1507 non-null datetime64[ns]
author                  1507 non-null object
title                   1507 non-null object
url                     1800 non-null object
koronawirus_in_text     1507 non-null float64
koronawirus_in_title    1507 non-null float64
question_mark           1507 non-null float64
exclamation_mark        1507 non-null float64
all_words               1507 non-null float64
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 857.5 KB


In [98]:
# Number of distinct values in each column.
df.nunique()

publication_date        1497
author                   124
title                   1504
url                     1800
koronawirus_in_text       35
koronawirus_in_title       2
question_mark             17
exclamation_mark          15
all_words                396
dtype: int64

In [99]:
# Drop every row that has at least one missing value (modifies `df` in place),
# then show the remaining (rows, columns).
df.dropna(inplace=True)
df.shape

(1507, 9)

In [100]:
# Store the author labels as a category dtype.
df['author'] = df['author'].astype('category')

# Cast each counter column to a plain integer dtype.
for counter_column in ('koronawirus_in_text',
                       'koronawirus_in_title',
                       'question_mark',
                       'exclamation_mark',
                       'all_words'):
    df[counter_column] = df[counter_column].astype('int')

# Re-check dtypes and memory usage after the conversions.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1799
Data columns (total 9 columns):
publication_date        1507 non-null datetime64[ns]
author                  1507 non-null category
title                   1507 non-null object
url                     1507 non-null object
koronawirus_in_text     1507 non-null int64
koronawirus_in_title    1507 non-null int64
question_mark           1507 non-null int64
exclamation_mark        1507 non-null int64
all_words               1507 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 701.1 KB


In [101]:
# Non-null count per column.
df.count()

publication_date        1507
author                  1507
title                   1507
url                     1507
koronawirus_in_text     1507
koronawirus_in_title    1507
question_mark           1507
exclamation_mark        1507
all_words               1507
dtype: int64

### column: publication_day

In [102]:
# Derive the calendar day ('YYYY-MM-DD' string) from each publication timestamp and
# place it directly after `publication_date`.
# `DataFrame.insert` raises ValueError when the column already exists, so the guard
# keeps this cell safe to re-run without restarting the kernel.
if 'publication_day' not in df.columns:
    df.insert(loc=1, column='publication_day', value=df['publication_date'].dt.strftime('%Y-%m-%d'))
df.sample(frac = 0.002)

Unnamed: 0,publication_date,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
743,2020-03-09 11:38:00,2020-03-09,mt,Słaba płeć? Ależ skąd! PIĘKNE I SILNE. Oto lubelskie policjantki! [ZDJĘCIA],https://www.se.pl/lublin/slaba-plec-alez-skad-piekne-i-silne-oto-lubelskie-policjantki-zdjecia-a...,0,0,0,2,274
602,2020-04-23 10:20:00,2020-04-23,AP,Ministerstwo Zdrowia: nowe przypadki koronawirusa COVID-19 w Lubelskim,https://www.se.pl/lublin/ministerstwo-zdrowia-nowe-przypadki-koronawirusa-covid-19-w-lubelskim-a...,12,1,0,0,169
260,2020-07-10 10:59:00,2020-07-10,MaAl,Gospodarka neutralan dla klimatu: KE przedstawia nowy system energetyczny,https://lublin.se.pl/gospodarka-neutralan-dla-klimatu-ke-przedstawia-nowy-system-energetyczny-ak...,1,0,0,0,297


In [103]:
# Five days with the highest number of published articles.
df['publication_day'].value_counts().head()

2020-06-29    28
2020-07-08    26
2020-07-01    25
2020-07-03    22
2020-07-14    21
Name: publication_day, dtype: int64

In [104]:
# Number of distinct publication days.
df['publication_day'].nunique()

221

In [105]:
# Conversion of `publication_day` to a category dtype is currently disabled:
#df['publication_day'] = df['publication_day'].astype('category')
# Re-check dtypes and memory usage now that the column has been added.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1799
Data columns (total 10 columns):
publication_date        1507 non-null datetime64[ns]
publication_day         1507 non-null object
author                  1507 non-null category
title                   1507 non-null object
url                     1507 non-null object
koronawirus_in_text     1507 non-null int64
koronawirus_in_title    1507 non-null int64
question_mark           1507 non-null int64
exclamation_mark        1507 non-null int64
all_words               1507 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 799.7 KB


### column: publication_month

In [106]:
# Derive the calendar month ('YYYY-MM' string) from each publication timestamp and
# place it directly after `publication_date`.
# `DataFrame.insert` raises ValueError when the column already exists, so the guard
# keeps this cell safe to re-run without restarting the kernel.
if 'publication_month' not in df.columns:
    df.insert(loc=1, column='publication_month', value=df['publication_date'].dt.strftime('%Y-%m'))
df.sample(frac = 0.001)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
374,2020-06-29 14:50:00,2020-06,2020-06-29,Mateusz Kasiak,Zmiany w lubelskiej komunikacji miejskiej. Więcej miejsca dla pasażerów [AUDIO],https://lublin.se.pl/zmiany-w-lubelskiej-komunikacji-miejskiej-wiecej-miejsca-dla-pasazerow-audi...,2,0,0,0,290
447,2020-06-12 10:48:00,2020-06,2020-06-12,Mucha,Dęblin: NAJSTARSZA uciekinierka w Polsce? Po KOLIZJI zwiała rowerem,https://lublin.se.pl/deblin-to-najstarsza-uciekinierka-w-polsce-po-wypadku-zwiala-rowerem-aa-w3A...,0,0,0,4,213


In [107]:
# Five months with the highest number of published articles.
df['publication_month'].value_counts().head()

2020-07    444
2020-06    214
2020-01    154
2020-05    153
2020-08    152
Name: publication_month, dtype: int64

In [108]:
# Number of distinct publication months.
df['publication_month'].nunique()

8

In [109]:
# Conversion of `publication_month` to a category dtype is currently disabled:
#df['publication_month'] = df['publication_month'].astype('category')
# Re-check dtypes and memory usage now that the column has been added.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1799
Data columns (total 11 columns):
publication_date        1507 non-null datetime64[ns]
publication_month       1507 non-null object
publication_day         1507 non-null object
author                  1507 non-null category
title                   1507 non-null object
url                     1507 non-null object
koronawirus_in_text     1507 non-null int64
koronawirus_in_title    1507 non-null int64
question_mark           1507 non-null int64
exclamation_mark        1507 non-null int64
all_words               1507 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(4)
memory usage: 893.8 KB


### column: author

In [110]:
# The 60 most frequent raw author labels, to spot spelling variants of the same person.
df['author'].value_counts().head(60)

mt                                     335
Mucha                                  147
Marek Targoński                        145
Karolina Januszek                       94
MTA                                     80
Mariusz Mucha                           80
Mucha, mta                              55
AP                                      36
OM                                      32
KM                                      27
Bartłomiej Ważny                        23
Monika Kowalewicz                       22
AC                                      20
Michał Michalak                         17
MK                                      16
Mateusz Kasiak                          16
Agnieszka Niećko                        15
gk                                      15
Jacek Werner                            14
Olka Mazur                              13
Artykuł sponsorowany                    13
Sylwia Sitka-Czerniak                   13
EIB                                     13
Emilia Biał

In [111]:
# Pattern for any author string that contains "mt" (case-insensitive).
regex_pattern = re.compile(r'.*mt.*', re.I)

# Lower-case the labels so casing variants of the same author compare equal.
df['author'] = df['author'].str.lower()

# Per-column count of the rows whose author matches the pattern.
df.loc[df['author'].str.contains(regex_pattern)].count()

publication_date        514
publication_month       514
publication_day         514
author                  514
title                   514
url                     514
koronawirus_in_text     514
koronawirus_in_title    514
question_mark           514
exclamation_mark        514
all_words               514
dtype: int64

In [112]:
# Fold exact-match aliases into the canonical 'marek targoński' label.
# NOTE(review): 'mt; wideo: Tygodnik Zamojski' contains capitals, so it can only match
# while `author` has not been lower-cased — confirm against the preceding cell.
df['author'] = df['author'].replace(['mt', 'mt; wideo: Tygodnik Zamojski', 'gał'],'marek targoński')
# Further alias mappings, currently disabled:
# df['author'] = df['author'].replace(['Mucha', 'mucha'],'Mariusz Mucha')
# df['author'] = df['author'].replace(['ŁT'],'Łukasz Trybulski')
# df['author'] = df['author'].replace(['Mateusz Kasiak (Radio Eska)'],'Mateusz Kasiak')
# df['author'] = df['author'].replace(['KaJa'],'Karolina Januszek')
# df['author'] = df['author'].replace(['RS'],'Rafał Strzelec')
# df['author'] = df['author'].replace(['gk'],'Grzegorz Kluczyński')
# df['author'] = df['author'].replace([''],'Nieznany')

# Disabled: collapse multi-author / unknown bylines into a single bucket.
# regex_pattern = re.compile(r'(.*,.*|.*/.*|Redakcja ESKA INFO|Nieznany|Akcja partnerska)', re.I)
# df['author'] = df['author'].replace(regex_pattern,'Więcej autorów lub nieznany')

# Map every author string that contains "mt" (case-insensitive) to 'marek targoński'.
# NOTE(review): the pattern matches "mt" anywhere in the string, so combined bylines
# that merely include it are relabelled too — confirm this is intended.
regex_pattern = re.compile(r'.*mt.*', re.I)
df['author'] = df['author'].replace(regex_pattern,'marek targoński')

# The 50 most frequent author labels after the replacements.
df['author'].value_counts()[:50]

marek targoński                        661
mucha                                  148
karolina januszek                       94
mariusz mucha                           80
ap                                      36
om                                      32
km                                      27
bartłomiej ważny                        23
monika kowalewicz                       22
ac                                      20
maal                                    17
michał michalak                         17
mateusz kasiak                          16
mk                                      16
gk                                      15
agnieszka niećko                        15
jacek werner                            14
artykuł sponsorowany                    13
olka mazur                              13
eib                                     13
sylwia sitka-czerniak                   13
emilia białecka                         11
rs                                      10
kaja       

In [113]:
# Store the author labels as a category dtype, then re-check dtypes and memory usage.
df['author'] = df['author'].astype('category')
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1799
Data columns (total 11 columns):
publication_date        1507 non-null datetime64[ns]
publication_month       1507 non-null object
publication_day         1507 non-null object
author                  1507 non-null category
title                   1507 non-null object
url                     1507 non-null object
koronawirus_in_text     1507 non-null int64
koronawirus_in_title    1507 non-null int64
question_mark           1507 non-null int64
exclamation_mark        1507 non-null int64
all_words               1507 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(4)
memory usage: 888.4 KB


### column: koronawirus_in_text

In [114]:
# Disabled manual override that would set `koronawirus_in_text` to 0 for three rows.
# NOTE(review): labels 1979 and 1965 lie outside the 0–1799 index range reported by
# df.info() earlier in this notebook — verify the labels before re-enabling.
# df.loc[[919, 1979, 1965], 'koronawirus_in_text'] = 0

### column: koronawirus_anywhere (sum of text and title counts)

In [115]:
# Per-article sum of the `koronawirus_in_text` and `koronawirus_in_title` counters.
# No zero-initialisation is needed: the assignment below creates the column.
df['koronawirus_anywhere'] = df['koronawirus_in_text'] + df['koronawirus_in_title']
# Disabled alternative that would collapse the sum to a 0/1 flag:
# df.loc[df.koronawirus_anywhere > 0, 'koronawirus_anywhere'] = 1
df.head(1)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere
0,2020-04-15 10:42:00,2020-04,2020-04-15,marek targoński,Lubelskie: AŻ 25 NOWYCH ZAKAŻEŃ. Koronawirus atakuje z ogromną siłą! [NOWE DANE],https://www.se.pl/lublin/lubelskie-az-25-nowych-zakazen-koronawirus-atakuje-z-ogromna-sila-nowe-...,13,1,0,2,160,14


### column: koronawirus_anywhere_count

In [116]:
# 0/1 flag per article: 1 when the combined text + title counter is non-zero, else 0.
# A direct comparison is used instead of dividing the sum by itself and filling the
# NaN produced by 0/0; the resulting values are the same, without the float detour.
df['koronawirus_anywhere_count'] = (
    (df['koronawirus_in_text'] + df['koronawirus_in_title']).ne(0).astype('int')
)
df.head(3)

Unnamed: 0,publication_date,publication_month,publication_day,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere,koronawirus_anywhere_count
0,2020-04-15 10:42:00,2020-04,2020-04-15,marek targoński,Lubelskie: AŻ 25 NOWYCH ZAKAŻEŃ. Koronawirus atakuje z ogromną siłą! [NOWE DANE],https://www.se.pl/lublin/lubelskie-az-25-nowych-zakazen-koronawirus-atakuje-z-ogromna-sila-nowe-...,13,1,0,2,160,14,1
1,2020-08-19 21:41:00,2020-08,2020-08-19,mariusz mucha,Tragedia pod Parczewem. Piotrek w SZALE zatłukł cegłą nie tego brata,https://lublin.se.pl/tragedia-pod-parczewem-zatlukl-cegla-nie-tego-brata-aa-pG2R-Mdb2-2su2.html,0,0,1,0,340,0,0
2,2020-08-19 19:31:00,2020-08,2020-08-19,karolina januszek,"Wygraj 200 złotych od Biedronki. Zobacz, jak to zrobić",https://lublin.se.pl/wygraj-200-zlotych-od-biedronki-zobacz-jak-to-zrobic-ak-XEXJ-32sC-qWAi.html,0,0,0,1,349,0,0


### Export data

In [119]:
# Write the cleaned frame to 'se_lublin.csv' (relative path).
# NOTE(review): with pandas' default settings the row index is written as an extra
# unnamed first column — confirm downstream readers expect it.
df.to_csv('se_lublin.csv')