In [125]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import re
import requests
import seaborn as sns

from db import create_connection
from db import (
    articles,
    services,
    urls
)

In [126]:
# Resolve the folder that receives exported datasets, relative to the
# notebook's working directory. `os` is already imported in the first cell,
# so it is not imported a second time here.
current_path = os.path.abspath(os.getcwd())
datasets_directory = os.path.join(current_path, 'datasets')

# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the folder and creating it.
os.makedirs(datasets_directory, exist_ok=True)

In [127]:
# Obtain a connection object from the project-local db.create_connection helper;
# it is handed to articles.get_articles in the next cell.
connection = create_connection()

# Get data

In [128]:
# Fetch the article records through the project-local `articles` module.
# NOTE(review): presumably a sequence of 9-field rows, since the next cell
# labels it with nine column names — confirm against db.articles.
data = articles.get_articles(connection)

In [129]:
# Column labels for the fetched article records, in the order they are
# applied to the data.
article_columns = [
    'publication_date',
    'author',
    'title',
    'url',
    'koronawirus_in_text',
    'koronawirus_in_title',
    'question_mark',
    'exclamation_mark',
    'all_words',
]

# Allow up to 100 characters per displayed cell so long titles and URLs stay readable.
pd.set_option('max_colwidth', 100)

df = pd.DataFrame(data, columns=article_columns)

In [130]:
# Preview a random 0.1% of the rows. random_state pins the draw so that a
# Restart & Run All displays the same rows every time.
df.sample(frac=0.001, random_state=0)

Unnamed: 0,publication_date,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
8289,2020-02-10,mt,WICHURY na Lubelszczyźnie! IMGW ostrzega. Będzie NIEBEZPIECZNIE! [AKTUALIZACJA],https://www.se.pl/lublin/wichury-na-lubelszczyznie-imgw-ostrzega-przed-silnym-wiatrem-bedzie-nie...,0.0,0.0,0.0,2.0,221.0
5215,NaT,,,https://www.se.pl/lublin/nauczysz-sie-zonglowac-chodzic-po-tasmach-i-krecic-hula-hop-wszystko-ca...,,,,,
5128,NaT,,,https://www.se.pl/lublin/lekarze-uzyli-szczepionek-ktore-powinny-trafic-na-smietnik-lubelszczyzn...,,,,,
7042,NaT,,,https://www.se.pl/lublin/reprezentacja-lublina-na-igrzyskach-olimpijskich-w-rio-sprawdz-komu-kib...,,,,,
6686,NaT,,,https://www.se.pl/lublin/lublin-gangster-trafil-za-kratki-po-23-latach-ukrywal-sie-w-wielkiej-br...,,,,,
3067,NaT,,,https://www.se.pl/lublin/lubelskie-gmina-obsza-wylamywal-sie-do-automatow-aa-a5R1-YPrS-bHfy.html,,,,,
1518,NaT,,,https://www.se.pl/lublin/makabryczny-wypadek-pod-pulawami-45-latek-przygnieciony-przez-drzewo-aa...,,,,,
672,2020-08-18,Mariusz Mucha,Biała Podlaska: GŁODOMÓR okradał altanki. Zjadał WSZYSTKO! Robił tak wiele razy,https://lublin.se.pl/biala-podlaska-glodomor-okradal-altanki-robil-tak-wiele-razy-aa-kpkP-tB52-W...,0.0,0.0,1.0,0.0,235.0


# Clean data

### Initial data check, organization and cleaning

In [131]:
# Per-column non-null counts and dtypes; memory_usage='deep' also measures
# the actual size of the object (string) columns.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8362 entries, 0 to 8361
Data columns (total 9 columns):
publication_date        1735 non-null datetime64[ns]
author                  1735 non-null object
title                   1735 non-null object
url                     8362 non-null object
koronawirus_in_text     1735 non-null float64
koronawirus_in_title    1735 non-null float64
question_mark           1735 non-null float64
exclamation_mark        1735 non-null float64
all_words               1735 non-null float64
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 2.6 MB


In [132]:
# Number of distinct values in each column.
df.nunique()

publication_date         241
author                   126
title                   1732
url                     8362
koronawirus_in_text       35
koronawirus_in_title       2
question_mark             18
exclamation_mark          15
all_words                388
dtype: int64

In [133]:
# Keep only the rows in which every column has a value, then show the
# resulting (rows, columns) shape.
df = df.dropna()
df.shape

(1735, 9)

In [134]:
# Cast all columns in a single pass: the byline becomes a categorical and the
# numeric counter columns become integers.
df = df.astype({
    'author': 'category',
    'koronawirus_in_text': 'int',
    'koronawirus_in_title': 'int',
    'question_mark': 'int',
    'exclamation_mark': 'int',
    'all_words': 'int',
})

# Re-check dtypes and memory footprint after the conversion.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1735 entries, 0 to 8361
Data columns (total 9 columns):
publication_date        1735 non-null datetime64[ns]
author                  1735 non-null category
title                   1735 non-null object
url                     1735 non-null object
koronawirus_in_text     1735 non-null int64
koronawirus_in_title    1735 non-null int64
question_mark           1735 non-null int64
exclamation_mark        1735 non-null int64
all_words               1735 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 802.6 KB


### column: publication_day

In [135]:
# df.insert(loc=1, column='publication_day', value=df['publication_date'].dt.strftime('%Y-%m-%d'))
# df.sample(frac = 0.002)

In [136]:
# Five most frequent publication dates together with their row counts.
df['publication_date'].value_counts().head()

2020-06-29    28
2020-07-08    26
2020-07-01    25
2020-07-03    22
2020-08-25    22
Name: publication_date, dtype: int64

In [137]:
# Number of distinct publication dates present in the data.
df['publication_date'].nunique()

241

In [138]:
# The publication_day column is currently disabled, so its category cast stays commented out.
#df['publication_day'] = df['publication_day'].astype('category')
# Re-check dtypes and memory footprint.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1735 entries, 0 to 8361
Data columns (total 9 columns):
publication_date        1735 non-null datetime64[ns]
author                  1735 non-null category
title                   1735 non-null object
url                     1735 non-null object
koronawirus_in_text     1735 non-null int64
koronawirus_in_title    1735 non-null int64
question_mark           1735 non-null int64
exclamation_mark        1735 non-null int64
all_words               1735 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 802.6 KB


### column: publication_month

In [139]:
# Derive a 'YYYY-MM' label from each publication date.
publication_month = df['publication_date'].dt.strftime('%Y-%m')

# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail on a second run; refresh the column instead in that case.
if 'publication_month' in df.columns:
    df['publication_month'] = publication_month
else:
    df.insert(loc=1, column='publication_month', value=publication_month)

# Seeded preview so the displayed rows are reproducible across runs.
df.sample(frac=0.001, random_state=0)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
693,2020-08-04,2020-08,MTA,Lublin: ZAKRWAWIONY mężczyzna leżał na chodniku. Mołdawianie ZAATAKOWALI w biały dzień,https://lublin.se.pl/lublin-zakrwawiony-mezczyzna-lezal-na-chodniku-moldawianie-zaatakowali-w-bi...,0,0,4,2,245
8272,2020-02-21,2020-02,mt,Brutalne MORDERSTWO w Lublinie. Zwłoki w kałuży krwi. Dlaczego 27-latek ZABIŁ?!,https://www.se.pl/lublin/brutalne-morderstwo-w-lublinie-zwloki-w-kaluzy-krwi-dlaczego-27-latek-z...,0,0,2,0,206


In [140]:
# Eight most frequent publication months together with their row counts.
df['publication_month'].value_counts().head(8)

2020-07    439
2020-08    301
2020-06    214
2020-01    154
2020-05    153
2020-03    147
2020-04    138
2020-02    105
Name: publication_month, dtype: int64

In [141]:
# Number of distinct 'YYYY-MM' labels present in the data.
df['publication_month'].nunique()

9

In [142]:
# Months kept for the analysis (January through August 2020); rows from any
# other month are discarded.
analysed_months = [
    '2020-01', '2020-02', '2020-03', '2020-04',
    '2020-05', '2020-06', '2020-07', '2020-08',
]

# .copy() gives the filtered frame its own data, so the column assignments in
# later cells never operate on a slice of the unfiltered frame.
df = df.loc[df['publication_month'].isin(analysed_months)].copy()

In [144]:
# Category cast for publication_month is disabled; the column stays a plain object column.
#df['publication_month'] = df['publication_month'].astype('category')
# Re-check row count, dtypes and memory footprint after the month filter.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 27 to 8361
Data columns (total 10 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 868.7 KB


### column: author

In [145]:
# The 60 most frequent bylines, to spot spelling variants of the same author.
df['author'].value_counts().head(60)

mt                                 335
Marek Targoński                    183
Mucha                              147
Karolina Januszek                  131
Mariusz Mucha                      112
MTA                                 80
Mucha, mta                          55
AP                                  36
KM                                  33
OM                                  32
Bartłomiej Ważny                    23
Monika Kowalewicz                   22
AC                                  21
Michał Michalak                     19
EIB                                 19
Emilia Białecka                     16
MK                                  16
Mateusz Kasiak                      16
Agnieszka Niećko                    15
Jacek Werner                        13
gk                                  13
Olka Mazur                          13
Artykuł sponsorowany                13
Sylwia Sitka-Czerniak               13
Grzegorz Kluczyński                 12
maal                     

In [146]:
# Lower-case every byline so that variants differing only in case compare equal.
df['author'] = df['author'].str.lower()

# Count, per column, the rows whose byline contains "mt" anywhere.
regex_pattern = re.compile(r'.*mt.*', re.I)
contains_mt = df['author'].str.contains(regex_pattern)
df[contains_mt].count()

publication_date        514
publication_month       514
author                  514
title                   514
url                     514
koronawirus_in_text     514
koronawirus_in_title    514
question_mark           514
exclamation_mark        514
all_words               514
dtype: int64

In [147]:
# Fold the known aliases into one canonical byline. Bylines were lower-cased
# in the previous cell, so each alias is written in lower case as well;
# a mixed-case alias can never match by exact comparison.
df['author'] = df['author'].replace(['mt', 'mt; wideo: tygodnik zamojski', 'gał'], 'marek targoński')
# Further alias mappings, currently disabled:
# df['author'] = df['author'].replace(['Mucha', 'mucha'],'Mariusz Mucha')
# df['author'] = df['author'].replace(['ŁT'],'Łukasz Trybulski')
# df['author'] = df['author'].replace(['Mateusz Kasiak (Radio Eska)'],'Mateusz Kasiak')
# df['author'] = df['author'].replace(['KaJa'],'Karolina Januszek')
# df['author'] = df['author'].replace(['RS'],'Rafał Strzelec')
# df['author'] = df['author'].replace(['gk'],'Grzegorz Kluczyński')
# df['author'] = df['author'].replace([''],'Nieznany')

# regex_pattern = re.compile(r'(.*,.*|.*/.*|Redakcja ESKA INFO|Nieznany|Akcja partnerska)', re.I)
# df['author'] = df['author'].replace(regex_pattern,'Więcej autorów lub nieznany')

# Catch-all: any byline that contains "mt" is replaced by the canonical name.
# NOTE(review): this also rewrites co-authored bylines such as 'mucha, mta'
# to a single author — confirm that this attribution is intended.
regex_pattern = re.compile(r'.*mt.*', re.I)
df['author'] = df['author'].replace(regex_pattern, 'marek targoński')

# Five most frequent bylines after the merge.
df['author'].value_counts().head(5)

marek targoński      699
mucha                148
karolina januszek    131
mariusz mucha        112
ap                    36
Name: author, dtype: int64

In [148]:
# Re-cast the cleaned bylines to a categorical, then re-check dtypes and
# memory footprint.
df['author'] = df['author'].astype('category')
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 27 to 8361
Data columns (total 10 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 863.3 KB


### column: koronawirus_in_text

In [149]:
# df.loc[[919, 1979, 1965], 'koronawirus_in_text'] = 0

### column: koronawirus_anywhere (sum of text and title)

In [150]:
# Sum of the text and title counters. The former zero-initialisation was
# overwritten immediately and has been dropped.
df['koronawirus_anywhere'] = df['koronawirus_in_text'] + df['koronawirus_in_title']
# Disabled: collapse the sum into a 0/1 flag.
# df.loc[df.koronawirus_anywhere > 0, 'koronawirus_anywhere'] = 1
df.head(1)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere
27,2020-06-11,2020-06,michał michalak,"Gwałtowne burze już w Polsce, IMGW ostrzega. Gdzie jest burza? [ZAPIS RELACJI NA ŻYWO, RADAR ONL...",https://www.se.pl/lublin/gdzie-jest-burza-imgw-ostrzega-relacja-na-zywo-aa-GucW-i1Ux-tnAB.html,0,0,1,1,341,0


### column: koronawirus_anywhere_count

In [151]:
# 0/1 indicator: 1 when the combined text + title counter is non-zero.
# An explicit comparison replaces dividing the sum by itself and turning the
# 0/0 NaN results into 0; both give 1 for any non-zero sum and 0 otherwise.
koronawirus_total = df['koronawirus_in_text'] + df['koronawirus_in_title']
df['koronawirus_anywhere_count'] = koronawirus_total.ne(0).astype('int')
df.head(3)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere,koronawirus_anywhere_count
27,2020-06-11,2020-06,michał michalak,"Gwałtowne burze już w Polsce, IMGW ostrzega. Gdzie jest burza? [ZAPIS RELACJI NA ŻYWO, RADAR ONL...",https://www.se.pl/lublin/gdzie-jest-burza-imgw-ostrzega-relacja-na-zywo-aa-GucW-i1Ux-tnAB.html,0,0,1,1,341,0,0
28,2020-06-11,2020-06,mucha,Siedliszcze: Mateusz zginął w drodze z matury. NOWE FAKTY,https://www.se.pl/lublin/siedliszcze-mateusz-zginal-w-drodze-z-matury-nowe-fakty-aa-Vpzg-nDpX-9S...,0,0,0,0,400,0,0
44,2020-08-31,2020-08,mariusz mucha,"Lublin pożegnał Piotra Szczepanika. WZRUSZAJĄCY pogrzeb artysty [WIDEO, ZDJĘCIA]",https://lublin.se.pl/lublin-pozegnal-piotra-szczepanika-aa-yB55-ND2f-Cn9F.html,0,0,0,0,190,0,0


### Export data

In [152]:
# Write the cleaned frame into the datasets folder resolved (and created) in
# the setup cell, rather than a path relative to whatever the working
# directory happens to be at this point.
df.to_csv(os.path.join(datasets_directory, '1_clean_data.csv'))