In [1]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import re
import requests
import seaborn as sns

from db import create_connection
from db import (
    articles,
    services,
    urls
)

In [2]:
# Output folder for exported datasets, resolved against the directory the
# notebook is run from. (`os` is already imported in the first cell.)
current_path = os.path.abspath(os.getcwd())
datasets_directory = os.path.join(current_path, 'datasets')

# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(datasets_directory, exist_ok=True)

In [3]:
# Open the database connection through the project-local `db.create_connection` helper.
# NOTE(review): no matching close/cleanup is visible in this part of the notebook — confirm whether one is needed.
connection = create_connection()

# Get data

In [4]:
# Pull the raw article records over the open connection
# (the helper lives in the project-local `db.articles` module).
data = articles.get_articles(connection)

In [5]:
# Show up to 100 characters per cell so titles and URLs stay readable
pd.set_option('max_colwidth', 100)

# Column labels, in the order the fetched records are laid out
article_columns = [
    'publication_date',
    'author',
    'title',
    'url',
    'koronawirus_in_text',
    'koronawirus_in_title',
    'question_mark',
    'exclamation_mark',
    'all_words',
]
df = pd.DataFrame(data, columns=article_columns)

In [6]:
# Preview 0.1% of the rows; the fixed seed makes the preview identical on every run
df.sample(frac=0.001, random_state=42)

Unnamed: 0,publication_date,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
2178,NaT,,,https://www.se.pl/lublin/matura-radio-eska-lublin-wspomnienia-aa-61Jr-1vcS-FCMp.html,,,,,
3276,NaT,,,https://www.se.pl/lublin/pasazerowie-przez-okno-do-pociagu-chelm-lubelskie-chelm-miasto-pociag-a...,,,,,
5115,NaT,,,https://www.se.pl/lublin/lublin-zamiast-politologii-bedzie-muzeum-zmiany-przy-litewskim-aa-L5CS-...,,,,,
4528,NaT,,,https://www.se.pl/lublin/uwaga-kierowcy-utrudnienia-na-trasie-lublin-lubartow-aa-ToAL-7LAu-d1uN....,,,,,
3824,NaT,,,https://www.se.pl/lublin/me-w-plywaniu-glasgow-2018-zawodnicy-azs-umcs-lublin-wrocili-do-polski-...,,,,,
3994,NaT,,,https://www.se.pl/lublin/protest-sluzb-mundurowych-strajk-mundurowka-protest-strazakow-protest-p...,,,,,
6307,NaT,,,https://www.se.pl/lublin/przejazdzka-prawdziwym-autem-w-mini-wersji-zgarnij-prezent-od-mikolaja-...,,,,,
181,2020-08-11,Karolina Januszek,Pensja minimalna nie wzrośnie? Szokująca propozycja,https://lublin.se.pl/pensja-minimalna-nie-wzrosnie-szokujaca-propozycja-ak-9foe-yUHD-2ezQ.html,4.0,0.0,1.0,0.0,229.0
1217,NaT,,,https://www.se.pl/lublin/zmasakrowany-samochod-28-latek-dachowal-i-roztrzaskal-sie-o-nasyp-zdjec...,,,,,


# Clean data

### Initial data check, organization and cleaning

In [7]:
# Dtypes, non-null counts and the real (deep) memory footprint of the raw frame
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8610 entries, 0 to 8609
Data columns (total 9 columns):
publication_date        1744 non-null datetime64[ns]
author                  1744 non-null object
title                   1744 non-null object
url                     8610 non-null object
koronawirus_in_text     1744 non-null float64
koronawirus_in_title    1744 non-null float64
question_mark           1744 non-null float64
exclamation_mark        1744 non-null float64
all_words               1744 non-null float64
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 2.6 MB


In [8]:
# Number of distinct values in every column
df.nunique()

publication_date         243
author                   126
title                   1741
url                     8610
koronawirus_in_text       35
koronawirus_in_title       2
question_mark             18
exclamation_mark          15
all_words                393
dtype: int64

In [9]:
# Discard, in place, every row that has at least one missing value
df.dropna(axis=0, how='any', inplace=True)
df.shape

(1744, 9)

In [10]:
# Bylines repeat a lot, so store them as a categorical column
df['author'] = df['author'].astype('category')

# The numeric columns were loaded as float64; with the incomplete rows already
# dropped they can be cast to integers.
for counter_column in (
    'koronawirus_in_text',
    'koronawirus_in_title',
    'question_mark',
    'exclamation_mark',
    'all_words',
):
    df[counter_column] = df[counter_column].astype('int')

df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1744 entries, 0 to 8609
Data columns (total 9 columns):
publication_date        1744 non-null datetime64[ns]
author                  1744 non-null category
title                   1744 non-null object
url                     1744 non-null object
koronawirus_in_text     1744 non-null int64
koronawirus_in_title    1744 non-null int64
question_mark           1744 non-null int64
exclamation_mark        1744 non-null int64
all_words               1744 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 806.1 KB


### column: publication_date

In [12]:
# Dates with the most articles
df['publication_date'].value_counts().head()

2020-06-29    28
2020-07-08    26
2020-07-01    25
2020-07-03    22
2020-08-25    22
Name: publication_date, dtype: int64

In [13]:
# Number of distinct publication dates
df.publication_date.nunique()

243

In [14]:
# NOTE(review): a disabled cast of a 'publication_day' column used to sit here; no
# column of that name is created in the visible part of this notebook.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1744 entries, 0 to 8609
Data columns (total 9 columns):
publication_date        1744 non-null datetime64[ns]
author                  1744 non-null category
title                   1744 non-null object
url                     1744 non-null object
koronawirus_in_text     1744 non-null int64
koronawirus_in_title    1744 non-null int64
question_mark           1744 non-null int64
exclamation_mark        1744 non-null int64
all_words               1744 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(2)
memory usage: 806.1 KB


### column: publication_month

In [15]:
# Year-month label (e.g. "2020-04") derived from the publication date
publication_month = df['publication_date'].dt.strftime('%Y-%m')

if 'publication_month' in df.columns:
    # Cell has been run before: refresh the column instead of letting
    # DataFrame.insert raise ValueError for the duplicate name.
    df['publication_month'] = publication_month
else:
    # First run: place the new column right after publication_date
    df.insert(loc=1, column='publication_month', value=publication_month)

# Fixed seed so the preview shows the same rows on every run
df.sample(frac=0.001, random_state=42)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words
777,2020-04-06,2020-04,Michał Michalak,W całej Polsce nie ma chętnych do komisji wyborczych! Największe braki w Lublinie!,https://www.se.pl/lublin/w-calej-polsce-nie-ma-chetnych-do-komisji-wyborczych-najwieksze-braki-w...,0,0,0,0,171
507,2020-06-29,2020-06,"Mucha, mta",Krasnystaw: Podczas remontu odkopali POCISK. Ze sprawnym zapalnikiem,https://lublin.se.pl/krasnystaw-podczas-remontu-odkopali-pocisk-ze-sprawnym-zapalnikiem-aa-XgLU-...,0,0,0,2,120


In [16]:
# Article counts for the eight busiest months
df['publication_month'].value_counts().head(8)

2020-07    439
2020-08    301
2020-06    214
2020-01    154
2020-05    153
2020-03    147
2020-04    138
2020-02    105
Name: publication_month, dtype: int64

In [17]:
# Number of distinct year-month labels
df.publication_month.nunique()

9

In [18]:
# Keep only articles published January–August 2020; rows from any other month are dropped
months_to_keep = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06', '2020-07', '2020-08']

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in the following cells never operate on a slice.
df = df.loc[df['publication_month'].isin(months_to_keep)].copy()

In [19]:
# Re-check row count, dtypes and deep memory usage after the month filter
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 57 to 8609
Data columns (total 10 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 868.7 KB


### column: author

In [20]:
# The 60 most frequent bylines, most frequent first
df['author'].value_counts().head(60)

mt                                 335
Marek Targoński                    183
Mucha                              147
Karolina Januszek                  131
Mariusz Mucha                      112
MTA                                 80
Mucha, mta                          55
AP                                  36
KM                                  33
OM                                  32
Bartłomiej Ważny                    23
Monika Kowalewicz                   22
AC                                  21
Michał Michalak                     19
EIB                                 19
Emilia Białecka                     16
MK                                  16
Mateusz Kasiak                      16
Agnieszka Niećko                    15
Jacek Werner                        13
gk                                  13
Olka Mazur                          13
Artykuł sponsorowany                13
Sylwia Sitka-Czerniak               13
Grzegorz Kluczyński                 12
maal                     

In [21]:
# Normalise bylines to lower case before looking for aliases
df['author'] = df['author'].str.lower()

# Count the rows whose byline contains "mt" (case-insensitive)
regex_pattern = re.compile(r'.*mt.*', re.I)
contains_mt = df['author'].str.contains(regex_pattern)
df[contains_mt].count()

publication_date        514
publication_month       514
author                  514
title                   514
url                     514
koronawirus_in_text     514
koronawirus_in_title    514
question_mark           514
exclamation_mark        514
all_words               514
dtype: int64

In [22]:
# Bylines were lower-cased in the previous step, so every alias is written in
# lower case too (a mixed-case alias could never match an exact replace).
targonski_aliases = ['mt', 'mt; wideo: tygodnik zamojski', 'gał']
df['author'] = df['author'].replace(targonski_aliases, 'marek targoński')

# Fold every remaining byline containing "mt" (e.g. "mta", "mucha, mta") into
# the same author.
# NOTE(review): joint bylines end up credited to a single person — confirm
# that this is intended.
regex_pattern = re.compile(r'.*mt.*', re.I)
df['author'] = df['author'].replace(regex_pattern, 'marek targoński')

df['author'].value_counts()[:5]

marek targoński      699
mucha                148
karolina januszek    131
mariusz mucha        112
ap                    36
Name: author, dtype: int64

In [23]:
# The clean-up above left plain strings; convert the bylines back to a categorical column
author_as_category = df['author'].astype('category')
df['author'] = author_as_category

df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651 entries, 57 to 8609
Data columns (total 10 columns):
publication_date        1651 non-null datetime64[ns]
publication_month       1651 non-null object
author                  1651 non-null category
title                   1651 non-null object
url                     1651 non-null object
koronawirus_in_text     1651 non-null int64
koronawirus_in_title    1651 non-null int64
question_mark           1651 non-null int64
exclamation_mark        1651 non-null int64
all_words               1651 non-null int64
dtypes: category(1), datetime64[ns](1), int64(5), object(3)
memory usage: 863.3 KB


### column: koronawirus_anywhere (sum of the text and title values)

In [25]:
# Combined value of the two per-article "koronawirus" columns (text + title).
# The column is assigned once; a preliminary zero-fill would be overwritten anyway.
df['koronawirus_anywhere'] = df['koronawirus_in_text'] + df['koronawirus_in_title']
df.head(1)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere
57,2020-08-31,2020-08,mariusz mucha,"Lublin pożegnał Piotra Szczepanika. WZRUSZAJĄCY pogrzeb artysty [WIDEO, ZDJĘCIA]",https://lublin.se.pl/lublin-pozegnal-piotra-szczepanika-aa-yB55-ND2f-Cn9F.html,0,0,0,0,190,0


### column: koronawirus_anywhere_count (0/1 flag: non-zero in text or title)

In [26]:
# 0/1 flag: 1 when koronawirus_in_text + koronawirus_in_title is non-zero, else 0.
# A direct comparison replaces the x / x -> NaN -> fillna(0) round trip and
# yields the same integers. The "_count" column name is kept unchanged.
mentions_total = df['koronawirus_in_text'] + df['koronawirus_in_title']
df['koronawirus_anywhere_count'] = mentions_total.ne(0).astype('int')
df.head(3)

Unnamed: 0,publication_date,publication_month,author,title,url,koronawirus_in_text,koronawirus_in_title,question_mark,exclamation_mark,all_words,koronawirus_anywhere,koronawirus_anywhere_count
57,2020-08-31,2020-08,mariusz mucha,"Lublin pożegnał Piotra Szczepanika. WZRUSZAJĄCY pogrzeb artysty [WIDEO, ZDJĘCIA]",https://lublin.se.pl/lublin-pozegnal-piotra-szczepanika-aa-yB55-ND2f-Cn9F.html,0,0,0,0,190,0,0
58,2020-08-19,2020-08,karolina januszek,"Wygraj 200 złotych od Biedronki. Zobacz, jak to zrobić",https://lublin.se.pl/wygraj-200-zlotych-od-biedronki-zobacz-jak-to-zrobic-ak-XEXJ-32sC-qWAi.html,0,0,0,1,330,0,0
59,2020-08-31,2020-08,karolina januszek,Polacy tracą pracę! Najgorsze dopiero nadejdzie,https://lublin.se.pl/polacy-traca-prace-najgorsze-dopiero-nadejdzie-ak-vtoS-p5BX-TkBx.html,3,0,0,0,216,3,1


### Export data

In [27]:
# Write the cleaned frame (index included) into the datasets folder prepared at
# the top of the notebook, instead of repeating the folder name as a literal.
df.to_csv(os.path.join(datasets_directory, '1_clean_data.csv'))