# Imports

In [1]:
from pathlib import Path
from typing import List

import pandas as pd
import requests

from downloader import download_and_extract_zips
from scraper import scrape

In [2]:
# Data directories, relative to the notebook's working directory.
DATA_ROOT = '../DATA'
DATA_PATH_ORIG = DATA_ROOT + '/original'  # raw downloads are extracted here
DATA_PATH_PREP = DATA_ROOT + '/prepared'  # prepared artefacts (pickles) are written here

In [3]:
# Author key -> titles of the works we want from that author.
# The keys are also what is handed to scrape() together with URL.
# NOTE(review): 'ivan_vazov' uses an underscore while every other key is hyphenated — confirm
# this is intentional before normalising it.
AUTHORS = {
    'ivan_vazov': [
        "Българският език",
        "Отечество любезно, как хубаво си ти",
        "При Рилския манастир",
        "Елате ни вижте",
        "Линее нашто поколенье",
        "Левски",
        "Паисий",
        "Кочо",
        "Опълченците на Шипка",
        "Дядо Йоцо гледа",
        "Чичовци",
        "Под игото",
    ],
    'aleko-konstantinov': [
        "Разни хора, разни идеали I",
        "Разни хора, разни идеали II",
        "Разни хора, разни идеали III",
        "Разни хора, разни идеали IV",
        "Бай Ганьо",
    ],
    'jordan-jovkov': [
        "Песента на колелетата",
        "Последна радост",
        "Шибил",
        "През чумавото",
        "Индже",
        "Албена",
        "Другоселец",
        "Серафим",
    ],
    'elin-pelin': [
        "Ветрената мелница",
        "Косачи",
        "Задушница",
        "Мечтатели",
        "На оня свят",
        "Андрешко",
        "Чорба от греховете на отец Никодим",
        "Занемелите камбани",
        "Гераците",
    ],
    'dimityr-dimov': ["Тютюн"],
    'dimityr-talev': ["Железният светилник"],
}

# Base address passed to scrape() alongside the author keys.
URL = 'https://chitanka.info/person'

# Get scraped data

In [4]:
# Run the project-local scraper for every author key in AUTHORS.
# Judging by the displayed output, the result has one row per work with the
# columns author, title and download_link.
df = scrape(URL, AUTHORS.keys())
df

Unnamed: 0,author,title,download_link
0,ivan_vazov,Чичовци,https://chitanka.info/text/3757-chichovtsi.txt...
1,ivan_vazov,Под игото,https://chitanka.info/text/3753-pod-igoto.txt.zip
2,ivan_vazov,Нова земя,https://chitanka.info/text/3754-nova-zemja.txt...
3,ivan_vazov,Кардашев на лов,https://chitanka.info/text/3762-kardashev-na-l...
4,ivan_vazov,Най-големият,https://chitanka.info/text/4008-najgolemijat.t...
...,...,...,...
1517,dimityr-talev,Гласовете ви чувам,https://chitanka.info/text/3156-glasovete-vi-c...
1518,dimityr-talev,Щитове каменни,https://chitanka.info/text/4834-shtitove-kamen...
1519,dimityr-talev,Пепеляшка и царският син,https://chitanka.info/text/4961-pepeljashka-i-...
1520,dimityr-talev,Погибел,https://chitanka.info/text/4989-pogibel.txt.zip


In [5]:
# Flatten the per-author title lists into a single list (used later to subset df).
all_texts = [title for titles in AUTHORS.values() for title in titles]

# Check that every requested text was scraped *for its own author*.
# The earlier form used df.query('author == author'), which compares the column with
# itself and therefore never filtered by author. Matching on (author, title) pairs
# avoids query() altogether, so no '@' variable lookup inside a generator is needed.
scraped_pairs = set(zip(df['author'], df['title']))
missing_texts = [
    (author, title)
    for author, titles in AUTHORS.items()
    for title in titles
    if (author, title) not in scraped_pairs
]

# True when nothing is missing; inspect `missing_texts` to see what is absent otherwise.
not missing_texts

True

# Subset only texts of interest

In [6]:
# Keep only the rows whose title is one of the requested texts and renumber them from 0.
# Matching is by title only, so a title listed several times is kept several times.
is_wanted = df['title'].isin(all_texts)
df_full = df[is_wanted].reset_index(drop=True)
df_full

Unnamed: 0,author,title,download_link
0,ivan_vazov,Чичовци,https://chitanka.info/text/3757-chichovtsi.txt...
1,ivan_vazov,Под игото,https://chitanka.info/text/3753-pod-igoto.txt.zip
2,ivan_vazov,Българският език,https://chitanka.info/text/5189-bylgarskijat-e...
3,ivan_vazov,Кочо,https://chitanka.info/text/3851-kocho.txt.zip
4,ivan_vazov,Левски,https://chitanka.info/text/3849-levski.txt.zip
5,ivan_vazov,Линее нашто поколенье,https://chitanka.info/text/5208-linee-nashto-p...
6,ivan_vazov,Опълченците на Шипка,https://chitanka.info/text/3860-opylchentsite-...
7,ivan_vazov,"Отечество любезно, как хубаво си ти",https://chitanka.info/text/5163-otechestvo-lju...
8,ivan_vazov,Паисий,https://chitanka.info/text/3854-paisij.txt.zip
9,ivan_vazov,При Рилския манастир,https://chitanka.info/text/4314-pri-rilskija-m...


In [7]:
# Per author: do the selected titles match the requested ones (as a set),
# and is the number of selected rows equal to the number requested (no duplicates)?
for auth, wanted in AUTHORS.items():
    found = df_full.loc[df_full['author'] == auth, 'title']
    same_titles = set(found) == set(wanted)
    same_count = len(found) == len(wanted)
    print(f'{auth} => {same_titles} | {same_count}')

ivan_vazov => True | True
aleko-konstantinov => True | True
jordan-jovkov => True | False
elin-pelin => True | False
dimityr-dimov => True | False
dimityr-talev => True | True


In [8]:
# Manual de-duplication: some requested titles are listed more than once for an author.
#
# jordan-jovkov: "Албена" exists both as a "Пиеса" and as a "Разказ"; we need the second one.
#   Its download link includes 7879. Therefore, drop the row with index 18.
#   There are also two versions of "Серафим" (reason unknown). Keep one; drop the row with index 25.
#
# elin-pelin: "Косачи" exists both as a "Поезия" and as a "Разказ"; we need the second one.
#   Its download link includes 7879. Therefore, drop the row with index 28.
#   NOTE(review): the same id 7879 is quoted for both "Албена" and "Косачи" — one of the two
#   is probably a copy-paste slip; confirm against the download links.
#
# dimityr-dimov: "Тютюн" has two versions due to political reasons; we work with the most recent one.
#   Its download link includes 2760. Therefore, drop the row with index 38.
#
# The labels below refer to the index of df_full as produced by the subsetting cell; they must
# be revisited whenever the scrape result changes.
DUPLICATE_ROWS = [18, 25, 28, 38]

# errors='ignore' keeps the cell re-runnable: on a second run the labels are already gone and
# a plain drop() would raise KeyError.
df_full = df_full.drop(index=DUPLICATE_ROWS, errors='ignore')

for auth in AUTHORS:
    print(auth, end=' => ')
    print(set(df_full.query("author == @auth")['title']) == set(AUTHORS[auth]), end=' | ')
    print(len(df_full.query("author == @auth")['title']) == len(AUTHORS[auth]), end='')
    print()

# Fail loudly if the hard-coded labels no longer remove every duplicate, instead of silently
# downloading the wrong set of files further down.
assert not df_full.duplicated(subset=['author', 'title']).any(), \
    'df_full still contains duplicate (author, title) rows; update DUPLICATE_ROWS'

ivan_vazov => True | True
aleko-konstantinov => True | True
jordan-jovkov => True | True
elin-pelin => True | True
dimityr-dimov => True | True
dimityr-talev => True | True


In [9]:
# Display the selection as it stands after the manual row drops in the previous cell.
df_full

Unnamed: 0,author,title,download_link
0,ivan_vazov,Чичовци,https://chitanka.info/text/3757-chichovtsi.txt...
1,ivan_vazov,Под игото,https://chitanka.info/text/3753-pod-igoto.txt.zip
2,ivan_vazov,Българският език,https://chitanka.info/text/5189-bylgarskijat-e...
3,ivan_vazov,Кочо,https://chitanka.info/text/3851-kocho.txt.zip
4,ivan_vazov,Левски,https://chitanka.info/text/3849-levski.txt.zip
5,ivan_vazov,Линее нашто поколенье,https://chitanka.info/text/5208-linee-nashto-p...
6,ivan_vazov,Опълченците на Шипка,https://chitanka.info/text/3860-opylchentsite-...
7,ivan_vazov,"Отечество любезно, как хубаво си ти",https://chitanka.info/text/5163-otechestvo-lju...
8,ivan_vazov,Паисий,https://chitanka.info/text/3854-paisij.txt.zip
9,ivan_vazov,При Рилския манастир,https://chitanka.info/text/4314-pri-rilskija-m...


# Download data

In [10]:
# Download every selected archive and extract it with the project-local downloader.
urls = df_full['download_link'].tolist()
# Reuse the configured raw-data path instead of repeating the '../DATA/original' literal,
# so the location is defined in exactly one place.
output_dir = DATA_PATH_ORIG
# Per the output shown below, `names` holds the extracted file names.
names = download_and_extract_zips(urls, output_dir)

Extracted 01: ivan_vazov_tbc_chichovtsi.txt
Extracted 02: ivan_vazov_tbc_pod_igoto.txt
Extracted 03: ivan_vazov_bylgarskijat_ezik.txt
Extracted 04: ivan_vazov_kocho.txt
Extracted 05: ivan_vazov_levski.txt
Extracted 06: ivan_vazov_linee_nashto_pokolenxe.txt
Extracted 07: ivan_vazov_opylchentsite_na_shipka.txt
Extracted 08: ivan_vazov_otechestvo_ljubeznokak_hubavo_si_ti.txt
Extracted 09: ivan_vazov_paisij.txt
Extracted 10: ivan_vazov_pri_rilskija_manastir.txt
Extracted 11: ivan_vazov,elate_ni_vizhte.txt
Extracted 12: ivan_vazov_djado_jotso_gleda.txt
Extracted 13: aleko_konstantinov_rhri_razni_horarazni_ideali_i.txt
Extracted 14: aleko_konstantinov_rhri_razni_horarazni_ideali_ii.txt
Extracted 15: aleko_konstantinov_rhri_razni_horarazni_ideali_iii.txt
Extracted 16: aleko_konstantinov_rhri_razni_horarazni_ideali_iv.txt
Extracted 17: aleko_konstantinov_baj_ganxo.txt
Extracted 18: jordan_jovkov_posledna_radost.txt
Extracted 19: jordan_jovkov_albena.txt
Extracted 20: jordan_jovkov_drugoselets.

# Add text

In [12]:
# Peek at the first few file names returned by download_and_extract_zips.
names[:5]

['ivan_vazov_tbc_chichovtsi.txt',
 'ivan_vazov_tbc_pod_igoto.txt',
 'ivan_vazov_bylgarskijat_ezik.txt',
 'ivan_vazov_kocho.txt',
 'ivan_vazov_levski.txt']

In [None]:
def get_text(filename: str) -> List[str]:
    """Read a text file and return its whitespace-separated tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content split on any run of whitespace; an empty list for an
        empty (or whitespace-only) file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform default, which is not UTF-8
    # everywhere. 'utf-8-sig' also strips a leading byte-order mark if one is present,
    # so it does not end up glued to the first token.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        # The previous version stored this in `tokens` but returned the undefined
        # name `lines`, which raised NameError on every call.
        return f.read().split()

In [None]:
# Smoke-test get_text on the first downloaded file.
# df_full only has the columns author/title/download_link at this point, so the earlier
# df_full['text'][0] raised KeyError; read from the extracted file names instead.
# NOTE(review): assumes the files listed in `names` sit directly under DATA_PATH_ORIG —
# confirm against download_and_extract_zips.
get_text(f'{DATA_PATH_ORIG}/{names[0]}')[:20]

In [None]:
# Tokenise every downloaded file and display the result without modifying df_full.
# df_full has no 'text' column, so the earlier df_full['text'].map(get_text) raised KeyError;
# the extracted file names in `names` are used as the source instead.
# NOTE(review): assumes the files listed in `names` sit directly under DATA_PATH_ORIG —
# confirm against download_and_extract_zips.
pd.Series(names, name='text').map(lambda name: get_text(f'{DATA_PATH_ORIG}/{name}'))

# Saving to files

In [None]:
# Preview the destination path used for the pickle written in the next cell.
f'{DATA_PATH_PREP}/01_df_full_links.pkl'

In [None]:
# Persist the selected rows (author, title, download_link) for the next notebook.
prepared_dir = Path(DATA_PATH_PREP)
# to_pickle does not create missing directories; make sure the target exists so the
# notebook also runs on a fresh checkout.
prepared_dir.mkdir(parents=True, exist_ok=True)
df_full.to_pickle(prepared_dir / '01_df_full_links.pkl')