# Import required modules

In [1]:
import pandas as pd # For using data frames
import tabulate     # For exporting results
import re           # For using regular expressions

## Process the data

We want to turn `data/ffmpegJuntarTodo.txt` into a clickable table of titles and urls.
In order to do this, we'll need the functions below:

In [2]:
def parse_summaries(file="data/summary_documentosRNE.txt"):
    # Read the file line by line
    with open(file, 'r') as f:
        lines = f.readlines()

    # Initialize an empty list to store the data
    data = []

    # Loop through each line and extract the relevant information
    for line in lines:
        # Use regular expressions to extract the title, album, date, genre, language, and id
        match = re.search(r'-i (\d+)\.mp3.*-metadata title="([^"]+)" -metadata album="([^"]+)" -metadata date="([^"]+)" -metadata genre="([^"]+)" -metadata language="([^"]+)"', line)
        if match:
            id = match.group(1)
            title = match.group(2)
            album = match.group(3)
            date = match.group(4)
            genre = match.group(5)
            language = match.group(6)
            # Append the data to the list
            data.append({'id': id, 'title': title, 'album': album, 'date': date, 'genre': genre, 'language': language})

    # Convert the list to a pandas DataFrame
    df = pd.DataFrame(data)
    
    return(df)

def simplify(df):
    """ Return just title, date, id and url to mp3 file """
    sdf = df[['title', 'date', 'id']]
    sdf['mp3'] = "https://ztnr.rtve.es/ztnr/" + sdf['id'] + ".mp3" # Build the url using the id
    
    sdf.set_index('id', inplace=True)                              # The id is unique, so it constitutes a good index
    sdf = sdf.sort_values('date', ascending=False)                 # Sort by date (newest first)
    
    return(sdf)

We're ready to use the functions.
The result is `sim_df`, a `pandas.DataFrame` object.

In [3]:
raw_df = parse_summaries()
sim_df = simplify(raw_df)
print(sim_df)

                                                      title  date  \
id                                                                  
16192458  La Olimpiada Popular, rebeldía obrera contra l...  2024   
15974153       Luis Rosales, el poeta de la nieve encendida  2024   
16157828   Los Fúcares, banqueros de la monarquía hispánica  2024   
16148468  Tras los silencios de Gustavo Durán: arte y gu...  2024   
16138962  Españoles en el Muro Atlántico. Resistentes y ...  2024   
...                                                     ...   ...   
4035893                                 La guerra de África  2001   
647660            Vicente Aleixandre: la creación o el amor  2001   
4033284   Ramón Sarró y Ángel Garma, precursores del psi...  2001   
4034547                                  Don Juan de Borbón  2001   
4034833                   La Institución Libre de Enseñanza  2001   

                                             mp3  
id                                                


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdf['mp3'] = "https://ztnr.rtve.es/ztnr/" + sdf['id'] + ".mp3" # Build the url using the id


## Ready for publishing

The data frame above is good for data analysis, but not yet ready for publishing.
Let's translate it to markdown, and dump it into a file.

In [4]:
with open("docsrne.md", "w") as text_file:
    text_file.write("%s" % sim_df.to_markdown())

In [25]:
def create_filename(string, limit=90):
    """Replace spaces and punctuation symbols with underscores and trim to 'limit' characters and add mp3 extension"""
    return re.sub('[\s.,;:\'"]', '_', string)[:limit] + '.mp3'

def create_filename_from_df(df, limit=90):
    """Concatenate date, title, id and '.mp3' extension, replacing spaces and punctuation symbols with underscores"""
    return row['date'] + '-' +  create_filename(row['title'], limit) + "-" + id + ".mp3"

[row['date'] + '-' +  create_filename(row['title']) + "-" + id + ".mp3" for id, row in sim_df.iterrows()]

  return re.sub('[\s.,;:\'"]', '_', string)[:limit] + '.mp3'


['2024-La_Olimpiada_Popular__rebeldía_obrera_contra_los_fascismos.mp3-16192458.mp3',
 '2024-Luis_Rosales__el_poeta_de_la_nieve_encendida.mp3-15974153.mp3',
 '2024-Los_Fúcares__banqueros_de_la_monarquía_hispánica.mp3-16157828.mp3',
 '2024-Tras_los_silencios_de_Gustavo_Durán__arte_y_guerra.mp3-16148468.mp3',
 '2024-Españoles_en_el_Muro_Atlántico__Resistentes_y_esclavos_de_Hitler.mp3-16138962.mp3',
 '2024-Franz_Kafka__el_insecto_en_el_castillo.mp3-16128691.mp3',
 '2024-Los_afrancesados__juramentados_y_traidores_en_la_Guerra_de_la_Independencia.mp3-16108512.mp3',
 '2024-Grupo_Tácito__sembrador_de_consensos_en_la_Transición.mp3-16098755.mp3',
 '2024-Josefina_Carabias__50_años_de_periodismo_todoterreno.mp3-16087460.mp3',
 '2024-Los_fueros_vascos_y_navarros__¿Privilegios_o_derechos?.mp3-16079087.mp3',
 '2024-Matilde_cumple_cien_años__El_siglo_de_la_Telefónica.mp3-16068643.mp3',
 '2024-Patty_Hearst__de_rehén_a_guerrillera.mp3-16047596.mp3',
 '2024-Chicho_Sánchez_Ferlosio__el_singular_canto_de_

In [19]:
[row['date'] + '-' +  row['title'] + "-" + id + ".mp3" for id, row in sim_df.iterrows()]

['2024-La Olimpiada Popular, rebeldía obrera contra los fascismos-16192458.mp3',
 '2024-Luis Rosales, el poeta de la nieve encendida-15974153.mp3',
 '2024-Los Fúcares, banqueros de la monarquía hispánica-16157828.mp3',
 '2024-Tras los silencios de Gustavo Durán: arte y guerra-16148468.mp3',
 '2024-Españoles en el Muro Atlántico. Resistentes y esclavos de Hitler-16138962.mp3',
 '2024-Franz Kafka, el insecto en el castillo-16128691.mp3',
 '2024-Los afrancesados, juramentados y traidores en la Guerra de la Independencia-16108512.mp3',
 '2024-Grupo Tácito, sembrador de consensos en la Transición-16098755.mp3',
 '2024-Josefina Carabias: 50 años de periodismo todoterreno-16087460.mp3',
 '2024-Los fueros vascos y navarros: ¿Privilegios o derechos?-16079087.mp3',
 '2024-Matilde cumple cien años. El siglo de la Telefónica-16068643.mp3',
 '2024-Patty Hearst, de rehén a guerrillera-16047596.mp3',
 '2024-Chicho Sánchez Ferlosio, el singular canto de un gallo rojo-16017893.mp3',
 '2024-Juliana More