# Starting point: end of day 1

In [None]:
import logging
import re

def entries_counter(source_file):
    """Count the blocks of a text file by counting its empty lines.

    Every position matched by ``^$`` in multi-line mode (i.e. every empty
    line) counts as one block. The whole file is read into memory at once.

    Args:
        source_file: Path of the text file to inspect.

    Returns:
        The number of empty-line matches found in the file.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(source_file, 'r') as source:
        content = source.read()
    matches = re.findall(r'^$', content, flags=re.MULTILINE)
    nb_blocks = len(matches)
    logging.debug(f'{nb_blocks} blocks in the file "{source_file}".')
    return nb_blocks

def _write_split_file(output_name, file_number, padding_length, content):
    """Write one chunk to '<output_name>_<zero-padded file_number>.txt'."""
    target = f'{output_name}_{file_number:0{padding_length}}.txt'
    logging.debug(f'Writing {target}.')
    with open(target, 'w') as new_file:
        new_file.write(content)


def file_splitter(source_file_name, number_of_files, separator='\n', output_name='split'):
    """Split a text file into several smaller files on block boundaries.

    The number of blocks in the source is obtained from ``entries_counter``
    and divided by ``number_of_files`` to get a per-file threshold. The
    source is then read line by line; each line equal to ``separator`` closes
    a block, and once the threshold is exceeded the accumulated lines are
    written to the next numbered output file.

    Args:
        source_file_name: Path of the text file to split.
        number_of_files: Target number of output files; also sets the
            zero-padding width of the numeric suffix. Must be at least 1.
        separator: Full line (line ending included) that marks the end of a
            block. The default ``'\\n'`` means an empty line.
        output_name: Prefix of the generated files, which are named
            ``<output_name>_<number>.txt``.

    Raises:
        ValueError: If ``number_of_files`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or
    # a negative threshold that would put every line in its own file.
    if number_of_files < 1:
        raise ValueError(f'number_of_files must be at least 1, got {number_of_files}.')

    logging.info(f'Start to split "{source_file_name}" in {number_of_files} files.')
    total_blocs_in_file = entries_counter(source_file=source_file_name)
    max_blocs_in_file = round(total_blocs_in_file/float(number_of_files), 0)
    logging.debug(f'{max_blocs_in_file} blocks per file.')

    padding_length = len(str(number_of_files))

    with open(source_file_name, 'r') as original_file:
        file_number = 1
        blocs_in_file = 0
        # Collect lines in a list and join once per output file; repeated
        # string concatenation copies the growing buffer over and over.
        pending_lines = []

        # Loop through the file, line by line
        for line in original_file:
            pending_lines.append(line)
            if line == separator:
                # Count the blocks
                blocs_in_file += 1
            # NOTE(review): the comparison is strict, so a file is flushed
            # only once it holds one separator more than the threshold.
            if blocs_in_file > max_blocs_in_file:
                # Limit reached: flush the pending lines to a new file
                _write_split_file(output_name, file_number, padding_length,
                                  ''.join(pending_lines))
                # Reset counters
                file_number += 1
                blocs_in_file = 0
                pending_lines = []

        # EOF reached: write what is left, but do not create an empty file
        # when the last line of the source has just triggered a flush.
        if pending_lines:
            _write_split_file(output_name, file_number, padding_length,
                              ''.join(pending_lines))
    logging.info(f'Done with "{source_file_name}".')

In [None]:
import logging

# Show every log record, down to DEBUG level, while the splitter runs.
logging.basicConfig(level=logging.DEBUG)

# Settings for this run: split the sample dump into 100 files on empty lines.
split_settings = dict(
    source_file_name='data/bview.20030809.1600.txt',
    number_of_files=100,
    separator='\n',
    output_name='split',
)
file_splitter(**split_settings)


## Step 1

Let's think a bit about how we can make this code more efficient.

Why do we compute the number of entries? Do we need it? What about using the size of the file instead?

Methods:
* `file.seek`
* `file.tell`

## Step 2


Let's make it better:
* Only open the source file once
* Open as binary file

## Step 3


* Fetch new files when there is something available
    * http://data.ris.ripe.net/rrc00/latest-bview.gz
    * ===> http://docs.python-requests.org/en/master/api/#requests.head & Last-Modified

* Use the library to generate text files:
    * https://bitbucket.org/ripencc/bgpdump/downloads/ (Installation details: https://bitbucket.org/ripencc/bgpdump/wiki/Home.wiki#!building)

    ```
    sh ./bootstrap.sh
    make
    ./bgpdump -T
    ```

    ./bgpdump -O ../data/latest-bview.txt  ../data/original/latest-bview.gz

## Step 4 ++


If you're fast and bored:
* Make it a class (with comments)
* Yield pseudo files (`BytesIO`) instead of writing the files on the disk
* Use `argparse` to make the script more flexible

