# Starting point: end of day 1

In [None]:
import logging
import re

def entries_counter(source_file):
    """Count the empty lines of a text file, each one closing a block.

    The whole file is read into memory and scanned with the multi-line
    pattern ``^$``.

    Args:
        source_file: Path of the text file to inspect.

    Returns:
        The number of matches of ``^$`` in the file content.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(source_file, 'r') as handle:
        content = handle.read()
    nb_blocks = len(re.findall(r'^$', content, flags=re.MULTILINE))
    logging.debug(f'{nb_blocks} blocks in the file "{source_file}".')
    return nb_blocks

def file_splitter(source_file_name, number_of_files, separator='\n', output_name='split'):
    """Split a text file into several files, cutting only after separator lines.

    The blocks are counted first with ``entries_counter``. The source is then
    read line by line; a part is written as soon as it has seen more than
    ``round(total_blocks / number_of_files)`` separator lines. Whatever is
    pending once the source is exhausted is written to one last part (this
    last write always happens, even when nothing is pending).

    Args:
        source_file_name: Path of the text file to split.
        number_of_files: Requested number of parts; its number of digits is
            also the zero-padding width of the numeric suffix.
        separator: Exact content of a line that closes a block
            (``'\\n'``, i.e. an empty line, by default).
        output_name: Prefix of the parts, which are named
            ``<output_name>_<number>.txt`` with numbers starting at 1.
    """
    logging.info(f'Start to split "{source_file_name}" in {number_of_files} files.')
    total_blocs_in_file = entries_counter(source_file=source_file_name)
    max_blocs_in_file = round(total_blocs_in_file/float(number_of_files), 0)
    logging.debug(f'{max_blocs_in_file} blocks per file.')

    padding_length = len(str(number_of_files))

    def write_part(lines, number):
        # Single place where the part name is built, logged and written.
        part_name = f'{output_name}_{number:0{padding_length}}.txt'
        logging.debug(f'Writing {part_name}.')
        with open(part_name, 'w') as new_file:
            new_file.write(''.join(lines))

    file_number = 1
    blocs_in_file = 0
    # Lines are collected in a list and joined once per part, rather than
    # growing a string with += for every line of the source.
    pending_lines = []

    with open(source_file_name, 'r') as original_file:
        for line in original_file:
            pending_lines.append(line)
            if line == separator:
                blocs_in_file += 1
            if blocs_in_file > max_blocs_in_file:
                # Limit reached: flush the pending lines and start a new part.
                write_part(pending_lines, file_number)
                file_number += 1
                blocs_in_file = 0
                pending_lines = []
        # EOF reached: write everything still pending.
        write_part(pending_lines, file_number)
    logging.info(f'Done with "{source_file_name}".')

In [None]:
import logging

# Show DEBUG-level messages so the per-file logs of file_splitter are visible.
logging.basicConfig(level=logging.DEBUG)

# Split the sample dump, asking for 100 parts named split_<number>.txt.
file_splitter(source_file_name='data/bview.20030809.1600.txt', number_of_files=100, separator='\n', output_name='split')

# Validator

In [17]:
import glob
import hashlib

# Hash of the original file.
with open('data/bview.20030809.1600.txt', 'rb') as f:
    hash_source = hashlib.sha256(f.read()).hexdigest()
print(hash_source)

# Hash of the parts, fed to the digest in the order they were written.
# Sorting by (length, name) gives numeric order for zero-padded names
# (split_001.txt) and for un-padded ones (split_2.txt before split_10.txt);
# a plain lexicographic sort would misplace the latter.
hash_dest = hashlib.sha256()
for out_file in sorted(glob.glob('split_*.txt'), key=lambda name: (len(name), name)):
    with open(out_file, 'rb') as f:
        hash_dest.update(f.read())
dest = hash_dest.hexdigest()
print(dest)

2fb572b7afc2bdebc639e04db86b437236107ecb7d2442d9feae593dedb5ebca
2fb572b7afc2bdebc639e04db86b437236107ecb7d2442d9feae593dedb5ebca


# Profiling

Install dependencies

In [None]:
%%bash

pip install line-profiler memory_profiler

In [3]:
%load_ext memory_profiler
%load_ext line_profiler

## CPU

In [None]:
%time file_splitter(source_file_name='data/bview.20030809.1600.txt', number_of_files=10, separator='\n', output_name='split')

In [None]:
%%timeit

file_splitter(source_file_name='data/bview.20030809.1600.txt', number_of_files=10, separator='\n', output_name='split')

In [None]:
%prun file_splitter(source_file_name='data/bview.20030809.1600.txt', number_of_files=10, separator='\n', output_name='split')

In [None]:
%prun file_splitter(source_file_name='data/bview.20180216.0800.txt', number_of_files=10, separator='\n', output_name='split')

## Memory

In [16]:
%%memit 

file_splitter(source_file_name='data/bview.20030809.1600.txt', number_of_files=10, separator='\n', output_name='split')

peak memory: 143.39 MiB, increment: 96.15 MiB



## Step 1

Let's think about how we can make this code more efficient. To do that, let's revisit our goal: writing N files of roughly the same length.

* How can we do that without counting the number of blocks?
* Is there a way to move in a file without reading every line?

### TODO:

* Open the file as bytes: https://docs.python.org/3/library/functions.html#open
* Get the size of the file: https://docs.python.org/3/library/os.path.html#os.path.getsize
* Use `seek` to move in the file, `tell` to figure out where you are, and `read` to copy file chunks (https://docs.python.org/3/tutorial/inputoutput.html?highlight=tell%20file#methods-of-file-objects)

In [15]:
import logging
from pathlib import Path
import os

def file_splitter(source_file_name, number_of_files, separator=b'\n', output_name='split'):
    """Split a file into parts of roughly equal size, cutting after separator lines.

    Instead of counting blocks, the function jumps ``chunk_size`` bytes ahead
    in the source, reads on to the next line equal to ``separator`` and copies
    everything between the previous cut and that point to a new part.

    Args:
        source_file_name: Path of the file to split.
        number_of_files: Requested number of parts; the file size divided by
            this value gives the jump length.
        separator: Exact content of a line that closes a block. ``str``
            values are encoded to bytes.
        output_name: Prefix of the parts, which are named
            ``<output_name>_<number>.txt`` with numbers starting at 0.
    """
    source_file = Path(source_file_name)
    if not isinstance(separator, bytes):
        separator = separator.encode()
    file_size = os.path.getsize(source_file)
    logging.debug('File size: {}.'.format(file_size))
    chunk_size = round(file_size / number_of_files)
    logging.debug('Chunk size per file: {}.'.format(chunk_size))

    with open(source_file, 'rb') as f:  # Required to open the file as bytes for seek
        file_number = 0
        current_position = 0
        while True:
            precedent_position = current_position
            # Jump of "size" from the current place in the file
            f.seek(chunk_size, os.SEEK_CUR)
            landed_mid_line = False
            if chunk_size > 0:
                # Look at the byte just before the landing point: unless it is
                # a newline (or we are past EOF) the jump ended inside a line.
                f.seek(-1, os.SEEK_CUR)
                landed_mid_line = f.read(1) not in (b'', b'\n')
            if landed_mid_line:
                # Finish the partial line. Its tail may be a bare newline,
                # which must not be mistaken for an empty separator line.
                f.readline()
            s = f.readline()
            while s and s != separator:
                # find the next separator
                s = f.readline()
            # Get the current place
            current_position = f.tell()
            # Copy everything between precedent_position and current_position.
            # The positions are byte offsets, so the copy must be done in
            # binary mode: in text mode read() counts characters.
            with open(source_file, 'rb') as temp:
                temp.seek(precedent_position)
                copy = temp.read(current_position - precedent_position)
            output_file_name = '{}_{}.txt'.format(output_name, file_number)
            logging.debug('Opening {}.'.format(output_file_name))
            with open(output_file_name, 'wb') as new_f:
                new_f.write(copy)
            file_number += 1
            if not s:
                # readline() returned nothing: end of the source reached.
                break

## Step 2


Let's make it even better.

### TODO

* Only open the source file once
* Use `pathlib.Path`: https://docs.python.org/3/library/pathlib.html#concrete-paths
* Get the size of the file with `stat`: https://docs.python.org/3/library/pathlib.html#pathlib.Path.stat

In [10]:
import logging
from pathlib import Path
import os

def file_splitter(source_file_name, number_of_files, separator=b'\n', output_name='split'):
    """Split a file into parts of roughly equal size, opening the source only once.

    Each part is made of ``chunk_size`` bytes read in one go, followed by the
    lines up to and including the next line equal to ``separator`` (or up to
    the end of the source).

    Args:
        source_file_name: Path of the file to split.
        number_of_files: Requested number of parts; the file size divided by
            this value gives the size of the bulk read.
        separator: Exact content of a line that closes a block. ``str``
            values are encoded to bytes.
        output_name: Prefix of the parts, which are named
            ``<output_name>_<number>.txt`` with numbers starting at 0.
    """
    source_file = Path(source_file_name)
    if not isinstance(separator, bytes):
        separator = separator.encode()
    file_size = source_file.stat().st_size
    logging.debug('File size: {}.'.format(file_size))
    chunk_size = round(file_size / number_of_files)
    logging.debug('Chunk size per file: {}.'.format(chunk_size))

    with open(source_file, 'rb') as f:  # Bytes in, bytes out: no decoding needed
        file_number = 0
        while True:
            # Bulk read of "size" bytes from the current place in the file
            chunk = f.read(chunk_size)
            pieces = [chunk]
            # If the bulk read stopped inside a line, the next readline() only
            # returns the tail of that line. A tail made of a bare newline is
            # not an empty separator line and must not end the part.
            finishing_partial_line = bool(chunk) and not chunk.endswith(b'\n')
            while True:
                rest_of_line = f.readline()
                pieces.append(rest_of_line)
                if not rest_of_line:
                    break
                if finishing_partial_line:
                    finishing_partial_line = False
                    continue
                if rest_of_line == separator:
                    break
            output_file_name = '{}_{}.txt'.format(output_name, file_number)
            logging.debug('Opening {}.'.format(output_file_name))
            with open(output_file_name, 'wb') as new_f:
                # Join once instead of growing a bytes object line by line.
                new_f.write(b''.join(pieces))
            file_number += 1
            if not rest_of_line:
                # readline() returned nothing: end of the source reached.
                break

## Step 3


* Fetch new files when there is something available
    * http://data.ris.ripe.net/rrc00/latest-bview.gz
    * ===> http://docs.python-requests.org/en/master/api/#requests.head & Last-Modified

* Use the library to generate text files:
    * https://bitbucket.org/ripencc/bgpdump/downloads/ (Installation details: https://bitbucket.org/ripencc/bgpdump/wiki/Home.wiki#!building)

    ```
    sh ./bootstrap.sh
    make
    ./bgpdump -T
    ```

    ./bgpdump -O ../data/latest-bview.txt  ../data/original/latest-bview.gz

## Step 4 ++


If you're fast and bored:
* Make it a class (with comments)
* Yield pseudo files (`BytesIO`) instead of writing the files on the disk
* Use `argparse` to make the script more flexible

