## Data preparation - Retrieve the first 10 MB of enwik9

This notebook downloads the file '[***enwik9.zip***](http://mattmahoney.net/dc/enwik9.zip)' to the current directory (if it does not already exist) and unzips it, then reads the first 10 MB of the extracted 1 GB file and writes them to the file '***enwik9.head_10MB***'.

In [1]:
#Required libraries
from zipfile import ZipFile      # for handling zip files
import os                        # for os.path.exists
import filecmp                   # for comparing file contents
import errno                     # for throwing errors
import urllib                    # for downloading remote files 

In [2]:
# Global parameters
SOURCE_URL = 'http://mattmahoney.net/dc/enwik9.zip'  # Source URL of the 'enwik9.zip' archive to download
DICTIONARY_URL = 'https://github.com/byronknoll/cmix/raw/master/dictionary/english.dic'  # URL of an 'english.dic' file; not referenced by the code in this section
dict_file = 'english.dic'                            # Dictionary file name; NOTE(review): presumably the local target for DICTIONARY_URL - confirm
local_zip_file = 'enwik9.zip'                        # Local 'enwik9.zip' file name
enwik9_file = 'enwik9'                               # Local 'enwik9' file after unzip
enwik9_head_file = 'enwik9.head'                     # Prefix of the output file name; data_prep appends '_<size>' (e.g. 'enwik9.head_10MB')
BYTES_TO_READ = 10 * 1024 * 1024                     # Number of bytes (10 MB) to read from the start of enwik9

In [6]:
# Helper functions
def download(remote_url: str, local_file: str):
    """
    Download from remote_url to local_file, unless local_file already exists.

    The data is first written to a temporary '<local_file>.part' file and only
    renamed to local_file once the transfer has finished, so an interrupted
    download is never mistaken for a complete file on the next run.

    Args:
        remote_url: URL to fetch.
        local_file: Path of the file to create.

    Raises:
        urllib.error.URLError: if remote_url cannot be opened.
        urllib.error.ContentTooShortError: if fewer bytes than announced arrive.
        OSError: if the local file cannot be written or renamed.
    """
    # 'import urllib' alone does not load the 'request' submodule, so import it
    # explicitly instead of relying on some other module having done so.
    import urllib.request

    if os.path.exists(local_file):
        print(f'File \'{local_file}\' already existed.')
        return

    print("Downloading %s..." % local_file)
    partial_file = f'{local_file}.part'
    try:
        urllib.request.urlretrieve(remote_url, partial_file)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_file, local_file)
    finally:
        # After a successful rename nothing is left to delete; after a failure
        # (including Ctrl-C) this discards the incomplete data.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print('Done!')

def unzip(zip_file: str):
    """
    Extract every member of zip_file into the current working directory,
    overwriting any files that already exist there.

    The archive's table of contents is printed before extraction starts.
    """
    print(f"Start unzip file '{zip_file}'...")
    with ZipFile(zip_file, mode='r') as archive:
        # Show what the archive contains before touching the disk.
        archive.printdir()

        print('Extracting all the files now...')
        # No target path is given, so members land in the current directory.
        archive.extractall()
        print('Done!')

def split_first_n_bytes(file_name: str, n: int, output_file:str):
    """
    Copy the first n bytes of file_name into output_file (overwriting any existing file).

    If file_name holds fewer than n bytes, everything it contains is copied.

    Args:
        file_name: Path of the file to read from (opened in binary mode).
        n: Number of bytes to take from the start of file_name.
        output_file: Path of the file to write the bytes to.
    """
    converted_size = convert_bytes_display(n)
    print(f'Start spliting the first {n} byte(s) ({converted_size}) of \'{file_name}\' into \'{output_file}\'...')
    with open(file_name, 'rb') as fr:
        first_n = fr.read(n)
        # The output is opened only after the read succeeded, so a failed read
        # never creates or truncates output_file.
        with open(output_file, 'wb') as fw:
            fw.write(first_n)
    # Both handles are closed by their 'with' blocks; no explicit close() is needed.
    print('Done splitting file!')

def convert_bytes_display(bytes_input:int):
    """
    Format a byte count as a short size label in KB or MB.

    Rules, checked in this order:
      * an exact multiple of 1 MB (including 0) -> whole megabytes, e.g. '10MB'
      * at least 1 MB                          -> two decimals,    e.g. '1.50MB'
      * an exact multiple of 1 KB              -> whole kilobytes, e.g. '2KB'
      * anything else                          -> two decimals,    e.g. '1.50KB'
    """
    one_kb = 1024
    one_mb = 1024 * 1024

    if bytes_input % one_mb == 0:
        return f'{int(bytes_input / 1024 / 1024)}MB'
    if one_mb <= bytes_input:
        return f'{bytes_input / 1024 / 1024:.2f}MB'
    if bytes_input % one_kb == 0:
        return f'{int(bytes_input / 1024)}KB'
    return f'{bytes_input / 1024:.2f}KB'

def data_prep(bytes_to_read:int):
    """
    Prepare the head-of-enwik9 file used for training and benchmarking the
    compression algorithm.

    Steps:
      1. If 'enwik9' is not in the current directory, download 'enwik9.zip'
         (unless it is already present) and extract it.
      2. Write the first bytes_to_read bytes of 'enwik9' to the output file.

    The output file is named '{enwik9_head_file}_{size}', where {size} is
    bytes_to_read as formatted by convert_bytes_display (for example
    'enwik9.head_10MB').

    Args:
        bytes_to_read: Number of bytes to copy from the start of 'enwik9'.

    Raises:
        FileNotFoundError: if 'enwik9' is still missing after the download and
            extraction step, instead of silently producing no output.
    """
    if os.path.exists(enwik9_file):
        print(f'Original file \'{enwik9_file}\' already existed!')
    else:
        if not os.path.exists(local_zip_file):
            download(SOURCE_URL, local_zip_file)
            if os.path.exists(local_zip_file):
                print('Download completed!')
        unzip(local_zip_file)

    # Extraction can finish without yielding the expected member (e.g. a wrong
    # archive); fail loudly here rather than letting the next step break.
    if not os.path.exists(enwik9_file):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), enwik9_file)

    converted_size = convert_bytes_display(bytes_to_read)
    output_file = f'{enwik9_head_file}_{converted_size}'
    split_first_n_bytes(file_name=enwik9_file, n=bytes_to_read, output_file=output_file)

    # Report the size actually written, which is smaller than requested when
    # the source file is shorter than bytes_to_read.
    output_file_size = convert_bytes_display(os.path.getsize(output_file))
    print(f'File \'{output_file}\' ({output_file_size}) is ready to be processed in next step!')
        

Call the *data_prep()* function to generate the file containing the first 10 MB of enwik9 (the original file ***enwik9.zip*** comes from the website http://mattmahoney.net linked above).

In [5]:
# Run the preparation pipeline for the first BYTES_TO_READ bytes of enwik9.
data_prep(BYTES_TO_READ)

Downloading enwik9.zip...
Done!
Download completed!
Start unzip file 'enwik9.zip'...
File Name                                             Modified             Size
enwik9                                         2011-06-01 11:29:40   1000000000
Extracting all the files now...
Done!
Start spliting the first 10485760 byte(s) (10MB) of 'enwik9' into 'enwik9.head_10MB'...
Done splitting file!
File 'enwik9.head_10MB' (10MB) is ready to be processed in next step!
