# Data Retrieval

<font color='green'>This notebook downloads the SNLI dataset archive if it has not already been downloaded, and then extracts the training, validation and test files (in text format) from it if they have not already been extracted.</font>

In [1]:
import os
import shutil
import zipfile

from six.moves.urllib.request import urlretrieve

In [2]:
# Folder (relative to the notebook) that holds the archive and the extracted files.
dataset_dir = "dataset/"

# Name under which the SNLI archive is stored inside `dataset_dir`.
snli_zip_file = "snli_1.0.zip"

# One text file per split; the validation split is named "dev" in the file name.
snli_train_file, snli_test_file, snli_val_file = (
    "snli_1.0_{}.txt".format(split) for split in ("train", "test", "dev")
)

In [3]:
# Create dataset folder (use the configured path rather than a second hard-coded copy)
if not os.path.isdir(dataset_dir):
    os.makedirs(dataset_dir)

# Retrieve SNLI dataset - 100 MB
snli_zip_path = os.path.join(dataset_dir, snli_zip_file)
if not os.path.exists(snli_zip_path):
    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer is never mistaken for a complete archive on re-run.
    snli_zip_partial = snli_zip_path + ".part"
    try:
        urlretrieve("https://nlp.stanford.edu/projects/snli/snli_1.0.zip", snli_zip_partial)
        os.rename(snli_zip_partial, snli_zip_path)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only cleans up the leftovers of a failed or interrupted download.
        if os.path.exists(snli_zip_partial):
            os.remove(snli_zip_partial)

In [4]:
def unzip_files(zip_file, files_to_unzip, output_dir=None):
    '''
        Unzip train, test and validation txt files if not already done.

        Parameters
        ----------
        zip_file : str
            Path of the zip archive to read from.
        files_to_unzip : iterable of str
            File names to extract. A name matches an archive member when it
            equals the member's full path or its final path component.
        output_dir : str, optional
            Folder the files are written to. Defaults to the notebook-level
            ``dataset_dir``.

        Raises
        ------
        KeyError
            If a requested file is not present in the archive. No output file
            is created in that case.
    '''
    if output_dir is None:
        output_dir = dataset_dir

    targets = [(name, os.path.join(output_dir, name)) for name in files_to_unzip]
    if all(os.path.exists(path) for _, path in targets):
        # Everything is already extracted; the archive is not even opened.
        return

    # Open the archive once instead of once per requested file.
    with zipfile.ZipFile(zip_file) as zipped:
        members = zipped.infolist()
        for name, target in targets:
            if os.path.exists(target):
                continue

            # Exact match on the full path or the base name. A substring test
            # would also accept unrelated members such as "__MACOSX/._<name>".
            member = next(
                (info for info in members
                 if info.filename == name
                 or info.filename.rsplit("/", 1)[-1] == name),
                None,
            )
            if member is None:
                raise KeyError(
                    "{!r} not found in archive {!r}".format(name, zip_file)
                )

            # Stream into a temporary file and rename on success, so a failed
            # extraction never leaves a file that looks "already unzipped".
            partial = target + ".part"
            try:
                with zipped.open(member) as src, open(partial, 'wb') as dst:
                    shutil.copyfileobj(src, dst)
                os.rename(partial, target)
            finally:
                if os.path.exists(partial):
                    os.remove(partial)
                                
# The three split files to pull out of the downloaded archive.
files_to_unzip = [snli_train_file, snli_test_file, snli_val_file]

# The archive sits in the dataset folder under its configured name.
unzip_files(
    zip_file=dataset_dir + snli_zip_file,
    files_to_unzip=files_to_unzip,
)