## Creating Dask Bags

### `read_text` function

In [1]:
import dask.bag as db

### You can specify a glob pattern to load multiple files

In [2]:
# The '*' in the path is a glob wildcard, so every matching gdp_part_*.csv file is loaded.
user_data = db.read_text('multiple_csvs/gdp_part_*.csv')

In [4]:
# Partition count of the bag built from the glob pattern.
user_data.npartitions

10

In [3]:
# Peek at the first 10 elements: each one is a raw line of text, CSV header row included.
user_data.take(10)

(',country,gdp,year\n',
 '0,India,2650725335363.83,2017\n',
 '1,India,2290432075123.55,2016\n',
 '2,India,2103587813812.2,2015\n',
 '3,India,2039127446299.3,2014\n',
 '4,India,1856722121394.42,2013\n',
 '5,India,1827637859136.23,2012\n',
 '6,India,1823049927772.05,2011\n',
 '7,India,1675615312693.42,2010\n',
 '8,India,1341886699393.18,2009\n')

### You can specify a function to map over each line while reading data

##### Random JSON generated from: https://www.json-generator.com/

In [5]:
import json
# Read random.json as text, then parse every element of the bag with json.loads.
user_data = (
    db.read_text('random.json')
      .map(json.loads)
)

In [6]:
# Show the first element of the bag; the output below is a list of record dicts.
user_data.take(1)

([{'_id': '5cdb2669d8c733eaa48b5ed5',
   'index': 0,
   'guid': '627a29dd-69de-4e5b-9ae1-205d6a82432b',
   'isActive': False,
   'balance': '$3,330.63',
   'picture': 'http://placehold.it/32x32',
   'age': 34,
   'eyeColor': 'brown',
   'name': 'Padilla Lee',
   'gender': 'male',
   'company': 'ENTROFLEX',
   'email': 'padillalee@entroflex.com',
   'phone': '+1 (827) 527-2949',
   'address': '951 Reeve Place, Wacissa, District Of Columbia, 2894',
   'about': 'Non nulla fugiat id eu qui ullamco ipsum dolor proident adipisicing ut duis nulla id. Velit ullamco et reprehenderit velit fugiat sunt exercitation pariatur voluptate labore. Proident mollit qui anim dolore pariatur laborum laboris occaecat excepteur tempor labore. Exercitation sint eiusmod ad aliqua sit est laborum enim labore aute consequat cillum incididunt fugiat. Laborum fugiat mollit consectetur commodo occaecat qui magna ex aliqua sunt commodo. Velit irure duis enim ut cupidatat elit cupidatat eiusmod duis nisi in occaecat 

### You can even load from compressed files directly

In [7]:
# compression='xz' lets the matching *.xz files be read directly;
# every element is then parsed with json.loads.
user_data = db.read_text('*.xz', compression='xz').map(json.loads)

In [8]:
# Show the first element read from the compressed input.
user_data.take(1)

([{'_id': '5cdb2669d8c733eaa48b5ed5',
   'index': 0,
   'guid': '627a29dd-69de-4e5b-9ae1-205d6a82432b',
   'isActive': False,
   'balance': '$3,330.63',
   'picture': 'http://placehold.it/32x32',
   'age': 34,
   'eyeColor': 'brown',
   'name': 'Padilla Lee',
   'gender': 'male',
   'company': 'ENTROFLEX',
   'email': 'padillalee@entroflex.com',
   'phone': '+1 (827) 527-2949',
   'address': '951 Reeve Place, Wacissa, District Of Columbia, 2894',
   'about': 'Non nulla fugiat id eu qui ullamco ipsum dolor proident adipisicing ut duis nulla id. Velit ullamco et reprehenderit velit fugiat sunt exercitation pariatur voluptate labore. Proident mollit qui anim dolore pariatur laborum laboris occaecat excepteur tempor labore. Exercitation sint eiusmod ad aliqua sit est laborum enim labore aute consequat cillum incididunt fugiat. Laborum fugiat mollit consectetur commodo occaecat qui magna ex aliqua sunt commodo. Velit irure duis enim ut cupidatat elit cupidatat eiusmod duis nisi in occaecat 

### Or even pass a list of files

In [9]:
import os

# Name the ten part files (0-9), then prefix each with its directory.
file_list = ['gdp_part_{}.csv'.format(part) for part in range(10)]
complete_path = [os.path.join('multiple_csvs', name) for name in file_list]

print("File list is {}".format(complete_path))

# read_text is given an explicit list of file paths here instead of a glob pattern.
user_db = db.read_text(complete_path)
# Peek at the first 10 lines of the resulting bag.
user_db.take(10)

File list is ['multiple_csvs/gdp_part_0.csv', 'multiple_csvs/gdp_part_1.csv', 'multiple_csvs/gdp_part_2.csv', 'multiple_csvs/gdp_part_3.csv', 'multiple_csvs/gdp_part_4.csv', 'multiple_csvs/gdp_part_5.csv', 'multiple_csvs/gdp_part_6.csv', 'multiple_csvs/gdp_part_7.csv', 'multiple_csvs/gdp_part_8.csv', 'multiple_csvs/gdp_part_9.csv']


(',country,gdp,year\n',
 '0,India,2650725335363.83,2017\n',
 '1,India,2290432075123.55,2016\n',
 '2,India,2103587813812.2,2015\n',
 '3,India,2039127446299.3,2014\n',
 '4,India,1856722121394.42,2013\n',
 '5,India,1827637859136.23,2012\n',
 '6,India,1823049927772.05,2011\n',
 '7,India,1675615312693.42,2010\n',
 '8,India,1341886699393.18,2009\n')

## Creating Bags from a sequence

In [10]:
# Sample first names used to build a bag from an in-memory sequence.
first_names = [
    'John', 'Jimmy', 'Bob', 'Alice', 'Rebbecca', 'Rahul',
    'Robin', 'Eva', 'Monika', 'Raman', 'Jay', 'Chris',
    'Lee', 'Bruce', 'Alan', 'Arthur', 'Peter', 'Johnny',
    'Sasha', 'Meera', 'Lily', 'Aaron',
]


In [11]:
# Build a bag from the in-memory list, requesting 3 partitions.
seq_bag = db.from_sequence(first_names, npartitions=3)

In [12]:
# Reuse seq_bag (built from first_names with npartitions=3 in the previous cell)
# instead of constructing an identical bag again, and append the placeholder
# surname to every first name.
second_name_bag = seq_bag.map(lambda first_name: first_name + ' Generic Second Name')

In [13]:
# Show the first two mapped names.
second_name_bag.take(2)

('John Generic Second Name', 'Jimmy Generic Second Name')

## Creating Dask Bags from URL

### Reference: https://www.gutenberg.org/

In [14]:
# Build a bag from the text served at a single URL.
pride_prejudice_bag = db.from_url('https://www.gutenberg.org/files/1342/1342-0.txt')

In [15]:
# First 20 elements: each is a raw bytes line (b'...'), not a decoded str,
# and the very first one begins with the UTF-8 BOM bytes (\xef\xbb\xbf).
pride_prejudice_bag.take(20)

(b'\xef\xbb\xbfThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\r\n',
 b'\r\n',
 b'This eBook is for the use of anyone anywhere at no cost and with\r\n',
 b'almost no restrictions whatsoever.  You may copy it, give it away or\r\n',
 b're-use it under the terms of the Project Gutenberg License included\r\n',
 b'with this eBook or online at www.gutenberg.org\r\n',
 b'\r\n',
 b'\r\n',
 b'Title: Pride and Prejudice\r\n',
 b'\r\n',
 b'Author: Jane Austen\r\n',
 b'\r\n',
 b'Posting Date: August 26, 2008 [EBook #1342]\r\n',
 b'Release Date: June, 1998\r\n',
 b'Last Updated: March 10, 2018\r\n',
 b'\r\n',
 b'Language: English\r\n',
 b'\r\n',
 b'Character set encoding: UTF-8\r\n',
 b'\r\n')

#### Other methods include `read_avro` and `from_delayed`. See https://docs.dask.org/en/latest/bag-creation.html for more info.

## Storing Dask Bags

### `to_textfiles` method

In [16]:
# The four URLs differ only in a numeric id, which appears twice in each path.
books_url = ['https://www.gutenberg.org/files/{0}/{0}-0.txt'.format(book_id)
             for book_id in (84, 11, 98, 2600)]
# from_url is given a list of URLs here rather than a single URL string.
books_data = db.from_url(books_url)

In [17]:
# Partition count of the bag built from the four URLs.
books_data.npartitions

4

In [18]:
# Write the bag out as text files. The '*' in the path is filled in per output
# file (0..3 in the result below) and the call returns the list of paths.
books_data.to_textfiles('user_data/processed_*.txt')

['user_data/processed_0.txt',
 'user_data/processed_1.txt',
 'user_data/processed_2.txt',
 'user_data/processed_3.txt']

### You can specify your custom names as well

In [19]:
def file_name_func(x):
    """Return the label ``'part_<x>'`` for the given value ``x``.

    Passed as ``name_function`` to ``to_textfiles`` in this notebook, where
    the returned string takes the place of the ``*`` in the output path.
    """
    return 'part_{}'.format(x)

In [20]:
# name_function supplies the string substituted for '*' in each output path,
# giving processed_part_0.txt, processed_part_1.txt, ... in the result below.
books_data.to_textfiles(
    'user_data/processed_*.txt',
    name_function=file_name_func,
)

['user_data/processed_part_0.txt',
 'user_data/processed_part_1.txt',
 'user_data/processed_part_2.txt',
 'user_data/processed_part_3.txt']