# Parallel Processing with Dask 

## Dask Bags for Unstructured Data

In [None]:
# Import the Dask bag subpackage as db
import dask.bag as db

# Wrap the in-memory list of reviews in a Dask bag split into 3 partitions
review_bag = db.from_sequence(reviews_list, npartitions=3)

# Pull a single element out of the bag and display it
first_review = review_bag.take(1)
print(first_review)

In [None]:
# Build a bag from every .txt file inside data/tripadvisor_hotel_reviews
review_bag = db.read_text("data/tripadvisor_hotel_reviews/*.txt")

# Set up the count of elements in the bag (evaluated by compute() below)
review_count = review_bag.count()

# Run the computation, then report the resulting number of reviews
total_reviews = review_count.compute()
print(total_reviews)

In [None]:
# Lower-case every review so the search below is case-insensitive
lowercase_reviews = review_bag.str.lower()

# For each review, count the occurrences of the substring 'excellent'
excellent_counts = lowercase_reviews.str.count("excellent")

# Fetch the first 10 per-review counts and display them
first_ten_counts = excellent_counts.take(10)
print(first_ten_counts)

In [None]:
# Import the json package
import json

# Build a bag of raw text from every JSON file inside data/politicians
text_bag = db.read_text("data/politicians/*.json")

# Parse each JSON string in the bag with json.loads
dict_bag = text_bag.map(json.loads)

# Fetch one parsed element and display it as an example
example_record = dict_bag.take(1)
print(example_record)


In [None]:
# Report how many elements dict_bag holds before filtering
total_records = dict_bag.count().compute()
print(total_records)

# Keep only the records accepted by has_birth_date()
# NOTE(review): has_birth_date is not defined in this part of the file —
# confirm it is provided before this cell runs
filtered_bag = dict_bag.filter(has_birth_date)

# Report how many elements remain after filtering
remaining_records = filtered_bag.count().compute()
print(remaining_records)

In [None]:
# `import dask.bag as db` binds only the name `db`; the top-level `dask`
# package has to be imported explicitly before dask.compute() can be called
import dask

# Select the 'birth_date' from each dictionary in the bag
birth_date_bag = filtered_bag.pluck("birth_date")

# Extract the year as an integer from the birth_date strings
# (assumes every string starts with a 4-digit year — confirm against the data)
birth_year_bag = birth_date_bag.map(lambda x: int(x[:4]))

# Set up the min, max and mean birth year reductions
min_year = birth_year_bag.min()
max_year = birth_year_bag.max()
mean_year = birth_year_bag.mean()

# Evaluate all three reductions in a single dask.compute() call and print
# the resulting (min, max, mean) tuple
print(dask.compute(min_year, max_year, mean_year))

### Converting Unstructured Data to a DataFrame

In [None]:
def extract_url(x):
    """Return a copy of record ``x`` with its first link's URL under 'url'.

    The input dictionary is not modified. Functions mapped over a Dask bag
    should not mutate their inputs, because the source records may be
    reused when the task graph is evaluated again.

    Args:
        x: A dictionary with a 'links' entry; assumed to be a non-empty
            sequence of dictionaries that each carry a 'url' key.

    Returns:
        A new dictionary (shallow copy of ``x``) with an added 'url' key
        holding ``x['links'][0]['url']``.

    Raises:
        KeyError: If 'links' is missing, or the first link has no 'url'.
        IndexError: If ``x['links']`` is an empty sequence.
    """
    # Look the URL up first so a malformed record fails before any copying
    first_url = x['links'][0]['url']

    # Shallow copy: nested values are shared, only the top level is new
    updated = dict(x)
    updated['url'] = first_url
    return updated
  
# Apply extract_url to every element of the bag
dict_bag = dict_bag.map(extract_url)

# Fetch one element and display it to check the result
first_record = dict_bag.take(1)
print(first_record)

In [None]:
def select_keys(dictionary, keys_to_keep):
    """Build a new dictionary holding only ``keys_to_keep`` from ``dictionary``.

    Keys are copied in the order given by ``keys_to_keep``; values are the
    same objects as in ``dictionary``. A key that is absent from
    ``dictionary`` raises KeyError.
    """
    return {key: dictionary[key] for key in keys_to_keep}

# The 4 keys every record is reduced to
required_keys = ['gender', 'name', 'birth_date', 'url']

# Apply select_keys to each record so only the required keys remain
filtered_bag = dict_bag.map(select_keys, keys_to_keep=required_keys)

# Turn the restructured bag into a DataFrame
df = filtered_bag.to_dataframe()

# Display the first few rows of the DataFrame
preview = df.head()
print(preview)