# Using PipelineDP with Apache Spark on AWS Glue

This notebook can be used in AWS Glue to run a job.

To create the job, follow the steps [here](https://catalog.us-east-1.prod.workshops.aws/workshops/ee59d21b-4cb8-4b3d-a629-24537cf37bb5/en-US/lab3/etl-job), but upload this notebook instead of the one mentioned there.

Make sure to also create an IAM role and add the correct permissions. See how to do this [here](https://docs.aws.amazon.com/glue/latest/ug/notebook-getting-started.html#create-notebook-permissions).

Finally, the input file used in this notebook is read from S3, so remember to upload the files you want to process to S3 as well.

In [None]:
%glue_version 3.0

In [None]:
%additional_python_modules pipeline_dp

In [None]:
# Emit a visible marker in the cell output (presumably to confirm that the
# interactive session is up and executing code — confirm intent).
print("\nSTARTED")

In [None]:
import pipeline_dp

In [None]:
from dataclasses import dataclass

@dataclass
class MovieView:
    """One rating of a single movie by a single user.

    Instances are produced by `parse_line` from input lines of the form
    "user_id,rating,date"; the movie id comes from the preceding
    "movie_id:" header line.
    """
    # Identifier of the user who gave the rating.
    user_id: int
    # Identifier of the rated movie.
    movie_id: int
    # Integer rating value as parsed from the input line.
    rating: int


def parse_line(line, movie_id):
    """Build a MovieView from one data line.

    Args:
        line: text in the format "user_id,rating,date"; only the first two
            comma-separated fields are read.
        movie_id: id of the movie this rating belongs to.

    Returns:
        A MovieView holding the parsed user id, the given movie id and the
        parsed rating.
    """
    fields = line.split(',')
    return MovieView(user_id=int(fields[0]),
                     movie_id=movie_id,
                     rating=int(fields[1]))

def parse_partition(iterator):
    """Parse the text lines of one partition into MovieView records.

    The input interleaves two kinds of lines:
      * "movie_id:"            - header that sets the current movie
      * "user_id,rating,date"  - a rating for the current movie

    Blank (empty or whitespace-only) lines are skipped, so a stray empty
    line in the input does not abort the whole partition.

    Args:
        iterator: iterable of text lines.

    Yields:
        One MovieView per rating line.

    NOTE: if rating lines appear before any "movie_id:" header in this
    iterator (e.g. a partition that starts in the middle of a movie's
    ratings), they are yielded with movie_id=None.
    """
    movie_id = None
    for raw_line in iterator:
        line = raw_line.strip()
        if not line:
            # Nothing to parse; indexing the last character of an empty
            # line would raise IndexError.
            continue
        if line.endswith(':'):
            # 'line' has the format "movie_id:"
            movie_id = int(line[:-1])
        else:
            # 'line' has the format "user_id,rating,date"
            yield parse_line(line, movie_id)

In [None]:
# S3 URI of the input text file that is read below.
# REPLACE IT WITH YOURS.

BUCKET_FILE = "s3://bucketpipelinedp/sample_combined_data_1.txt"

In [None]:
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the already-running Spark context if there is one, otherwise start it.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
job = Job(glueContext)

# Read the input as an RDD of text lines and turn each partition's lines
# into MovieView records.
movie_views = sc.textFile(BUCKET_FILE).mapPartitions(parse_partition)

# SparkRDDBackend works on the SparkContext itself (it calls SparkContext
# methods on the object it is given), so pass `sc` rather than the
# GlueContext wrapper.
backend = pipeline_dp.SparkRDDBackend(sc)

In [None]:
import pipeline_dp
from pipeline_dp.private_spark import make_private


# Define the privacy budget available for our computation.
budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-6)

# Create a DPEngine instance.
dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

# Wrap Spark's RDD into its private version, using user_id as the privacy id.
# NOTE(review): private_movie_views is not referenced again in the visible
# part of this notebook; the aggregation below runs on the raw movie_views
# RDD through dp_engine. Confirm whether this wrapper is still needed.
private_movie_views = \
   make_private(movie_views, budget_accountant, lambda mv: mv.user_id)

params = pipeline_dp.AggregateParams(
   noise_kind=pipeline_dp.NoiseKind.LAPLACE,
   metrics=[
      pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM,
      pipeline_dp.Metrics.MEAN, pipeline_dp.Metrics.VARIANCE
   ],
   # Limits to how much one user can contribute:
   # .. at most two movies rated per user
   max_partitions_contributed=2,
   # .. at most one rating for each movie
   max_contributions_per_partition=1,
   # .. with minimal rating of "1"
   min_value=1,
   # .. and maximum rating of "5"
   max_value=5,
   # NOTE(review): this flag declares that the input already satisfies the
   # contribution limits above, and no privacy-id extractor is supplied
   # below. Presumably the engine then does not bound contributions itself,
   # so the DP guarantee relies on the input really respecting these limits
   # - verify this holds for your data.
   contribution_bounds_already_enforced=True)

# Specify how to extract the partition key and the value from an
# element of movie view collection (no privacy-id extractor is given here).
data_extractors = pipeline_dp.DataExtractors(
   # The aggregation key: we're grouping by movies
   partition_extractor=lambda mv: mv.movie_id,
   # The value we're aggregating: we're summing up ratings
   value_extractor=lambda mv: mv.rating)

# Run aggregation.
dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

# Called after the aggregation has been declared and before the result is
# collected in the next cell.
budget_accountant.compute_budgets()

In [None]:
# Materialize the aggregation output via collect() (dp_result is presumably
# a Spark RDD at this point - confirm), then show it as the cell output.
dp_result = dp_result.collect()
dp_result