In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1721028815278_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Build the Hive-enabled session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Marketing Campaign Data Analysis Using PySpark")
    .enableHiveSupport()
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# HDFS locations of the three input JSON datasets
hdfs_path1, hdfs_path2, hdfs_path3 = (
    '/tmp/datasets/ad_campaigns_data.json',   # ad campaign events
    '/tmp/datasets/user_profile_data.json',   # user profiles
    '/tmp/datasets/store_data.json',          # store data
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Load the raw ad-campaign events. The reader runs in multiline mode, so a
# single JSON record may span several lines of the file.
ad_campaigns = (
    spark.read.format("json")
    .option("multiline", "true")
    .load(hdfs_path1)
)
ad_campaigns.printSchema()

# event_time is inferred as a string; convert it to a timestamp so that
# date and hour parts can be extracted from it.
ad_campaigns = ad_campaigns.withColumn("event_time", to_timestamp(ad_campaigns["event_time"]))

ad_campaigns.printSchema()
ad_campaigns.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- campaign_country: string (nullable = true)
 |-- campaign_id: string (nullable = true)
 |-- campaign_name: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- os_type: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- user_id: string (nullable = true)

root
 |-- campaign_country: string (nullable = true)
 |-- campaign_id: string (nullable = true)
 |-- campaign_name: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- os_type: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- user_id: string (nullable = true)

+----------------+-----------+--------------------+-----------+-------------------+----------+-------+---------+-------------------+
|campaign_country|campaign_id|       campaign_name|device_type|         event_time|event_t

In [5]:
# Load the user profiles (multiline JSON) and print the inferred schema.
user_profile = (
    spark.read.format("json")
    .option("multiline", "true")
    .load(hdfs_path2)
)
user_profile.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- age_group: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- country: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- user_id: string (nullable = true)

In [6]:
# Load the store data (multiline JSON); the trailing expression displays
# the inferred schema as the cell output.
store_data = (
    spark.read
    .format("json")
    .option("multiline", "true")
    .load(hdfs_path3)
)
store_data.schema

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('place_ids', ArrayType(StringType(), True), True), StructField('store_name', StringType(), True)])

# Q1. Analyse data for each campaign_id, date, hour, os_type & value to get all the events with counts

In [7]:

# Derive the calendar date and the hour of day of each event from event_time.
ad_campaigns = (
    ad_campaigns
    .withColumn("date", to_date(col("event_time")))
    .withColumn("hour", hour(col("event_time")))
)
ad_campaigns.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+--------------------+-----------+-------------------+----------+-------+---------+-------------------+----------+----+
|campaign_country|campaign_id|       campaign_name|device_type|         event_time|event_type|os_type| place_id|            user_id|      date|hour|
+----------------+-----------+--------------------+-----------+-------------------+----------+-------+---------+-------------------+----------+----+
|             USA|    ABCDFAE|Food category tar...|      apple|2018-10-12 13:10:05|impression|    ios|CASSBB-11|1264374214654454321|2018-10-12|  13|
|             USA|    ABCDFAE|Food category tar...|   MOTOROLA|2018-10-12 13:09:04|impression|android|CADGBD-13|1674374214654454321|2018-10-12|  13|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12 13:10:10|  video ad|android|BADGBA-12|   5747421465445443|2018-10-12|  13|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12 13:10:12|     click|android|CASS

In [8]:
# Q1: event counts per campaign_id / date / hour / os_type.
# Each output row carries an `event` struct holding the number of
# impression, click and "video ad" events for that group.
Q1_result = (
    ad_campaigns
    .groupBy("campaign_id", "date", "hour", "os_type")
    # The pivot values are listed explicitly so the three columns always
    # exist, even when the data contains no event of a given type; letting
    # Spark infer them would make the select below fail on a missing column
    # and costs an extra pass to collect the distinct values.
    .pivot("event_type", ["impression", "click", "video ad"])
    .agg(count("event_type"))
    # Group/event-type combinations with no rows come back as null.
    .fillna(0)
    .select(
        "campaign_id", "date", "hour", "os_type",
        struct(col("impression"), col("click"), col("video ad")).alias("event"),
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Preview the Q1 event counts per campaign_id / date / hour / os_type.
Q1_result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+----+-------+---------+
|campaign_id|      date|hour|os_type|    event|
+-----------+----------+----+-------+---------+
|    ABCDFAE|2018-10-12|  13|android|{1, 1, 1}|
|    ABCDFAE|2018-10-12|  13|    ios|{1, 0, 0}|
+-----------+----------+----+-------+---------+

In [10]:
# Write the Q1 aggregates as JSON under the output directory, replacing
# whatever an earlier run left at the same path.
hdfs_output = '/tmp/datasets/output/'
Q1_result.write.mode("overwrite").json(hdfs_output + "Q1_result")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Q2. Analyse data for each campaign_id, date, hour, store_name & value to get all the events with counts

In [11]:
# Q2: event counts per campaign_id / date / hour / store_name.
# An ad event is matched to a store when the event's place_id is contained
# in that store's place_ids array, hence the array_contains join condition.
Q2_result = (
    ad_campaigns
    .join(store_data, array_contains(store_data.place_ids, ad_campaigns.place_id), "inner")
    .groupBy("campaign_id", "date", "hour", "store_name")
    # The pivot values are listed explicitly so the three columns always
    # exist, even when the joined data contains no event of a given type;
    # letting Spark infer them would make the select below fail on a missing
    # column and costs an extra pass to collect the distinct values.
    .pivot("event_type", ["impression", "click", "video ad"])
    .agg(count("event_type"))
    # Group/event-type combinations with no rows come back as null.
    .fillna(0)
    .select(
        "campaign_id", "date", "hour", "store_name",
        struct(col("impression"), col("click"), col("video ad")).alias("event"),
    )
)
Q2_result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+----+-------------+---------+
|campaign_id|      date|hour|   store_name|    event|
+-----------+----------+----+-------------+---------+
|    ABCDFAE|2018-10-12|  13|     McDonald|{2, 1, 0}|
|    ABCDFAE|2018-10-12|  13|shoppers stop|{0, 0, 1}|
|    ABCDFAE|2018-10-12|  13|   BurgerKing|{1, 1, 0}|
+-----------+----------+----+-------------+---------+

In [12]:
# Write the Q2 aggregates as JSON under the output directory, replacing
# whatever an earlier run left at the same path.
hdfs_output = '/tmp/datasets/output/'
Q2_result.write.mode("overwrite").json(hdfs_output + "Q2_result")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Q3. Analyse data for each campaign_id, date, hour, gender_type & value to get all the events with counts

In [13]:
# Inner-join every ad event with the profile of the user that produced it.
# NOTE(review): no later cell visible in this notebook reads `df`; the Q3 cell
# repeats this join inline. Confirm whether this cell is still needed.
df = (
    ad_campaigns
    .join(user_profile, on="user_id", how="inner")
    .drop(ad_campaigns.user_id)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Q3: event counts per campaign_id / date / hour / gender.
# Events are inner-joined to user profiles on user_id to obtain the gender.
Q3_result = (
    ad_campaigns
    .join(user_profile, "user_id", "inner")
    .groupBy("campaign_id", "date", "hour", "gender")
    # The pivot values are listed explicitly so the three columns always
    # exist, even when the joined data contains no event of a given type;
    # letting Spark infer them would make the select below fail on a missing
    # column and costs an extra pass to collect the distinct values.
    .pivot("event_type", ["impression", "click", "video ad"])
    .agg(count("event_type"))
    # Group/event-type combinations with no rows come back as null.
    .fillna(0)
    .select(
        "campaign_id", "date", "hour", "gender",
        struct(col("impression"), col("click"), col("video ad")).alias("event"),
    )
)
Q3_result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+----+------+---------+
|campaign_id|      date|hour|gender|    event|
+-----------+----------+----+------+---------+
|    ABCDFAE|2018-10-12|  13|  male|{1, 1, 1}|
|    ABCDFAE|2018-10-12|  13|female|{1, 0, 0}|
+-----------+----------+----+------+---------+

In [15]:
# Write the Q3 aggregates as JSON under the output directory, replacing
# whatever an earlier run left at the same path.
hdfs_output = '/tmp/datasets/output/'
Q3_result.write.mode("overwrite").json(hdfs_output + "Q3_result")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…