In [402]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("LoadCSV").getOrCreate()
from pyspark.sql import Window
from pyspark.sql.functions import avg
from pyspark.sql.functions import col, lit


In [403]:
df = spark.read.csv('dataset.csv',
                   sep = ',',
                   header = True,
                   quote = '',
                   inferSchema = True #Automatically defines the type of the schema
                   )

In [404]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- purchase_1: integer (nullable = true)
 |-- purchase_2: integer (nullable = true)
 |-- purchase_3: integer (nullable = true)
 |-- purchase_4: integer (nullable = true)
 |-- purchase_5: integer (nullable = true)



In [405]:
df.show(5)

+---+----------+----------+----------+----------+----------+----------+
|age|Occupation|purchase_1|purchase_2|purchase_3|purchase_4|purchase_5|
+---+----------+----------+----------+----------+----------+----------+
| 52|        19|      3041|       532|       183|     21835|     12338|
| 53|         6|      3052|      3999|       872|      8704|     10117|
| 20|         8|      3466|      8520|       138|     18532|      5271|
| 50|         2|      1062|       524|      1677|     22875|      4637|
| 55|         8|      1416|      4021|      1358|     10176|      8943|
+---+----------+----------+----------+----------+----------+----------+
only showing top 5 rows



# Question 1

Find the average of the purchases (1 – till 5)

In [406]:
df.describe('purchase_1','purchase_2','purchase_3','purchase_4','purchase_5').show()

+-------+------------------+-----------------+------------------+-----------------+------------------+
|summary|        purchase_1|       purchase_2|        purchase_3|       purchase_4|        purchase_5|
+-------+------------------+-----------------+------------------+-----------------+------------------+
|  count|             21001|            21001|             21001|            21001|             21001|
|   mean| 2505.573639350507|5027.339555259274|1248.7371553735536|12457.99985714966|7496.7633446026375|
| stddev|1441.4610097028874|2872.855810742304| 724.9649386299643|7222.963503678821| 4319.169116931873|
|    min|                 0|                0|                 0|                2|                 0|
|    max|              5000|            10000|              2500|            24999|             15000|
+-------+------------------+-----------------+------------------+-----------------+------------------+



# Question 2

How many buyers between age 20-22 AND 30-34 AND 47-50 

In [407]:
df.filter(((df.age>=20) & (df.age<=22)) | ((df.age>=30) & (df.age<=34)) | ((df.age>=47) & (df.age<=50))).count()               # Age between 30 to 35

4604

In [408]:
df.filter(((df.age>=20) & (df.age<=22)) | ((df.age>=30) & (df.age<=34)) | ((df.age>=47) & (df.age<=50))).show(5)   

+---+----------+----------+----------+----------+----------+----------+
|age|Occupation|purchase_1|purchase_2|purchase_3|purchase_4|purchase_5|
+---+----------+----------+----------+----------+----------+----------+
| 20|         8|      3466|      8520|       138|     18532|      5271|
| 50|         2|      1062|       524|      1677|     22875|      4637|
| 47|        15|      3625|      3929|        86|     14172|      2624|
| 32|        18|      2939|      1552|      1750|     11543|       825|
| 31|        15|       840|      9348|      1703|      4073|      1668|
+---+----------+----------+----------+----------+----------+----------+
only showing top 5 rows



# Question 3

What AGE has the max average purchases (1 – till 5) 

In [409]:
df_Question3_1 = df.withColumn( 'rec_avg' , 
                     df['purchase_1'] +
                     df['purchase_2'] +
                     df['purchase_3'] +
                     df['purchase_4'] +
                     df['purchase_5'] / 5)

    
df_Question3_2 = df_Question3_1.groupby('age').agg({
                       'rec_avg': 'avg'
                       })

df_Question3_3 = df_Question3_2.sort(col("avg(rec_avg)").desc()).show(1)



+---+------------------+
|age|      avg(rec_avg)|
+---+------------------+
| 65|23973.389629629655|
+---+------------------+
only showing top 1 row



# Question 4

What is the purchase averages (1 till 5) per Occupation?

In [410]:
df_Question4_1 = df.withColumn( 'rec_avg' , 
                     df['purchase_1'] +
                     df['purchase_2'] +
                     df['purchase_3'] +
                     df['purchase_4'] +
                     df['purchase_5'] / 5)

    
df_Question4_2 = df1.groupby('Occupation').agg({
                       'rec_avg': 'avg'
                       }).sort(col("Occupation").asc()).show()



+----------+------------------+
|Occupation|      avg(rec_avg)|
+----------+------------------+
|         1|22762.597268292684|
|         2|22885.342300556596|
|         3|22790.268996415743|
|         4| 22653.15859519406|
|         5|22577.990666666683|
|         6|22740.546080305885|
|         7|22377.395539033467|
|         8|23165.312331406552|
|         9|22753.421353670183|
|        10|23025.979651162816|
|        11|22897.927326150824|
|        12| 22109.87526066354|
|        13|22634.039519852246|
|        14|23195.895660749487|
|        15|22895.525883476635|
|        16| 22727.51117589891|
|        17|22584.395829383848|
|        18| 22736.86441351893|
|        19| 22395.07851314593|
|        20|22940.695381526086|
+----------+------------------+



# Question 5

What are the min & max purchases, per Occupation per age?

In [411]:
df_Question5_1 = df.withColumn( 'rec_max_min' , 
                     df['purchase_1'] +
                     df['purchase_2'] +
                     df['purchase_3'] +
                     df['purchase_4'] +
                     df['purchase_5'])

    
df_Question5_2_min = df_Question5_1.groupby('age','Occupation').agg({
                       'rec_max_min': 'sum',
                       }).sort(col("sum(rec_max_min)").asc()).show(1)



df_Question5_2_max = df_Question5_1.groupby('age','Occupation').agg({
                       'rec_max_min': 'sum',
                       }).sort(col("sum(rec_max_min)").desc()).show(1)



+---+----------+----------------+
|age|Occupation|sum(rec_max_min)|
+---+----------+----------------+
| 59|        17|          203045|
+---+----------+----------------+
only showing top 1 row

+---+----------+----------------+
|age|Occupation|sum(rec_max_min)|
+---+----------+----------------+
| 29|         3|          953283|
+---+----------+----------------+
only showing top 1 row



# Question 6

How many buyers between ages 40-47 buy @purchase3 more than 2,200 and purchase4 less than 1,000 and purchase5 between 10,000-11,500?

In [412]:
df.filter((df.age>=40) &
          (df.age>=47) &
          (df.purchase_3>2200) &
          (df.purchase_4 <1000)  &
          (df.purchase_5 >=10000)  &
          (df.purchase_5 <=11500)).count()       

6

# Question 7

What is the max purchase (1 till 5) per age?

In [413]:
df_Question7_3 = df.groupby('age').agg({
                     'purchase_1': 'max',
                     'purchase_2': 'max', 
                     'purchase_3': 'max',
                     'purchase_4': 'max',
                     'purchase_5': 'max'
                       }).sort(col("age").asc()).show(55)

+---+---------------+---------------+---------------+---------------+---------------+
|age|max(purchase_3)|max(purchase_2)|max(purchase_5)|max(purchase_1)|max(purchase_4)|
+---+---------------+---------------+---------------+---------------+---------------+
| 16|           2495|           9985|          14773|           4983|          24909|
| 17|           2493|           9968|          14953|           4990|          24999|
| 18|           2500|           9997|          14969|           4991|          24975|
| 19|           2495|           9994|          15000|           4991|          24901|
| 20|           2497|           9984|          14985|           5000|          24924|
| 21|           2494|           9984|          14994|           4997|          24952|
| 22|           2497|           9999|          14983|           4984|          24994|
| 23|           2497|           9983|          14967|           4992|          24967|
| 24|           2493|           9973|          14995| 