In [9]:
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet, so this
# cell does not depend on a `spark` variable left over from an earlier cell.
spark = SparkSession.builder.getOrCreate()

# Read the CSV with its first line as the header and let Spark infer the
# column types from the values.
file_path = "/content/CardioGoodFitness-1 (1).csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert DataFrame to RDD
rdd = data.rdd

# Display first 5 rows of the RDD
rdd.take(5)


[Row(Product='TM195', Age=18, Gender='Male', Education=14, MaritalStatus='Single', Usage=3, Fitness=4, Income=29562, Miles=112),
 Row(Product='TM195', Age=19, Gender='Male', Education=15, MaritalStatus='Single', Usage=2, Fitness=3, Income=31836, Miles=75),
 Row(Product='TM195', Age=19, Gender='Female', Education=14, MaritalStatus='Partnered', Usage=4, Fitness=3, Income=30699, Miles=66),
 Row(Product='TM195', Age=19, Gender='Male', Education=12, MaritalStatus='Single', Usage=3, Fitness=3, Income=32973, Miles=85),
 Row(Product='TM195', Age=20, Gender='Male', Education=13, MaritalStatus='Partnered', Usage=4, Fitness=2, Income=35247, Miles=47)]

In [10]:
def create_profile(row, age_cutoff=30, income_cutoff=50000):
    """Label a customer record with an age/income profile.

    Args:
        row: Record exposing ``Product``, ``Age``, ``Income`` and ``Miles``
            attributes (e.g. a Spark ``Row``).
        age_cutoff: Ages strictly below this value are labelled "Young",
            all others "Adult". Defaults to 30.
        income_cutoff: Incomes strictly below this value are labelled
            "Low-Income", all others "High-Income". Defaults to 50000.

    Returns:
        A tuple ``(Product, Age, Income, Miles, profile)`` where ``profile``
        is one of "Young Low-Income", "Young High-Income",
        "Adult Low-Income" or "Adult High-Income".
    """
    # The two bands are independent, so decide each one once instead of
    # repeating both comparisons in every branch.
    age_band = "Young" if row.Age < age_cutoff else "Adult"
    income_band = "Low-Income" if row.Income < income_cutoff else "High-Income"
    profile = f"{age_band} {income_band}"
    return (row.Product, row.Age, row.Income, row.Miles, profile)

# Create profiles using map(). The result is cached because several later
# actions read profile_rdd; without caching, each action would re-read the
# CSV and re-run create_profile over every record.
profile_rdd = rdd.map(create_profile).cache()

# Show sample profiles
profile_rdd.take(5)


[('TM195', 18, 29562, 112, 'Young Low-Income'),
 ('TM195', 19, 31836, 75, 'Young Low-Income'),
 ('TM195', 19, 30699, 66, 'Young Low-Income'),
 ('TM195', 19, 32973, 85, 'Young Low-Income'),
 ('TM195', 20, 35247, 47, 'Young Low-Income')]

In [11]:
# Average miles per customer profile.
# Tuple layout coming from profile_rdd: index 3 is Miles, index 4 is the profile label.
grouped_rdd = (
    profile_rdd
    # Key each record by profile label, pairing its miles with a count of 1.
    .map(lambda rec: (rec[4], (rec[3], 1)))
    # Sum miles and counts within each profile.
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    # Divide total miles by record count to get the mean.
    .mapValues(lambda totals: totals[0] / totals[1])
)

# Show the (profile, average miles) pairs
grouped_rdd.collect()


[('Young Low-Income', 89.93055555555556),
 ('Young High-Income', 122.48780487804878),
 ('Adult Low-Income', 90.0),
 ('Adult High-Income', 108.71428571428571)]

In [12]:
# Keep only customers whose income (index 2) exceeds 60000 and whose
# miles (index 3) exceed 200.
high_value_rdd = profile_rdd.filter(
    lambda rec: rec[2] > 60000 and rec[3] > 200
)

# Show the matching high-value customer records
high_value_rdd.collect()


[('TM798', 25, 75946, 240, 'Young High-Income'),
 ('TM798', 29, 85906, 300, 'Young High-Income'),
 ('TM798', 30, 90886, 280, 'Adult High-Income'),
 ('TM798', 31, 89641, 260, 'Adult High-Income'),
 ('TM798', 35, 92131, 360, 'Adult High-Income')]