In [2]:
!pip install pyspark




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("MyApp") \
    .master("local[*]") \
    .getOrCreate()  # Use all available CPU cores

# Verify Spark session
spark

### Qn 1
- Load the JSON file into a PySpark DataFrame and display the first 5 rows.

In [4]:
df = spark.read.json('../PySpark/sample_data/users_data.json', multiLine=True)

In [5]:
df.show()

+---+-------------+--------------------+---+--------+
|age|         city|             friends| id|    name|
+---+-------------+--------------------+---+--------+
| 78|       Austin|[{[Watching Sport...|  0|  Elijah|
| 97|       Boston|[{[Watching Sport...|  1|    Noah|
| 48|    San Diego|[{[Reading, Volun...|  2|     Evy|
| 39|    St. Louis|[{[Watching Sport...|  3|  Oliver|
| 95|    St. Louis|[{[Movie Watching...|  4| Michael|
| 19|     Portland|[{[Painting, Tele...|  5| Michael|
| 76|       Austin|[{[Genealogy, Coo...|  6|   Lucas|
| 25|  San Antonio|[{[Music, Golf], ...|  7|Michelle|
| 61|       Austin|[{[Bicycling, Ski...|  8|   Emily|
| 33|  New Orleans|[{[Traveling, Bic...|  9|    Liam|
| 59|  New Orleans|[{[Video Games, F...| 10|    Levi|
| 82|     Portland|[{[Jewelry Making...| 11|   Lucas|
| 82|   Charleston|[{[Eating Out], O...| 12|   Kevin|
| 34|  San Antonio|[{[Yoga, Travelin...| 13|  Olivia|
| 49|  Los Angeles|[{[Yoga, Televisi...| 14|  Robert|
| 98|      Chicago|[{[Travel

In [6]:
df.dropDuplicates(['city']).select('city').show()

+---------------+
|           city|
+---------------+
|     Charleston|
|       Savannah|
|    San Antonio|
|    Los Angeles|
|Saint Augustine|
|      San Diego|
|      Nashville|
|  San Francisco|
|       Portland|
|      St. Louis|
|         Austin|
|     Washington|
|        Chicago|
|    Miami Beach|
|        Branson|
|         Sedona|
|    New Orleans|
|      Las Vegas|
|       Honolulu|
|        Seattle|
+---------------+
only showing top 20 rows



### Qn 3
- Count the total number of users present in the dataset.

In [7]:
df_users = df.select('name').distinct()
df_users.count()

35

### Qn 4
- Filter out users who are older than 80 years and display their names and ages.

In [8]:
df_super_senior = df.filter(df.age >= 80)['name', 'age']
df_super_senior.show()

+-------+---+
|   name|age|
+-------+---+
|   Noah| 97|
|Michael| 95|
|  Lucas| 82|
|  Kevin| 82|
|  Grace| 98|
|  Kevin| 93|
|  Kevin| 97|
|  Emily| 82|
|   Luke| 84|
| Robert| 89|
| Elijah| 96|
| Sophie| 84|
|  Kevin| 88|
|  Chloe| 97|
|   Nora| 83|
| Amelia| 84|
|    Leo| 97|
|  Mateo| 95|
| Olivia| 89|
| Amelia| 96|
+-------+---+
only showing top 20 rows



### Qn 5
- Extract the list of unique cities where the users live.

In [9]:
df_city = df.dropDuplicates(['city'])
df_city.select('city').show()

+---------------+
|           city|
+---------------+
|     Charleston|
|       Savannah|
|    San Antonio|
|    Los Angeles|
|Saint Augustine|
|      San Diego|
|      Nashville|
|  San Francisco|
|       Portland|
|      St. Louis|
|         Austin|
|     Washington|
|        Chicago|
|    Miami Beach|
|        Branson|
|         Sedona|
|    New Orleans|
|      Las Vegas|
|       Honolulu|
|        Seattle|
+---------------+
only showing top 20 rows



### Qn 6
- Display each user's name alongside the names of their friends by flattening the friends column.

In [10]:
from pyspark.sql.functions import explode, col

df_friend = df.select('name', explode('friends').alias('Friends'))  # Explode friends column
df_friend = df_friend.select(col('name'), col('Friends.name').alias('Friend'))  # Extract only friend's name
df_friend.show()

+-------+--------+
|   name|  Friend|
+-------+--------+
| Elijah|Michelle|
| Elijah|  Robert|
|   Noah|  Oliver|
|   Noah|  Olivia|
|   Noah|  Robert|
|   Noah|     Ava|
|   Noah| Michael|
|   Noah| Michael|
|    Evy|     Joe|
|    Evy|     Joe|
|    Evy|  Oliver|
|    Evy|    Liam|
|    Evy|  Amelia|
| Oliver|   Mateo|
| Oliver|    Nora|
| Oliver|     Ava|
| Oliver|  Amelia|
| Oliver|     Leo|
|Michael|   Mateo|
|Michael|   Chris|
+-------+--------+
only showing top 20 rows



### Qn 7
- Identify the user who has the most friends in the dataset.

In [11]:
from pyspark.sql.functions import count

df_most_frnd = df_friend.groupBy('name').agg(count('name').alias('count'))
df_most_frnd.orderBy('count', ascending = False).show(1)

+-------+-----+
|   name|count|
+-------+-----+
|Michael| 1294|
+-------+-----+
only showing top 1 row



### Qn 8
- Count the total number of unique hobbies listed across all users.

In [12]:
from pyspark.sql.functions import col, explode
df_friendlist = df.select(col('name'), explode('friends').alias('Friend'))
df_hobbies = df_friendlist.select(explode(col('Friend.hobbies')).alias('hobby')) # getting hobby out of 
df_dis_hob = df_hobbies.select('hobby').distinct() # getting distinct hobby
df_dis_hob.count()

37

### Qn 9
- Find the most common hobby among all the users' friends.

In [13]:
df_most_cmn_hob = df_hobbies.groupBy('hobby').agg(count('hobby').alias('count'))
df_most_cmn_hob.orderBy('count', ascending = False).show(1)

+-----------+-----+
|      hobby|count|
+-----------+-----+
|Video Games| 2766|
+-----------+-----+
only showing top 1 row



### Qn 10
- Add a new column age_category to classify users as Senior (age >= 60) or Adult (age < 60).

In [14]:
from pyspark.sql.functions import when, col
df_new = df.withColumn('age_category', when(col('age')> 60, 'Senior').otherwise('adult'))
df_new.show()

+---+-------------+--------------------+---+--------+------------+
|age|         city|             friends| id|    name|age_category|
+---+-------------+--------------------+---+--------+------------+
| 78|       Austin|[{[Watching Sport...|  0|  Elijah|      Senior|
| 97|       Boston|[{[Watching Sport...|  1|    Noah|      Senior|
| 48|    San Diego|[{[Reading, Volun...|  2|     Evy|       adult|
| 39|    St. Louis|[{[Watching Sport...|  3|  Oliver|       adult|
| 95|    St. Louis|[{[Movie Watching...|  4| Michael|      Senior|
| 19|     Portland|[{[Painting, Tele...|  5| Michael|       adult|
| 76|       Austin|[{[Genealogy, Coo...|  6|   Lucas|      Senior|
| 25|  San Antonio|[{[Music, Golf], ...|  7|Michelle|       adult|
| 61|       Austin|[{[Bicycling, Ski...|  8|   Emily|      Senior|
| 33|  New Orleans|[{[Traveling, Bic...|  9|    Liam|       adult|
| 59|  New Orleans|[{[Video Games, F...| 10|    Levi|       adult|
| 82|     Portland|[{[Jewelry Making...| 11|   Lucas|      Sen

### Qn 11
- Write the modified DataFrame (with the age_category column) back to a new JSON file.

In [15]:
df_new.write.mode('overwrite').json('../PySpark/sample_data/new_users_data.json')