Event Friends Recommendation


In [0]:
from pyspark.sql import Row

# Sample friendship graph: one directed row per (user_a, user_b) pair with
# the current relationship status between the two users.
friendship_status_data = [
    Row(user_a_id=user_a, user_b_id=user_b, status=status)
    for user_a, user_b, status in (
        (111, 333, 'not_friends'),
        (222, 333, 'not_friends'),
        (333, 222, 'not_friends'),
        (222, 111, 'friends'),
        (111, 222, 'friends'),
        (333, 111, 'not_friends'),
    )
]
friendship_status = spark.createDataFrame(friendship_status_data)

# Sample RSVPs: one row per (user, event) with the event's visibility, the
# user's attendance answer, and the event date kept as an MM/DD/YYYY string.
event_rsvp_data = [
    Row(
        user_id=user,
        event_id=event,
        event_type=visibility,
        attendance_status=answer,
        event_date=date_str,
    )
    for user, event, visibility, answer, date_str in (
        (111, 567, 'public', 'going', '07/12/2022'),
        (222, 789, 'private', 'going', '07/15/2022'),
        (333, 789, 'private', 'maybe', '07/15/2022'),
        (111, 234, 'private', 'not_going', '07/18/2022'),
        (222, 234, 'private', 'going', '07/18/2022'),
        (333, 234, 'private', 'going', '07/18/2022'),
    )
]
event_rsvp = spark.createDataFrame(event_rsvp_data)

# Expose both DataFrames to Spark SQL under the same names.
for view_name, frame in (
    ("friendship_status", friendship_status),
    ("event_rsvp", event_rsvp),
):
    frame.createOrReplaceTempView(view_name)


In [0]:

# SQL query for recommendation.
#
# Recommend user pairs who are not yet friends but have both RSVP'd
# 'going' or 'maybe' to at least two of the same private events.
#
# Steps:
#   1. private_events: RSVPs to private events with a positive/tentative answer.
#   2. Self-join private_events on event_id to pair up distinct attendees.
#   3. Join the pairs to friendship_status and keep only 'not_friends' rows.
#   4. Keep pairs that share two or more events.
#
# The HAVING clause counts DISTINCT event ids rather than joined rows, so a
# duplicated RSVP or friendship row cannot inflate the shared-event count.
result = spark.sql("""
WITH private_events AS (
  SELECT user_id, event_id
  FROM event_rsvp
  WHERE attendance_status IN ('going', 'maybe')
    AND event_type = 'private'
)
SELECT 
  friends.user_a_id, 
  friends.user_b_id
FROM private_events AS events_1
INNER JOIN private_events AS events_2
  ON events_1.user_id != events_2.user_id
  AND events_1.event_id = events_2.event_id
INNER JOIN friendship_status AS friends
  ON events_1.user_id = friends.user_a_id
  AND events_2.user_id = friends.user_b_id
WHERE friends.status = 'not_friends'
GROUP BY friends.user_a_id, friends.user_b_id
HAVING COUNT(DISTINCT events_1.event_id) >= 2
ORDER BY friends.user_a_id, friends.user_b_id
""")

display(result)

In [0]:
from pyspark.sql.functions import col, countDistinct

# DataFrame-API version of the recommendation: user pairs who are not
# friends but share at least two private events they are 'going' or
# 'maybe' attending.

# Filter private events with attendance_status 'going' or 'maybe'
private_events = event_rsvp.filter(
    (col("attendance_status").isin("going", "maybe")) &
    (col("event_type") == "private")
).select("user_id", "event_id")

# Self-join to find user pairs attending same private event.
# event_id is carried through so shared events can be counted distinctly.
events_1 = private_events.alias("events_1")
events_2 = private_events.alias("events_2")
user_pairs = events_1.join(
    events_2,
    (col("events_1.event_id") == col("events_2.event_id")) &
    (col("events_1.user_id") != col("events_2.user_id"))
).select(
    col("events_1.user_id").alias("user_a_id"),
    col("events_2.user_id").alias("user_b_id"),
    col("events_1.event_id").alias("event_id"),
)

# Join with friendship_status to get not_friends pairs.
# Joining on a list of column names keeps a single copy of user_a_id and
# user_b_id in the output; joining on an expression would keep both sides'
# copies and make the later groupBy("user_a_id", "user_b_id") ambiguous.
not_friends = user_pairs.join(
    friendship_status.filter(col("status") == "not_friends"),
    on=["user_a_id", "user_b_id"],
    how="inner",
)

# Group by user_a_id, user_b_id and keep pairs with at least 2 distinct
# shared events (duplicate RSVP/friendship rows do not inflate the count).
result = (
    not_friends.groupBy("user_a_id", "user_b_id")
    .agg(countDistinct("event_id").alias("count"))
    .filter(col("count") >= 2)
    .orderBy("user_a_id", "user_b_id")
)

display(result)