# Event Like Prediction Model
This notebook connects to your local PostgreSQL `pnta` database and trains a machine learning model to predict whether an event will get more than a threshold number of likes.

In [59]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Step 1: Define your connection parameters
# Each setting can be overridden with an environment variable; the defaults
# are the local development values this notebook has always used.
DB_USER = os.environ.get('PNTA_DB_USER', 'postgres')
# The password is deliberately NOT stored in the notebook. It is read from
# PNTA_DB_PASS, and if that is unset the user is prompted for it interactively.
DB_PASS = os.environ.get('PNTA_DB_PASS') or getpass('PostgreSQL password: ')
DB_HOST = os.environ.get('PNTA_DB_HOST', 'localhost')
DB_PORT = os.environ.get('PNTA_DB_PORT', '5432')
DB_NAME = os.environ.get('PNTA_DB_NAME', 'pnta')

# Step 2: Create SQLAlchemy engine
# quote_plus escapes characters such as '@', ':' or '/' in the credentials so
# they cannot be mistaken for URL delimiters when the connection URL is parsed.
engine = create_engine(
    f'postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)


In [None]:
# Step 3: Build the per-event feature/label query.
#
# CTEs:
#   event_interactions - like/check-in counts and mean rating per event
#   labeled_events     - adds success_flag (1 when like_count >= 10,
#                        checkin_count >= 5 and avg_event_rating >= 4.0)
#   event_tags         - comma-separated tag names per event
#   user_aggregates    - demographics of the users who checked in
#
# NOTE: event_interactions LEFT JOINs likes, check-ins and ratings in one pass,
# so the intermediate row count per event is the product of the three. The
# results stay correct (COUNT(DISTINCT ...) de-duplicates, and AVG is unchanged
# because every rating row is repeated the same number of times), but the cost
# grows multiplicatively with the table sizes.
query = """
WITH event_interactions AS (
    SELECT 
        e.event_id,
        COUNT(DISTINCT uel.user_id) AS like_count,
        COUNT(DISTINCT uec.user_id) AS checkin_count,
        AVG(uer.rating) AS avg_event_rating
    FROM pnta.event e
    LEFT JOIN pnta.user_event_like uel ON e.event_id = uel.event_id
    LEFT JOIN pnta.user_event_checkin uec ON e.event_id = uec.event_id
    LEFT JOIN pnta.user_event_rating uer ON e.event_id = uer.event_id
    GROUP BY e.event_id
),
labeled_events AS (
    SELECT 
        ei.*,
        CASE 
            WHEN like_count >= 10 AND checkin_count >= 5 AND avg_event_rating >= 4.0 THEN 1
            ELSE 0
        END AS success_flag
    FROM event_interactions ei
),
event_tags AS (
    SELECT 
        et.event_id,
        STRING_AGG(t.name, ',' ORDER BY t.name) AS tags
    FROM pnta.event_tag et
    JOIN pnta.tag t ON et.tag_id = t.tag_id
    GROUP BY et.event_id
),
user_aggregates AS (
    SELECT 
        e.event_id,
        AVG(EXTRACT(YEAR FROM AGE(CURRENT_DATE, u.birth_date))) AS avg_age,
        SUM(CASE WHEN u.gender = 'female' THEN 1 ELSE 0 END)::float / COUNT(*) AS percent_female,
        SUM(CASE WHEN u.city = l.city THEN 1 ELSE 0 END)::float / COUNT(*) AS percent_local_users
    FROM pnta.event e
    JOIN pnta.user_event_checkin uec ON e.event_id = uec.event_id
    JOIN pnta.userp u ON u.user_id = uec.user_id
    LEFT JOIN pnta.venue v ON e.venue_id = v.venue_id
    LEFT JOIN pnta.location l ON v.venue_id = l.venue_id
    GROUP BY e.event_id
)

SELECT 
    e.event_id,
    e.event_date,
    e.start_time,
    e.end_time,
    le.like_count,
    le.checkin_count,
    le.avg_event_rating,
    le.success_flag,
    et.tags,
    v.price_rating,
    v.rating AS venue_rating,
    l.city AS venue_city,
    ua.avg_age,
    ua.percent_female,
    ua.percent_local_users
FROM pnta.event e
JOIN labeled_events le ON e.event_id = le.event_id
LEFT JOIN event_tags et ON e.event_id = et.event_id
LEFT JOIN pnta.venue v ON e.venue_id = v.venue_id
LEFT JOIN pnta.location l ON v.venue_id = l.venue_id
LEFT JOIN user_aggregates ua ON e.event_id = ua.event_id;
"""
# Changes to the SQL above relative to the earlier version of this cell:
#   * venue/location in the final SELECT are schema-qualified (pnta.) like every
#     other table, so the query no longer depends on the session search_path.
#   * STRING_AGG orders tag names, so the `tags` string is stable between runs.

# Step 4: Run the query
df = pd.read_sql_query(query, engine)

# Show every column on one line when the frame is printed
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# Step 5: Inspect the result
print(df.head(10))


# Or as Parquet
# df.to_parquet('event_success_dataset.parquet', index=False)

   event_id  event_date start_time  end_time  like_count  checkin_count avg_event_rating  success_flag
0       391  2025-04-23   21:00:00  22:00:00           0              0             None             0
1       392  2025-04-10   19:00:00  01:00:00           0              0             None             0
2       393  2025-05-06   17:00:00  22:00:00           0              0             None             0
3       394  2025-04-15   20:00:00  02:00:00           0              0             None             0
4       395  2025-04-25   21:00:00  01:00:00           0              0             None             0
5       396  2025-04-22   17:00:00  00:00:00           0              0             None             0
6       397  2025-04-13   21:00:00  23:00:00           0              0             None             0
7       398  2025-04-27   20:00:00  01:00:00           0              0             None             0
8       399  2025-05-02   21:00:00  23:00:00           0              0  

In [66]:
# Sanity check: count the distinct users that appear in pnta.user_event_like.
query = """
SELECT COUNT(DISTINCT user_id) FROM pnta.user_event_like;


"""

# Display settings first: print all columns and keep each row on one line.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

# Execute the statement; the result is a one-row frame holding the count.
df = pd.read_sql_query(sql=query, con=engine)

# Show (at most) the first ten rows of the result.
print(df.iloc[:10])


# Or as Parquet
# df.to_parquet('event_success_dataset.parquet', index=False)

   count
0    300


In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Database connection
# Settings come from environment variables, falling back to the local
# development defaults. The password is never stored in the notebook: it is
# read from PNTA_DB_PASS, or prompted for when that variable is unset.
DB_USER = os.environ.get('PNTA_DB_USER', 'postgres')
DB_PASS = os.environ.get('PNTA_DB_PASS') or getpass('PostgreSQL password: ')
DB_HOST = os.environ.get('PNTA_DB_HOST', 'localhost')
DB_PORT = os.environ.get('PNTA_DB_PORT', '5432')
DB_NAME = os.environ.get('PNTA_DB_NAME', 'pnta')

# quote_plus keeps special characters in the credentials from being parsed as
# URL delimiters.
engine = create_engine(
    f'postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)

# Load each required table into its own DataFrame

# 1. user_event_like (event likes)
df_user_event_like = pd.read_sql("SELECT user_id, event_id FROM pnta.user_event_like", engine)

# 2. user_event_rating
df_user_event_rating = pd.read_sql("SELECT user_id, event_id, rating FROM pnta.user_event_rating", engine)

# 3. user_event_checkin
df_user_event_checkin = pd.read_sql("SELECT user_id, event_id, checked_in_at FROM pnta.user_event_checkin", engine)

# 4. event
df_event = pd.read_sql("SELECT event_id, event_date, start_time, end_time, venue_id FROM pnta.event", engine)

# 5. event_tag
df_event_tag = pd.read_sql("SELECT * FROM pnta.event_tag", engine)

# 6. location
df_location = pd.read_sql("SELECT venue_id, city FROM pnta.location", engine)

# 7. tag
df_tag = pd.read_sql("SELECT * FROM pnta.tag", engine)

# 8. user_filter_tag
df_user_filter_tag = pd.read_sql("SELECT * FROM pnta.user_filter_tag", engine)

# 9. user_venue_like
df_user_venue_like = pd.read_sql("SELECT user_id, venue_id FROM pnta.user_venue_like", engine)

# 10. user_venue_rating
df_user_venue_rating = pd.read_sql("SELECT user_id, venue_id, rating FROM pnta.user_venue_rating", engine)

# 11. userp
df_userp = pd.read_sql("SELECT user_id, gender, city, birth_date FROM pnta.userp", engine)

# 12. venue
df_venue = pd.read_sql("SELECT venue_id, price_rating, rating FROM pnta.venue", engine)


# Show first 5 rows of each DataFrame
# The mapping keeps each printed header in sync with the variable it previews;
# insertion order matches the load order above.
frames_to_preview = {
    "df_user_event_like": df_user_event_like,
    "df_user_event_rating": df_user_event_rating,
    "df_user_event_checkin": df_user_event_checkin,
    "df_event": df_event,
    "df_event_tag": df_event_tag,
    "df_location": df_location,
    "df_tag": df_tag,
    "df_user_filter_tag": df_user_filter_tag,
    "df_user_venue_like": df_user_venue_like,
    "df_user_venue_rating": df_user_venue_rating,
    "df_userp": df_userp,
    "df_venue": df_venue,
}

for frame_name, frame in frames_to_preview.items():
    print(f"=== {frame_name} ===")
    print(frame.head(), "\n")



=== df_likes ===
   user_id  event_id
0        1        24
1        1        18
2        1        12
3        1        13
4        2         9 

=== df_ratings ===
   user_id  event_id  rating
0      262        11       3
1      123        10       5
2       18        22       5
3        1        13       5
4      299        30       5 

=== df_checkins ===
   user_id  event_id              checked_in_at
0       18        22 2025-04-08 17:58:01.862748
1      206        21 2025-04-08 17:58:01.862748
2       82        11 2025-04-08 17:58:01.862748
3      234        15 2025-04-08 17:58:01.862748
4      291        14 2025-04-08 17:58:01.862748 

=== df_event ===
   event_id  event_date start_time  end_time  venue_id
0       391  2025-04-23   21:00:00  22:00:00        13
1       392  2025-04-10   19:00:00  01:00:00        10
2       393  2025-05-06   17:00:00  22:00:00        22
3       394  2025-04-15   20:00:00  02:00:00        27
4       395  2025-04-25   21:00:00  01:00:00        13 

=