In [None]:
import os
from urllib.parse import quote_plus

import ace_tools as tools
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sqlalchemy import create_engine


# Database connection settings.
# Each value is read from an environment variable so that credentials never
# have to be stored in (or committed with) the notebook. The non-secret
# settings fall back to the local-development defaults.
DB_USER = os.environ.get('PNTA_DB_USER', 'postgres')
DB_PASS = os.environ.get('PNTA_DB_PASS')
DB_HOST = os.environ.get('PNTA_DB_HOST', 'localhost')
DB_PORT = os.environ.get('PNTA_DB_PORT', '5432')
DB_NAME = os.environ.get('PNTA_DB_NAME', 'pnta')

# Fail fast with an actionable message instead of a confusing authentication
# error later, when the first query is executed.
if not DB_PASS:
    raise RuntimeError(
        'Database password is not configured: set the PNTA_DB_PASS '
        'environment variable before running this notebook.'
    )

# Create the SQLAlchemy engine used by every query below.
# User and password are URL-escaped so that characters such as '@', ':' or '/'
# inside the credentials cannot corrupt the connection URL.
engine = create_engine(
    f'postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)


In [None]:
def _read_table(query):
    """Run `query` through the notebook's `engine` and return the rows as a DataFrame."""
    return pd.read_sql(sql=query, con=engine)


# --- User <-> event interactions ---------------------------------------
# Likes
df_user_event_like = _read_table("SELECT user_id, event_id FROM pnta.user_event_like")
# Ratings
df_user_event_rating = _read_table("SELECT user_id, event_id, rating FROM pnta.user_event_rating")
# Check-ins
df_user_event_checkin = _read_table("SELECT user_id, event_id, checked_in_at FROM pnta.user_event_checkin")

# --- Events, their tags and locations ----------------------------------
df_event = _read_table("SELECT event_id, event_date, start_time, end_time, venue_id FROM pnta.event")
df_event_tag = _read_table("SELECT * FROM pnta.event_tag")
df_location = _read_table("SELECT venue_id, city FROM pnta.location")
df_tag = _read_table("SELECT * FROM pnta.tag")

# --- User preferences and user <-> venue interactions ------------------
df_user_filter_tag = _read_table("SELECT * FROM pnta.user_filter_tag")
df_user_venue_like = _read_table("SELECT user_id, venue_id FROM pnta.user_venue_like")
df_user_venue_rating = _read_table("SELECT user_id, venue_id, rating FROM pnta.user_venue_rating")

# --- User profiles and venues ------------------------------------------
df_userp = _read_table("SELECT user_id, gender, city, birth_date FROM pnta.userp")
df_venue = _read_table("SELECT venue_id, price_rating, rating FROM pnta.venue")


In [None]:
# Build the per-event modelling table: one row per event with its check-in
# count, venue attributes, city, per-tag counts and a binary success label.

# Minimum number of check-ins for an event to be labelled successful.
SUCCESS_CHECKIN_THRESHOLD = 50

# 1. Aggregate check-ins per event; this count is the basis of the target.
df_event_attendance = (
    df_user_event_checkin
    .groupby('event_id')
    .size()
    .reset_index(name='checkin_count')
)

# 2. Attach attendance, venue attributes and location to every event.
#    Left joins keep events that have no check-ins / venue / location row.
df = (
    df_event
    .merge(df_event_attendance, on='event_id', how='left')
    .merge(df_venue, on='venue_id', how='left')
    .merge(df_location, on='venue_id', how='left')
)

# 3. Count tags per event: one row per event_id, one column per `tag` value.
df_event_tags_full = df_event_tag.merge(df_tag, on='tag_id', how='left')
df_tag_counts = df_event_tags_full.groupby(['event_id', 'tag']).size().unstack(fill_value=0)
tag_columns = list(df_tag_counts.columns)

# 4. Combine the tag counts with the main frame.
df = df.merge(df_tag_counts, on='event_id', how='left')

# Zero-fill ONLY the count columns (no check-ins / no tags means a count of 0).
# A blanket `df.fillna(0)` would also overwrite missing city, rating,
# price_rating and date/time values with 0, which is not a meaningful value
# for them; those NaNs are left in place for explicit handling downstream.
# The left joins turn the counts into floats, so cast them back to integers.
count_columns = ['checkin_count', *tag_columns]
df = (
    df
    .fillna({column: 0 for column in count_columns})
    .astype({column: 'int64' for column in count_columns})
)

# 5. Convert the target to binary.
df['successful'] = (df['checkin_count'] >= SUCCESS_CHECKIN_THRESHOLD).astype(int)


print(df.head(30))

