In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import MultiLabelBinarizer



DB_USER = 'postgres'
DB_PASS = 'krisdan29'   # change
DB_HOST = 'localhost'
DB_PORT = '5432'
DB_NAME = 'pnta'

# Step 2: Create SQLAlchemy engine
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')


In [3]:
# 1. user_event_like (event likes)
df_user_event_like = pd.read_sql("SELECT user_id, event_id FROM pnta.user_event_like", engine)

# 2. user_event_rating
df_user_event_rating = pd.read_sql("SELECT user_id, event_id, rating FROM pnta.user_event_rating", engine)

# 3. user_event_checkin
df_user_event_checkin = pd.read_sql("SELECT user_id, event_id, checked_in_at FROM pnta.user_event_checkin", engine)

# 4. event
df_event = pd.read_sql("SELECT event_id, event_date, start_time, end_time, venue_id FROM pnta.event", engine)

# 5. event_tag
df_event_tag = pd.read_sql("SELECT * FROM pnta.event_tag", engine)

# 6. location
df_location = pd.read_sql("SELECT venue_id, city FROM pnta.location", engine)

# 7. tag
df_tag = pd.read_sql("SELECT * FROM pnta.tag", engine)

# 8. user_filter_tag
df_user_filter_tag = pd.read_sql("SELECT * FROM pnta.user_filter_tag", engine)

# 9. user_venue_like
df_user_venue_like = pd.read_sql("SELECT user_id, venue_id FROM pnta.user_venue_like", engine)

# 10. user_venue_rating
df_user_venue_rating = pd.read_sql("SELECT user_id, venue_id, rating FROM pnta.user_venue_rating", engine)

# 11. userp
df_userp = pd.read_sql("SELECT user_id, gender, city, birth_date FROM pnta.userp", engine)

# 12. venue
df_venue = pd.read_sql("SELECT venue_id, price_rating, rating FROM pnta.venue", engine)


In [5]:
# 1. Aggregate check-ins as our target
df_event_attendance = df_user_event_checkin.groupby('event_id').size().reset_index(name='checkin_count')

# 2. Merge basic event data
df = df_event.merge(df_event_attendance, on='event_id', how='left')
df = df.merge(df_venue, on='venue_id', how='left')
df = df.merge(df_location, on='venue_id', how='left')

# 3. Process event tags: explode tags per event
df_event_tags_full = df_event_tag.merge(df_tag, on='tag_id', how='left')

# OPTIONAL: Confirm available tag name column
print(df_event_tags_full.columns)  # Should include something like 'name' or 'tag_name'

# Use the correct tag column here (e.g., 'name' instead of 'tag')
df_tag_counts = df_event_tags_full.groupby(['event_id', 'name']).size().unstack(fill_value=0)

# 4. Combine with main df
df = df.merge(df_tag_counts, on='event_id', how='left')
df = df.fillna({'checkin_count': 0})
df = df.fillna(0)  # for tag columns

# 5. Convert checkin count to binary success target
df['successful'] = (df['checkin_count'] >= 50).astype(int)

# 6. Preview your dataset
print(df.head(30))


Index(['event_id', 'tag_id', 'name', 'category'], dtype='object')
    event_id  event_date start_time  end_time  venue_id  checkin_count  \
0          1  2025-03-04   19:00:00  21:00:00        58           21.0   
1          2  2025-03-14   21:00:00  22:00:00        72           34.0   
2          3  2025-03-05   17:00:00  18:00:00       217           25.0   
3          4  2025-05-04   21:00:00  22:00:00       120           28.0   
4          5  2025-05-23   21:00:00  01:00:00       102           23.0   
5          6  2025-05-15   19:00:00  20:00:00       230           31.0   
6          7  2025-05-29   20:00:00  23:00:00        82           25.0   
7          8  2025-03-28   19:00:00  20:00:00        80           27.0   
8          9  2025-03-13   19:00:00  22:00:00       195           32.0   
9         10  2025-03-06   20:00:00  21:00:00       136           37.0   
10        11  2025-05-10   19:00:00  22:00:00        41           35.0   
11        12  2025-05-30   17:00:00  18:00:00 

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. Aggregate check-ins
df_event_attendance = df_user_event_checkin.groupby('event_id').size().reset_index(name='checkin_count')

# 2. Merge with event, venue, and location
df = df_event.merge(df_event_attendance, on='event_id', how='left')
df = df.merge(df_venue, on='venue_id', how='left')
df = df.merge(df_location, on='venue_id', how='left')

# 3. Process event tags
df_event_tags_full = df_event_tag.merge(df_tag, on='tag_id', how='left')
print(df_event_tags_full.columns)  # Confirm tag name column (usually 'name')

# Replace 'name' with actual tag column if different
df_tag_counts = df_event_tags_full.groupby(['event_id', 'name']).size().unstack(fill_value=0)

# 4. Merge tag data
df = df.merge(df_tag_counts, on='event_id', how='left')
df = df.fillna({'checkin_count': 0})
df = df.fillna(0)

# 5. Create binary success label
df['successful'] = (df['checkin_count'] >= 50).astype(int)

# 6. Optional: Encode categorical fields
if 'city' in df.columns:
    df['city'] = df['city'].astype('category').cat.codes
if 'price_rating' in df.columns:
    df['price_rating'] = df['price_rating'].astype('category').cat.codes

# 7. Prepare features and labels
excluded = ['event_id', 'venue_id', 'event_date', 'start_time', 'end_time', 'checkin_count', 'successful']
X = df.drop(columns=excluded)
y = df['successful']

# 8. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# 9. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 10. Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Index(['event_id', 'tag_id', 'name', 'category'], dtype='object')
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3126

    accuracy                           1.00      3126
   macro avg       1.00      1.00      1.00      3126
weighted avg       1.00      1.00      1.00      3126

