In [36]:
from sqlalchemy import create_engine
import pandas as pd

# Set up your PostgreSQL connection
username = 'postgres'
password = 'krisdan29'
host = 'localhost'  # change if needed
port = '5432'
dbname = 'pnta'

# Connect to DB
engine = create_engine(f'postgresql+psycopg2://{username}:{password}@{host}:{port}/{dbname}')




In [37]:
# Load relevant tables from the database
event = pd.read_sql("SELECT * FROM pnta.event", con=engine)
venue = pd.read_sql("SELECT * FROM pnta.venue", con=engine)
location = pd.read_sql("SELECT * FROM pnta.location", con=engine)  # 👈 ADD THIS
checkins = pd.read_sql("SELECT * FROM pnta.user_event_checkin", con=engine)
likes = pd.read_sql("SELECT * FROM pnta.user_event_like", con=engine)
ratings = pd.read_sql("SELECT * FROM pnta.user_event_rating", con=engine)
event_tags = pd.read_sql("SELECT * FROM pnta.event_tag", con=engine)
tags = pd.read_sql("SELECT * FROM pnta.tag", con=engine)

# Load venue popularity summary CSV
venue_success = pd.read_csv("../venuePopularity/venue_success_summary.csv")


In [38]:
import pandas as pd

# Modified version of the final script with season removed and event_month one-hot encoded instead


# Merge event with venue
event_full = pd.merge(event, venue, on="venue_id", how="left")

# Merge with venue success summary (adds venue_popularity_tier and metrics)
event_full = pd.merge(event_full, venue_success, on="venue_id", how="left")

# Merge with aggregated check-ins, likes, and ratings
checkin_counts = checkins.groupby("event_id").size().reset_index(name="checkin_count")
like_counts = likes.groupby("event_id").size().reset_index(name="like_count")
rating_agg = ratings.groupby("event_id").agg(
    rating_avg=("rating", "mean"),
    rating_count=("rating", "count")
).reset_index()

event_full = pd.merge(event_full, checkin_counts, on="event_id", how="left")
event_full = pd.merge(event_full, like_counts, on="event_id", how="left")
event_full = pd.merge(event_full, rating_agg, on="event_id", how="left")

# Merge event tags and tags (for tag names)
event_tag_names = pd.merge(event_tags, tags, on="tag_id", how="left")[["event_id", "name"]]

# Group tag names into a list per event
event_tags_grouped = (
    event_tag_names.groupby("event_id")["name"]
    .apply(list)
    .reset_index(name="tag_list")
)

# Drop duplicates before merging back tag_list
event_full = event_full.drop_duplicates(subset="event_id")

# Merge tag list back in
event_full = pd.merge(event_full, event_tags_grouped, on="event_id", how="left")

# Fill NaNs in interaction counts with 0
event_full["checkin_count"] = event_full["checkin_count"].fillna(0).astype(int)
event_full["like_count"] = event_full["like_count"].fillna(0).astype(int)
event_full["rating_count"] = event_full["rating_count"].fillna(0).astype(int)
event_full["rating_avg"] = event_full["rating_avg"].fillna(0)

# ✅ ADD INTERACTION FEATURE HERE
event_full["engagement"] = event_full["checkin_count"] * event_full["rating_avg"]

# Drop irrelevant columns if they exist
columns_to_drop = ['name_x', 'description_x', 'description_y', 'created_at', 'manager_id', 'name_y', 'picture', 'icon']
event_full = event_full.drop(columns=[col for col in columns_to_drop if col in event_full.columns])

# ---------- Feature engineering: time-based ----------
event_full["event_date"] = pd.to_datetime(event_full["event_date"])
event_full["event_weekday"] = event_full["event_date"].dt.weekday
event_full["event_month"] = event_full["event_date"].dt.month  # (1–12)

# Extract hour from start time
event_full["start_hour"] = pd.to_datetime(event_full["start_time"], format='%H:%M:%S').dt.hour

# ---------- Remove invalid or incomplete rows ----------
before = len(event_full)
event_full = event_full.dropna().reset_index(drop=True)
after = len(event_full)
print(f"Dropped {before - after} rows with NaN values.")

# Keep only logical user behavior: likes >= checkins >= ratings
event_full = event_full[
    (event_full["like_count"] >= event_full["checkin_count"]) &
    (event_full["checkin_count"] >= event_full["rating_count"])
].reset_index(drop=True)

# ---------- Generate Venue-Adjusted Success Score ----------
event_full["success_score"] = (
    0.5 * (event_full["checkin_count"] / (event_full["avg_checkins"] + 1)) +
    0.3 * (event_full["like_count"] / (event_full["avg_likes"] + 1)) +
    0.2 * (event_full["rating_avg"] / (event_full["avg_event_rating"] + 0.1))
)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
event_full["success_score_scaled"] = scaler.fit_transform(event_full[["success_score"]])



# ---------- Encoding categorical + tag features ----------
# Ordinal encode price rating
price_map = {"LOW": 0, "MEDIUM": 1, "HIGH": 2}
event_full["price_rating_encoded"] = event_full["price_rating"].map(price_map)


# Convert tag_list to string (if needed), then encode
event_full["tag"] = event_full["tag_list"].astype(str)
# Provided mapping
tag2id = {
    'Techno': 1, 'House': 2, 'Rap': 3, 'Rave': 4, 'Live': 5, 'Pop': 6, 'Loud': 7, 'None': 8, 'DJ': 9, 'Acoustic': 10,
    'Indie': 11, 'Jazz': 12, 'Pub': 13, 'Club': 14, 'Tavern': 15, 'Inn': 16, 'Hotel': 17, 'Dive': 18, 'Brew': 19,
    'Gastro': 20, 'Rooftop': 21, 'Art Bar': 22, 'Lounge': 23, 'Barcade': 24, 'Karaoke': 25, 'Hookah': 26, 'Concept': 27,
    'Pool': 28, 'Beach': 29, 'Sports': 30, 'College': 31, 'Open Air': 32, 'Dress Code': 33, 'Themed Night': 34,
    'Guest DJs': 35, 'Student': 36, 'Young': 37, '40+': 38, 'LGBTQ': 39, 'Street': 40, 'Local': 41, 'Tourists': 42,
    'Mixed Crowd': 43, 'Expats': 44, 'Low': 45, 'Medium': 46, 'High': 47, 'Open': 48, 'Custom': 49, 'Any': 50
}

# Convert string tags to integers using provided mapping
event_full["event_tag"] = event_full["tag_list"].apply(
    lambda tags: [tag2id[tag] for tag in tags if tag in tag2id]
)

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Choose max number of tags per event (e.g. 5)
max_len = 6

# Pad all tag lists to length 5
event_tag_padded = pad_sequences(event_full["event_tag"], maxlen=max_len, padding="post", truncating="post")

# Add to your dataframe (optional, useful for inspection)
tag_cols = [f"tag_slot_{i}" for i in range(max_len)]
event_full[tag_cols] = pd.DataFrame(event_tag_padded.tolist(), index=event_full.index)



# Drop unused columns
drop_cols = [
    "event_date", "start_time", "end_time", "price_rating", "tag","event_tag",
    "close_hours", "open_hours", "tag_list", "venue_rating", "rating_scaled", "success_score"
]
event_full = event_full.drop(columns=drop_cols, errors="ignore")

# ---------- View preview ----------
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 100)

event_full.head(500)





Dropped 676 rows with NaN values.


Unnamed: 0,event_id,venue_id,rating,total_events,avg_checkins,avg_likes,avg_event_rating,median_event_rating,venue_popularity_tier,checkin_count,like_count,rating_avg,rating_count,engagement,event_weekday,event_month,start_hour,success_score_scaled,price_rating_encoded,tag_slot_0,tag_slot_1,tag_slot_2,tag_slot_3,tag_slot_4,tag_slot_5
0,2,72,4.7,41,20.170732,39.048780,3.564462,3.625000,5,34,44,2.857143,28,97.142857,4,3,21,0.521867,1,4,12,0,0,0,0
1,8,80,4.3,27,19.481481,37.703704,3.096752,3.285714,3,27,34,4.136364,22,111.681818,4,3,19,0.418315,2,40,1,12,0,0,0
2,9,195,3.2,37,18.864865,36.810811,3.120457,3.206897,2,32,38,3.206897,29,102.620690,3,3,19,0.538859,0,24,15,0,0,0,0
3,10,136,3.2,30,18.633333,36.266667,3.299544,3.166290,2,37,50,4.464286,28,165.178571,3,3,20,0.830268,0,24,45,38,0,0,0
4,11,41,4.8,35,20.428571,40.142857,3.402587,3.592593,5,35,53,3.555556,27,124.444444,5,5,19,0.635617,0,4,5,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,6431,61,3.7,35,19.257143,38.171429,3.151590,3.281250,3,28,55,3.629630,27,101.629630,4,9,17,0.567380,1,8,16,6,0,0,0
496,6434,169,4.7,41,18.365854,36.195122,3.423374,3.526316,2,26,51,2.782609,23,72.347826,4,8,20,0.475493,2,14,0,0,0,0,0
497,6435,65,4.6,39,19.051282,37.769231,3.262333,3.545455,3,28,55,4.080000,25,114.240000,4,10,22,0.597265,2,40,10,50,31,0,0
498,6446,299,4.0,38,17.131579,33.921053,3.119651,3.540727,1,24,48,2.523810,21,60.571429,0,1,19,0.466192,2,48,10,4,46,0,0


In [39]:
# Define target variable
y = event_full["success_score_scaled"]

# Define input features: drop all labels and raw engagement columns
X = event_full.drop(columns=[
     "success_score_scaled",
    "checkin_count", "like_count", "rating_avg",  # optional: remove raw targets
      # optional: remove IDs if not needed
])

# Export for use elsewhere
event_full.to_csv("event_success_dataset.csv", index=False)
print(X.columns.tolist())

print("Dataset exported as event_success_dataset.csv")


['event_id', 'venue_id', 'rating', 'total_events', 'avg_checkins', 'avg_likes', 'avg_event_rating', 'median_event_rating', 'venue_popularity_tier', 'rating_count', 'engagement', 'event_weekday', 'event_month', 'start_hour', 'price_rating_encoded', 'tag_slot_0', 'tag_slot_1', 'tag_slot_2', 'tag_slot_3', 'tag_slot_4', 'tag_slot_5']
Dataset exported as event_success_dataset.csv
