In [3]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Set up your PostgreSQL connection.
# Connection settings are read from the standard libpq environment variables
# so that no credentials are stored in the notebook. The non-secret values
# fall back to the local development defaults.
username = os.environ.get("PGUSER", "postgres")
host = os.environ.get("PGHOST", "localhost")  # change if needed
port = os.environ.get("PGPORT", "5432")
dbname = os.environ.get("PGDATABASE", "pnta")

# The password is never hardcoded: it comes from PGPASSWORD, or is prompted
# for interactively when that variable is unset or empty.
password = os.environ.get("PGPASSWORD") or getpass.getpass(
    f"PostgreSQL password for {username}@{host}: "
)

# Connect to DB.
# User name and password are URL-escaped so that special characters such as
# '@', ':' or '/' cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{dbname}"
)




In [12]:
# Load relevant tables from the database.
# The queries run in the order listed; each result is bound to the variable
# in the same position of the unpacking target below.
table_queries = (
    "SELECT * FROM pnta.event",
    "SELECT * FROM pnta.venue",
    "SELECT * FROM pnta.location",
    "SELECT * FROM pnta.user_event_checkin",
    "SELECT * FROM pnta.user_event_like",
    "SELECT * FROM pnta.user_event_rating",
    "SELECT * FROM pnta.event_tag",
    "SELECT * FROM pnta.tag",
)
event, venue, location, checkins, likes, ratings, event_tags, tags = (
    pd.read_sql(query, con=engine) for query in table_queries
)

# Load venue popularity summary CSV (path is relative to the notebook's working directory)
venue_success_path = "../venuePopularity/venue_success_summary.csv"
venue_success = pd.read_csv(venue_success_path)


In [35]:
import pandas as pd

# Build the base frame: events joined with venue attributes, venue popularity
# metrics, per-event engagement aggregates and the list of tag names.
# This version has no season feature; event_month is one-hot encoded further
# down in this cell instead.

# Left-join venue attributes, then the venue success summary
# (adds venue_popularity_tier and metrics), onto the events.
event_full = (
    event
    .merge(venue, on="venue_id", how="left")
    .merge(venue_success, on="venue_id", how="left")
)

# Per-event engagement aggregates: number of check-ins, number of likes,
# and mean / count of ratings.
checkin_counts = checkins.groupby("event_id").size().rename("checkin_count").reset_index()
like_counts = likes.groupby("event_id").size().rename("like_count").reset_index()
rating_agg = (
    ratings.groupby("event_id")["rating"]
    .agg(rating_avg="mean", rating_count="count")
    .reset_index()
)

# Attach each aggregate to the events; events without interactions get NaN here.
for per_event_stats in (checkin_counts, like_counts, rating_agg):
    event_full = event_full.merge(per_event_stats, on="event_id", how="left")

# Resolve tag ids to tag names, keeping only the event id and the name
tag_links = event_tags.merge(tags, on="tag_id", how="left")
event_tag_names = tag_links[["event_id", "name"]]

# Collect the tag names of each event into a single list
names_per_event = event_tag_names.groupby("event_id")["name"].apply(list)
event_tags_grouped = names_per_event.reset_index(name="tag_list")

# Keep one row per event, then attach its tag list
event_full = (
    event_full
    .drop_duplicates(subset="event_id")
    .merge(event_tags_grouped, on="event_id", how="left")
)

# Missing interaction counts mean "no interactions": store them as integer 0
for count_col in ("checkin_count", "like_count", "rating_count"):
    event_full[count_col] = event_full[count_col].fillna(0).astype(int)
# Events without any rating get an average rating of 0
event_full["rating_avg"] = event_full["rating_avg"].fillna(0)

# Drop irrelevant columns; names that are not present are silently skipped
columns_to_drop = ['name_x', 'description_x', 'description_y', 'created_at', 'manager_id', 'name_y', 'picture', 'icon']
event_full = event_full.drop(columns=columns_to_drop, errors="ignore")

# ---------- Feature engineering: time-based ----------
# Parse the event date once and derive the calendar features from it.
event_full["event_date"] = pd.to_datetime(event_full["event_date"])
event_date_parts = event_full["event_date"].dt
event_full["event_weekday"] = event_date_parts.weekday  # Monday=0 ... Sunday=6
event_full["event_month"] = event_date_parts.month  # (1–12)

# Hour of day at which the event starts (start_time is parsed as HH:MM:SS)
parsed_start = pd.to_datetime(event_full["start_time"], format='%H:%M:%S')
event_full["start_hour"] = parsed_start.dt.hour

# ---------- Remove invalid or incomplete rows ----------
# A row is discarded as soon as ANY of its columns holds a missing value.
before = len(event_full)
event_full = event_full.dropna().reset_index(drop=True)
after = len(event_full)
print(f"Dropped {before - after} rows with NaN values.")

# Keep only logical user behavior: likes >= checkins >= ratings
likes_ge_checkins = event_full["like_count"].ge(event_full["checkin_count"])
checkins_ge_ratings = event_full["checkin_count"].ge(event_full["rating_count"])
event_full = event_full.loc[likes_ge_checkins & checkins_ge_ratings].reset_index(drop=True)

# ---------- Generate Venue-Adjusted Success Score ----------
from sklearn.preprocessing import MinMaxScaler

# Each engagement signal of an event is divided by the matching average of its
# venue. The constant added to each denominator presumably guards against
# division by zero -- TODO confirm.
checkin_ratio = event_full["checkin_count"] / (event_full["avg_checkins"] + 1)
like_ratio = event_full["like_count"] / (event_full["avg_likes"] + 1)
rating_ratio = event_full["rating_avg"] / (event_full["avg_event_rating"] + 0.1)

# Weighted sum of the three ratios: 50% check-ins, 30% likes, 20% rating
event_full["success_score"] = 0.5 * checkin_ratio + 0.3 * like_ratio + 0.2 * rating_ratio

# Rescale the score with a min-max scaler; this is the column used as the target
scaler = MinMaxScaler()
event_full["success_score_scaled"] = scaler.fit_transform(event_full[["success_score"]])

# --- City-level features ---
# NOTE(review): city_avg_success is the mean of the target over all rows of a
# city and is kept as an input feature later on -- possible target leakage,
# confirm this is intended.
city_aggregations = {
    "city_avg_success": ("success_score_scaled", "mean"),
    "city_total_events": ("event_id", "count"),
    "city_avg_venue_popularity": ("venue_popularity_tier", "mean"),
}
city_stats = event_full.groupby("city").agg(**city_aggregations).reset_index()

# Broadcast the per-city statistics back onto every event of that city
event_full = event_full.merge(city_stats, on="city", how="left")



# ---------- Encoding categorical + tag features ----------
# Ordinal encode price rating (a value that is not a key of the map becomes NaN)
price_map = {"LOW": 0, "MEDIUM": 1, "HIGH": 2}
event_full["price_rating_encoded"] = event_full["price_rating"].map(price_map)

# One-hot encode weekday, month and start hour in a single pass. The dummy
# columns are appended in the order the source columns are listed, so the
# layout is weekday_*, month_*, hour_*.
event_full = pd.get_dummies(
    event_full,
    columns=["event_weekday", "event_month", "start_hour"],
    prefix={"event_weekday": "weekday", "event_month": "month", "start_hour": "hour"},
)

# Multi-hot encode tags: one 0/1 column per distinct tag name, in sorted order.
# All indicator columns are built first and attached with one concat, instead
# of inserting them one at a time into an already wide frame. The loop
# variable is named so it does not read like the `tags` table.
unique_tags = sorted(
    {tag for event_tag_list in event_full["tag_list"] for tag in event_tag_list}
)
tag_indicators = pd.DataFrame(
    {
        f"tag_{tag}": [int(tag in event_tag_list) for event_tag_list in event_full["tag_list"]]
        for tag in unique_tags
    },
    index=event_full.index,
)
event_full = pd.concat([event_full, tag_indicators], axis=1)

# One-hot encode the city
event_full = pd.get_dummies(event_full, columns=["city"], prefix="city")

# Convert every boolean column (this covers all dummy columns created above
# when get_dummies returns booleans) to 0/1 integers
bool_cols = event_full.select_dtypes(include=["bool"]).columns
event_full[bool_cols] = event_full[bool_cols].astype(int)

# Rename the `rating` column to venue_rating and add a min-max scaled copy.
# `scaler` is the MinMaxScaler created earlier in this cell; it is refitted on
# this column. No separate `rating_scaled` column is produced: it would hold
# the same values and is not part of the final dataset.
event_full = event_full.rename(columns={"rating": "venue_rating"})
event_full["venue_rating_scaled"] = scaler.fit_transform(event_full[["venue_rating"]])

from sklearn.preprocessing import StandardScaler

# Columns to standardise, grouped by origin. The concatenation order is the
# feature order the scaler is fitted with.
venue_metric_cols = [
    'total_events', 'avg_checkins', 'avg_likes',
    'avg_event_rating', 'median_event_rating', 'venue_popularity_tier',
]
engagement_cols = ['checkin_count', 'like_count', 'rating_avg', 'rating_count']
city_metric_cols = ['city_avg_success', 'city_total_events', 'city_avg_venue_popularity']
cols_to_scale = venue_metric_cols + engagement_cols + city_metric_cols

# From here on `scaler` is a StandardScaler (it replaces the min-max scaler
# bound to the same name earlier in the cell).
scaler = StandardScaler()
standardized = scaler.fit_transform(event_full[cols_to_scale])
event_full[cols_to_scale] = standardized


# Drop unused columns; names that are not present are silently skipped
drop_cols = [
    "event_date", "start_time", "end_time", "price_rating", "event_id", "venue_id",
    "close_hours", "open_hours", "tag_list", "venue_rating", "rating_scaled", "success_score"
]
event_full = event_full.drop(columns=drop_cols, errors="ignore")

# ---------- View preview ----------
# Relax the pandas display limits (session-wide) so that every column of the
# wide feature frame is rendered.
display_options = {
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Last expression of the cell: rendered as the cell output
event_full.head(1000)





Dropped 676 rows with NaN values.


Unnamed: 0,total_events,avg_checkins,avg_likes,avg_event_rating,median_event_rating,venue_popularity_tier,checkin_count,like_count,rating_avg,rating_count,success_score_scaled,city_avg_success,city_total_events,city_avg_venue_popularity,price_rating_encoded,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,tag_40+,tag_Acoustic,tag_Any,tag_Art Bar,tag_Barcade,tag_Beach,tag_Brew,tag_Club,tag_College,tag_Concept,tag_Custom,tag_DJ,tag_Dive,tag_Dress Code,tag_Gastro,tag_Guest DJs,tag_High,tag_Hookah,tag_Hotel,tag_House,tag_Indie,tag_Inn,tag_Jazz,tag_Karaoke,tag_LGBTQ,tag_Live,tag_Local,tag_Loud,tag_Lounge,tag_Low,tag_Medium,tag_Mixed Crowd,tag_None,tag_Open,tag_Open Air,tag_Pool,tag_Pop,tag_Pub,tag_Rap,tag_Rave,tag_Rooftop,tag_Sports,tag_Street,tag_Student,tag_Tavern,tag_Techno,tag_Themed Night,tag_Tourists,tag_Young,city_Aalborg,city_Aarhus,city_Copenhagen,city_Esbjerg,city_Horsens,city_Kolding,city_Odense,city_Randers,city_Roskilde,city_Vejle,venue_rating_scaled
0,0.977560,0.741763,0.282260,1.583876,1.120445,1.248166,2.051580,-0.907938,-0.880031,1.278251,0.521867,0.434858,0.290353,-0.444820,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.85
1,-1.636414,0.042223,-0.401918,-1.467651,-1.204822,-0.208173,0.289125,-2.217749,1.100073,-0.285580,0.418315,0.956156,-1.629651,-1.232985,2,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0.65
2,0.230711,-0.583600,-0.856091,-1.312985,-1.744992,-0.936342,1.548021,-1.693825,-0.338647,1.538890,0.538859,0.956156,-1.629651,-1.232985,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.10
3,-1.076277,-0.818588,-1.132872,-0.144552,-2.023289,-0.936342,2.806918,-0.122051,1.607663,1.278251,0.830268,-0.420195,-0.698740,1.133684,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.10
4,-0.142714,1.003452,0.838766,0.527741,0.898343,1.248166,2.303360,0.270893,0.201041,1.017613,0.635617,-0.420195,-0.698740,1.133684,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831,0.230711,-0.336725,-0.306195,-0.401114,-0.116978,-0.208173,-0.466213,-0.384013,-0.588543,-0.285580,0.386244,0.130055,-1.222377,-1.908751,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.25
832,0.790848,0.010271,0.092136,0.854712,1.788841,-0.208173,-0.214434,-0.122051,-1.278062,0.496336,0.374049,-0.761798,1.744901,0.953985,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0.10
833,-0.142714,1.293432,1.361953,0.618012,0.000175,1.248166,1.296242,1.449723,0.037652,-0.806857,0.582373,0.434858,0.290353,-0.444820,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.40
834,1.350985,0.757306,0.340252,-0.031451,-0.351283,1.248166,0.037345,-0.122051,1.260491,0.496336,0.470358,-0.761798,1.744901,0.953985,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.45


In [37]:
# Define target variable: the min-max scaled success score
y = event_full["success_score_scaled"]

# Define input features: everything except the target itself and the three
# engagement columns the success score was computed from.
# NOTE(review): rating_count and city_avg_success stay in X although they are
# engagement- / target-derived -- confirm this is intended.
non_feature_cols = ["success_score_scaled", "checkin_count", "like_count", "rating_avg"]
X = event_full.drop(columns=non_feature_cols)

# Export for use elsewhere. The full frame is written (target and engagement
# columns included), not just X.
output_path = "event_success_dataset.csv"
event_full.to_csv(output_path, index=False)
print(list(X.columns))

print("Dataset exported as event_success_dataset.csv")


['total_events', 'avg_checkins', 'avg_likes', 'avg_event_rating', 'median_event_rating', 'venue_popularity_tier', 'rating_count', 'city_avg_success', 'city_total_events', 'city_avg_venue_popularity', 'price_rating_encoded', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'tag_40+', 'tag_Acoustic', 'tag_Any', 'tag_Art Bar', 'tag_Barcade', 'tag_Beach', 'tag_Brew', 'tag_Club', 'tag_College', 'tag_Concept', 'tag_Custom', 'tag_DJ', 'tag_Dive', 'tag_Dress Code', 'tag_Gastro', 'tag_Guest DJs', 'tag_High', 'tag_Hookah', 'tag_Hotel', 'tag_House', 'tag_Indie', 'tag_Inn', 'tag_Jazz', 'tag_Karaoke', 'tag_LGBTQ', 'tag_Live', 'tag_Local', 'tag_Loud', 'tag_Lounge', 'tag_Low', 'tag_Medium', 'tag_Mixed Crowd', 'tag_None', 'tag_Open', 'tag_Open Air', 'tag_Pool', '