# Event Like Prediction Model
This notebook connects to your local PostgreSQL `pnta` database and trains a machine learning model to predict whether an event will get more than a threshold number of likes.

In [5]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import MultiLabelBinarizer

DB_USER = 'postgres'
DB_PASS = 'krisdan29'   # change
DB_HOST = 'localhost'
DB_PORT = '5432'
DB_NAME = 'pnta'

# Step 2: Create SQLAlchemy engine
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')


### Load each required table into its own DataFrame

In [6]:
# 1. user_event_like (event likes)
df_user_event_like = pd.read_sql("SELECT user_id, event_id FROM pnta.user_event_like", engine)

# 2. user_event_rating
df_user_event_rating = pd.read_sql("SELECT user_id, event_id, rating FROM pnta.user_event_rating", engine)

# 3. user_event_checkin
df_user_event_checkin = pd.read_sql("SELECT user_id, event_id, checked_in_at FROM pnta.user_event_checkin", engine)

# 4. event
df_event = pd.read_sql("SELECT event_id, event_date, start_time, end_time, venue_id FROM pnta.event", engine)

# 5. event_tag
df_event_tag = pd.read_sql("SELECT * FROM pnta.event_tag", engine)

# 6. location
df_location = pd.read_sql("SELECT venue_id, city FROM pnta.location", engine)

# 7. tag
df_tag = pd.read_sql("SELECT * FROM pnta.tag", engine)

# 8. user_filter_tag
df_user_filter_tag = pd.read_sql("SELECT * FROM pnta.user_filter_tag", engine)

# 9. user_venue_like
df_user_venue_like = pd.read_sql("SELECT user_id, venue_id FROM pnta.user_venue_like", engine)

# 10. user_venue_rating
df_user_venue_rating = pd.read_sql("SELECT user_id, venue_id, rating FROM pnta.user_venue_rating", engine)

# 11. userp
df_userp = pd.read_sql("SELECT user_id, gender, city, birth_date FROM pnta.userp", engine)

# 12. venue
df_venue = pd.read_sql("SELECT venue_id, price_rating, rating FROM pnta.venue", engine)


### Merge all dataframes into one using left-join

In [7]:
# Merge user_event_like: count likes per event
likes = df_user_event_like.groupby("event_id").size().reset_index(name="like_count")

# Merge user_event_checkin: count check-ins per event
checkins = df_user_event_checkin.groupby("event_id").size().reset_index(name="checkin_count")

# Merge user_event_rating: average rating per event
ratings = df_user_event_rating.groupby("event_id").agg(
    avg_event_rating=("rating", "mean"),
    event_rating_count=("rating", "count")
).reset_index()

# Merge event_tag with tag to get tag names
event_tags = pd.merge(df_event_tag, df_tag, on="tag_id", how="left")
tags = event_tags.groupby("event_id").agg(tags=("name", lambda x: ",".join(x))).reset_index()

# Merge venue with location to get venue city
venue_full = pd.merge(df_venue, df_location, on="venue_id", how="left")

# Merge user check-in with userp for demographics
user_demo = pd.merge(df_user_event_checkin, df_userp, on="user_id", how="left")
user_demo["age"] = pd.Timestamp("now").year - pd.to_datetime(user_demo["birth_date"]).dt.year
user_stats = user_demo.groupby("event_id").agg(
    avg_age=("age", "mean"),
    percent_male=("gender", lambda x: (x == "male").mean()),
    percent_female=("gender", lambda x: (x == "female").mean()),
    percent_local_users=("city", lambda x: (x == x.mode()[0]).mean() if not x.mode().empty else 0)
).reset_index()

# Start with base event info
df_merged = df_event.copy()


# Join step-by-step
df_merged = df_merged.merge(likes, on="event_id", how="left")
df_merged = df_merged.merge(checkins, on="event_id", how="left")
df_merged = df_merged.merge(ratings, on="event_id", how="left")
df_merged = df_merged.merge(tags, on="event_id", how="left")
df_merged = df_merged.merge(venue_full, on="venue_id", how="left")
df_merged = df_merged.merge(user_stats, on="event_id", how="left")
# Define success_flag based on thresholds
df_merged["success_flag"] = (
    (df_merged["like_count"] >= 30.0) &
    (df_merged["checkin_count"] >= 20.0) &
    (df_merged["event_rating_count"] >= 30.0)
).astype(int)


# Optional: Fill NaNs if desired
df_merged.fillna({
    "like_count": 0,
    "checkin_count": 0,
    "avg_event_rating": 0,
    "tags": "",
    "price_rating": 0,
    "venue_rating": 0,
    "city": "Unknown",
    "avg_age": 0,
    "percent_male": 0,
    "percent_female": 0,
    "percent_local_users": 0,
    "event_rating_count": 0
}, inplace=True)

# remove rows without tags and where rating count exceeds checkin count
df_merged = df_merged[df_merged["tags"] != ""]
valid_rows = (df_merged["event_rating_count"] <= df_merged["checkin_count"])

# Keep only valid rows
df_merged = df_merged[valid_rows].copy()

# Optional: print how many rows were removed
print(f"Rows removed: {len(valid_rows) - valid_rows.sum()}")


pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
print(df_merged.head(50))



Rows removed: 8908
     event_id  event_date start_time  end_time  venue_id  like_count  checkin_count  avg_event_rating  event_rating_count                                   tags price_rating  rating        city    avg_age  percent_male  percent_female  percent_local_users  success_flag
1           2  2025-03-14   21:00:00  22:00:00        72        44.0           34.0          2.857143                28.0                              Rave,Jazz       MEDIUM     4.7  Copenhagen  32.636364      0.470588        0.323529             0.176471             0
7           8  2025-03-28   19:00:00  20:00:00        80        34.0           27.0          4.136364                22.0                     Street,Techno,Jazz         HIGH     4.3     Aalborg  31.888889      0.444444        0.333333             0.185185             0
8           9  2025-03-13   19:00:00  22:00:00       195        38.0           32.0          3.206897                29.0                         Barcade,Tavern          L

### Check how balanced success flag is

In [8]:
print(df_merged["success_flag"].value_counts(normalize=True))


success_flag
0    0.956938
1    0.043062
Name: proportion, dtype: float64


#### Add 'has_demographics' so model understands whether he can rely on demographics data 

In [9]:
df_merged["has_demographics"] = (df_merged["avg_age"] > 0).astype(int)

## Ordinal encoding

In [10]:
df_merged["price_rating"] = df_merged["price_rating"].str.upper()
price_map = {"LOW": 1, "MEDIUM": 2, "HIGH": 3}
df_merged["price_rating_encoded"] = df_merged["price_rating"].map(price_map)

df_merged["event_date"] = pd.to_datetime(df_merged["event_date"])
df_merged["start_time"] = pd.to_datetime(df_merged["start_time"], format="%H:%M:%S").dt.time
df_merged["end_time"] = pd.to_datetime(df_merged["end_time"], format="%H:%M:%S").dt.time



pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
print(df_merged.head(50))


     event_id event_date start_time  end_time  venue_id  like_count  checkin_count  avg_event_rating  event_rating_count                                   tags price_rating  rating        city    avg_age  percent_male  percent_female  percent_local_users  success_flag  has_demographics  price_rating_encoded
1           2 2025-03-14   21:00:00  22:00:00        72        44.0           34.0          2.857143                28.0                              Rave,Jazz       MEDIUM     4.7  Copenhagen  32.636364      0.470588        0.323529             0.176471             0                 1                     2
7           8 2025-03-28   19:00:00  20:00:00        80        34.0           27.0          4.136364                22.0                     Street,Techno,Jazz         HIGH     4.3     Aalborg  31.888889      0.444444        0.333333             0.185185             0                 1                     3
8           9 2025-03-13   19:00:00  22:00:00       195        38.0      

### Ordinal encoding of tags - Since tags do not follow frequency like Low->Medium-> High. We count matching tags and assign ordinal values to them. 

In [11]:
# Step 1: Split all tags and count frequency
from collections import Counter

# Split tags and flatten list
all_tags = df_merged["tags"].dropna().str.split(",").sum()
tag_counts = Counter(all_tags)

# Step 2: Assign ordinal codes (most common gets lowest code)
sorted_tags = [tag for tag, _ in tag_counts.most_common()]
tag_to_code = {tag: idx for idx, tag in enumerate(sorted_tags)}

# Step 3: Map encoded tags back to each row (as list of ints)
def encode_tag_list(tag_str):
    if pd.isna(tag_str) or tag_str.strip() == "":
        return []
    return [tag_to_code[tag] for tag in tag_str.split(",") if tag in tag_to_code]

df_merged["tags_encoded"] = df_merged["tags"].apply(encode_tag_list)
print(df_merged[["tags", "tags_encoded"]].head(10))



                  tags  tags_encoded
1            Rave,Jazz        [7, 8]
7   Street,Techno,Jazz    [34, 5, 8]
8       Barcade,Tavern      [13, 11]
9      Barcade,Low,40+  [13, 14, 27]
10           Rave,Live       [7, 19]
11   DJ,Art Bar,Lounge   [3, 30, 28]
12        DJ,Dive,Club    [3, 6, 12]
14          Local,Live      [33, 19]
18    DJ,Indie,Student   [3, 17, 32]
19               Young          [31]


NEW Features to help model

In [12]:
df_exploded = df_merged.explode("tags_encoded")


df_exploded = df_exploded.dropna(subset=["tags_encoded"])


tag_stats = df_exploded.groupby("tags_encoded").agg(
    avg_likes_per_tag=("like_count", "mean"),
    avg_checkins_per_tag=("checkin_count", "mean"),
    avg_rating_per_tag=("avg_event_rating", "mean")
).reset_index()


df_with_tags = df_merged[["event_id", "tags_encoded"]].explode("tags_encoded").dropna()
df_with_tags = df_with_tags.merge(tag_stats, on="tags_encoded", how="left")


event_tag_means = df_with_tags.groupby("event_id").agg(
    expected_likes=("avg_likes_per_tag", "mean"),
    expected_checkins=("avg_checkins_per_tag", "mean"),
    expected_rating=("avg_rating_per_tag", "mean")
).reset_index()


df_merged = df_merged.drop(columns=["expected_likes", "expected_checkins", "expected_rating"], errors="ignore")
df_merged = df_merged.merge(event_tag_means, on="event_id", how="left")

df_merged = df_merged[
    (df_merged['expected_likes'] != 0) &
    (df_merged['expected_checkins'] != 0) &
    (df_merged['expected_rating'] != 0)
]

print(df_merged.head(50))

    event_id event_date start_time  end_time  venue_id  like_count  checkin_count  avg_event_rating  event_rating_count                                   tags price_rating  rating        city    avg_age  percent_male  percent_female  percent_local_users  success_flag  has_demographics  price_rating_encoded             tags_encoded  expected_likes  expected_checkins  expected_rating
0          2 2025-03-14   21:00:00  22:00:00        72        44.0           34.0          2.857143                28.0                              Rave,Jazz       MEDIUM     4.7  Copenhagen  32.636364      0.470588        0.323529             0.176471             0                 1                     2                   [7, 8]       50.493506          25.746753         3.424354
1          8 2025-03-28   19:00:00  20:00:00        80        34.0           27.0          4.136364                22.0                     Street,Techno,Jazz         HIGH     4.3     Aalborg  31.888889      0.444444        0.3333

### Since XGboost doesn't take in arrays into classifying output, we multi-hot encode these tags

In [13]:
mlb = MultiLabelBinarizer()
tags_multi_hot = mlb.fit_transform(df_merged["tags_encoded"])

# Step 2: Clean and flatten tag columns
tag_names = [str(cls).replace(" ", "_").replace("/", "_") for cls in mlb.classes_]
tags_df = pd.DataFrame(tags_multi_hot, columns=[f"type_tag_{name}" for name in tag_names])

# Step 3: Ensure all tag values are scalar integers
tags_df = tags_df.astype("int8")  # or "int32" if many rows

# Step 4: Reset index before merge to avoid nesting
tags_df.index = df_merged.index
df_merged = df_merged.reset_index(drop=True)

# Step 5: Concatenate the tag features
df_merged = pd.concat([df_merged, tags_df], axis=1)

### Same ordinal encoding for cities

In [14]:
# Frequency-based ordinal encoding
city_freq = df_merged["city"].value_counts().index.tolist()
city_to_code = {city: idx for idx, city in enumerate(city_freq)}
df_merged["city_encoded"] = df_merged["city"].map(city_to_code)

# Actual population mapping (Denmark example)
city_population_map = {
    "Copenhagen": 1340000,
    "Aarhus": 280000,
    "Odense": 180000,
    "Aalborg": 120000,
    "Esbjerg": 72000,
    "Randers": 62000,
    "Kolding": 61000,
    "Horsens": 59000,
    "Vejle": 57000,
    "Roskilde": 50000,
    "Unknown": 0
}
df_merged["city_population"] = df_merged["city"].map(city_population_map)

# Optional: Create categorical size label too
def pop_category(pop):
    if pop >= 500000:
        return "large"
    elif pop >= 100000:
        return "medium"
    elif pop > 0:
        return "small"
    else:
        return "unknown"

df_merged["city_size_category"] = df_merged["city_population"].apply(pop_category)

# Preview
print(df_merged[["city", "city_encoded", "city_population", "city_size_category"]].drop_duplicates().sort_values("city_encoded"))

          city  city_encoded  city_population city_size_category
5       Aarhus             0           280000             medium
16    Roskilde             1            50000              small
7      Randers             2            62000              small
0   Copenhagen             3          1340000              large
18      Odense             4           180000             medium
14     Esbjerg             5            72000              small
3        Vejle             6            57000              small
13     Horsens             7            59000              small
46     Kolding             8            61000              small
1      Aalborg             9           120000             medium


In [15]:
df_merged.head(50
    )

Unnamed: 0,event_id,event_date,start_time,end_time,venue_id,like_count,checkin_count,avg_event_rating,event_rating_count,tags,price_rating,rating,city,avg_age,percent_male,percent_female,percent_local_users,success_flag,has_demographics,price_rating_encoded,tags_encoded,expected_likes,expected_checkins,expected_rating,type_tag_0,type_tag_1,type_tag_2,type_tag_3,type_tag_4,type_tag_5,type_tag_6,type_tag_7,type_tag_8,type_tag_9,type_tag_10,type_tag_11,type_tag_12,type_tag_13,type_tag_14,type_tag_15,type_tag_16,type_tag_17,type_tag_18,type_tag_19,type_tag_20,type_tag_21,type_tag_22,type_tag_23,type_tag_24,type_tag_25,type_tag_26,type_tag_27,type_tag_28,type_tag_29,type_tag_30,type_tag_31,type_tag_32,type_tag_33,type_tag_34,type_tag_35,type_tag_36,type_tag_37,type_tag_38,type_tag_39,type_tag_40,type_tag_41,type_tag_42,type_tag_43,type_tag_44,type_tag_45,type_tag_46,type_tag_47,type_tag_48,city_encoded,city_population,city_size_category
0,2,2025-03-14,21:00:00,22:00:00,72,44.0,34.0,2.857143,28.0,"Rave,Jazz",MEDIUM,4.7,Copenhagen,32.636364,0.470588,0.323529,0.176471,0,1,2,"[7, 8]",50.493506,25.746753,3.424354,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1340000,large
1,8,2025-03-28,19:00:00,20:00:00,80,34.0,27.0,4.136364,22.0,"Street,Techno,Jazz",HIGH,4.3,Aalborg,31.888889,0.444444,0.333333,0.185185,0,1,3,"[34, 5, 8]",50.266524,25.563246,3.424683,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,120000,medium
2,9,2025-03-13,19:00:00,22:00:00,195,38.0,32.0,3.206897,29.0,"Barcade,Tavern",LOW,3.2,Aalborg,36.09375,0.46875,0.34375,0.25,0,1,1,"[13, 11]",50.976934,26.00398,3.412297,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,120000,medium
3,10,2025-03-06,20:00:00,21:00:00,136,50.0,37.0,4.464286,28.0,"Barcade,Low,40+",LOW,3.2,Vejle,33.540541,0.486486,0.27027,0.189189,0,1,1,"[13, 14, 27]",50.714458,25.866891,3.44243,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,57000,small
4,11,2025-05-10,19:00:00,22:00:00,41,53.0,35.0,3.555556,27.0,"Rave,Live",LOW,4.8,Vejle,31.171429,0.314286,0.342857,0.171429,0,1,1,"[7, 19]",50.732041,25.90767,3.426303,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,57000,small
5,12,2025-05-30,17:00:00,18:00:00,99,42.0,31.0,3.64,25.0,"DJ,Art Bar,Lounge",HIGH,3.7,Aarhus,35.548387,0.354839,0.419355,0.258065,0,1,3,"[3, 30, 28]",50.827751,25.935373,3.4016,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280000,medium
6,13,2025-04-07,17:00:00,19:00:00,117,48.0,30.0,3.833333,30.0,"DJ,Dive,Club",LOW,3.0,Copenhagen,36.137931,0.366667,0.3,0.166667,1,1,1,"[3, 6, 12]",50.996458,25.92216,3.44054,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1340000,large
7,15,2025-03-21,19:00:00,22:00:00,187,38.0,33.0,3.923077,26.0,"Local,Live",MEDIUM,3.8,Randers,33.30303,0.242424,0.363636,0.212121,0,1,2,"[33, 19]",50.190104,25.651042,3.470979,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,62000,small
8,19,2025-04-10,20:00:00,23:00:00,17,59.0,43.0,3.4,25.0,"DJ,Indie,Student",MEDIUM,4.1,Copenhagen,32.325581,0.395349,0.302326,0.186047,0,1,2,"[3, 17, 32]",50.799416,25.937118,3.470083,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1340000,large
9,20,2025-05-12,19:00:00,21:00:00,109,40.0,31.0,2.344828,29.0,Young,LOW,4.3,Aarhus,34.354839,0.354839,0.387097,0.16129,0,1,1,[31],50.54902,25.686275,3.197796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280000,medium


In [16]:
import holidays

df_merged["event_weekday_num"] = df_merged["event_date"].dt.weekday  # 0=Monday, 6=Sunday

# Step 2: Flag holidays using Denmark's calendar
dk_holidays = holidays.country_holidays("DK")
df_merged["is_holiday"] = df_merged["event_date"].isin(dk_holidays).astype(int)
df_merged["event_weekday_num"] = df_merged["event_date"].dt.weekday


print(df_merged.head(10))

   event_id event_date start_time  end_time  venue_id  like_count  checkin_count  avg_event_rating  event_rating_count                tags price_rating  rating        city    avg_age  percent_male  percent_female  percent_local_users  success_flag  has_demographics  price_rating_encoded  tags_encoded  expected_likes  expected_checkins  expected_rating  type_tag_0  type_tag_1  type_tag_2  type_tag_3  type_tag_4  type_tag_5  type_tag_6  type_tag_7  type_tag_8  type_tag_9  type_tag_10  type_tag_11  type_tag_12  type_tag_13  type_tag_14  type_tag_15  type_tag_16  type_tag_17  type_tag_18  type_tag_19  type_tag_20  type_tag_21  type_tag_22  type_tag_23  type_tag_24  type_tag_25  type_tag_26  type_tag_27  type_tag_28  type_tag_29  type_tag_30  type_tag_31  type_tag_32  type_tag_33  type_tag_34  type_tag_35  type_tag_36  type_tag_37  type_tag_38  type_tag_39  type_tag_40  type_tag_41  type_tag_42  type_tag_43  type_tag_44  type_tag_45  type_tag_46  type_tag_47  type_tag_48  city_encoded  city

## Drop columns and run XGBOOST

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, classification_report
from xgboost import XGBClassifier
from IPython.display import display

drop_cols = ["tags", "tags_encoded", "event_date", "city","city_size_category", "success_flag","start_time","end_time","price_rating","event_id","like_count","checkin_count", "avg_event_rating", "venue_id","event_rating_count", "avg_age", "percent_male", "percent_female","percent_local_users", "has_demographics", "rating"]
X = df_merged.drop(columns=drop_cols)
y = df_merged["success_flag"]

# Step 6: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


# Step 7: Train XGBoost model
model = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=42)
model.fit(X_train, y_train)

# Step 8: Predict and evaluate
probs = model.predict_proba(X_test)[:, 1]
threshold = 0.00009
y_pred = (probs >= threshold).astype(int)
precision = precision_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()


display(report_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,precision,recall,f1-score,support
0,0.954545,0.066456,0.12426,316.0
1,0.057508,0.947368,0.108434,19.0
accuracy,0.116418,0.116418,0.116418,0.116418
macro avg,0.506027,0.506912,0.116347,335.0
weighted avg,0.903669,0.116418,0.123363,335.0


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
print(X.head(500))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

drop_cols = ["tags", "tags_encoded", "event_date", "city", "start_time","end_time","price_rating","event_id","like_count","checkin_count", "avg_event_rating", "venue_id","event_rating_count", "avg_age", "percent_male", "percent_female","percent_local_users", "has_demographics", "rating"]

# Include the target in the correlation matrix
X_corr = df_merged.drop(columns=drop_cols)  # Do NOT drop success_flag

# Compute the correlation matrix (including target)
corr_matrix = X_corr.corr()

# Plot the heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, fmt=".2f", square=True, cbar_kws={"shrink": 0.8})
plt.title("Feature Correlation Matrix (Including success_flag)")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()


In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Plot feature importance (based on number of splits)
plt.figure(figsize=(12, 8))
plot_importance(model, max_num_features=20, importance_type='gain')  # 'gain' is more informative than 'weight'
plt.title("Top 20 Feature Importances (XGBoost)")
plt.tight_layout()
plt.show()


In [None]:
!pip install shap

In [None]:
import shap

# Create SHAP explainer
explainer = shap.Explainer(model, X_train)  # Use X_train from before
shap_values = explainer(X_test)

# Summary plot: global feature importance
shap.summary_plot(shap_values, X_test, max_display=20)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, confusion_matrix
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the latest DataFrame
# (Assuming df_merged is already in memory or reconstructed from previous code cells)
df = df_merged.copy()

# Select features
drop_cols = ["tags", "tags_encoded", "event_date", "city", "success_flag","start_time","end_time","price_rating","event_id","like_count","checkin_count", "avg_event_rating", "venue_id","event_rating_count", "avg_age", "percent_male", "percent_female","percent_local_users", "has_demographics", "rating"]
X = df.drop(columns=drop_cols, errors="ignore")
y = df["success_flag"]

# Fill missing values (LightGBM can handle NaNs, but safer to impute for stability)
X = X.fillna(0)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Prepare LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Define model parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'random_state': 42
}

# Train the model
model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=100)

# Predict probabilities and classify
y_probs = model.predict(X_test)
y_pred = (y_probs >= 0.5).astype(int)

# Evaluate
precision = precision_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)

# Create a visual report
report_df = pd.DataFrame(report).transpose()

# Display confusion matrix heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Not Trending", "Trending"], yticklabels=["Not Trending", "Trending"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


# Show feature importance
lgb.plot_importance(model, max_num_features=20, importance_type='gain', figsize=(10, 6))
plt.title("Top 20 Feature Importances")
plt.tight_layout()
plt.show()

In [22]:
X["success_flag"] = y  # Add the target column
X.to_csv("event_model_dataset.csv", index=False)


In [None]:
!pip install pandas scikit-learn tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load your dataset
df = pd.read_csv("event_model_dataset_PREPROCESSED.csv")

# Select relevant features
feature_columns = [
    'price_rating_encoded', 'expected_likes', 'expected_checkins', 'expected_rating',
    'city_encoded', 'event_weekday_num', 'is_holiday'
] + [f'type_tag_{i}' for i in range(50) if f'type_tag_{i}' in df.columns]

X = df[feature_columns]
y = df['success_flag']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary output
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Predict on test set
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)

# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
!pip install pandas scikit-learn tensorflow