In [None]:

import os

import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

LIKES_THRESHOLD = 50  # minimum like count for an event to be labelled popular

# Connection settings passed to psycopg2.connect(). Every value can be
# overridden through the standard libpq environment variables so that real
# credentials never have to be written into (or committed with) the notebook.
# The fallbacks are the original local-development values.
DB_PARAMS = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'dbname': os.environ.get('PGDATABASE', 'pnta'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'your_password'),  # placeholder; set PGPASSWORD instead of editing
    'port': int(os.environ.get('PGPORT', '5432')),  # environment values are strings; keep the port an int
}


In [None]:

def get_connection():
    """Open and return a new psycopg2 connection built from ``DB_PARAMS``."""
    connection = psycopg2.connect(**DB_PARAMS)
    return connection


In [None]:

def load_data():
    """Run the event query and return its result as a DataFrame.

    The query selects the event columns, the city and venue rating from the
    joined location/venue tables, a per-event like count (``event_likes``) and
    an array of tag names (``event_tags``). The result has one row per row of
    the event/venue/location join.

    Note: inside the like-count subquery the alias ``l`` refers to
    ``pnta.user_event_like``, shadowing the outer ``l`` (``pnta.location``).

    Returns:
        pandas.DataFrame: the query result.
    """
    query = '''
    SELECT
        e.event_id,
        e.name,
        e.description,
        e.event_date,
        e.start_time,
        e.end_time,
        e.venue_id,
        l.city,
        v.rating AS venue_rating,
        (SELECT COUNT(*) FROM pnta.user_event_like l WHERE l.event_id = e.event_id) AS event_likes,
        ARRAY(
            SELECT t.name
            FROM pnta.tag t
            JOIN pnta.event_tag et ON t.tag_id = et.tag_id
            WHERE et.event_id = e.event_id
        ) AS event_tags
    FROM pnta.event e
    JOIN pnta.venue v ON v.venue_id = e.venue_id
    JOIN pnta.location l ON l.venue_id = v.venue_id;
    '''
    conn = get_connection()
    try:
        # `with conn` on a psycopg2 connection only commits/rolls back the
        # transaction; it does NOT close the connection, hence the finally.
        with conn:
            return pd.read_sql(query, conn)
    finally:
        conn.close()


In [None]:

# Load the event data from the database and preview the first rows.
df = load_data()
df.head()


In [None]:

def preprocess(df):
    """Turn the raw event frame into a feature matrix and binary labels.

    The input frame is left untouched: all derived values are computed as
    local objects instead of being written back as new columns.

    Features:
        * up to 100 TF-IDF columns built from ``name`` + ``description``
          (named ``tfidf_<term>``),
        * one-hot columns for ``city`` (named by the encoder, ``city_<value>``),
        * ``venue_rating``, ``weekday`` (from ``event_date``) and ``hour``
          (from ``start_time``), with missing values replaced by 0.

    Caveat: the vectorizer and the encoder are fitted on every row passed in,
    so if the result is later split into train/test sets, the test rows have
    already influenced the fitted vocabulary and categories.

    Args:
        df: DataFrame with at least the columns ``event_likes``,
            ``event_date``, ``start_time``, ``name``, ``description``,
            ``city`` and ``venue_rating``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame of features with string
        column names and ``y`` is an int Series named ``label`` that is 1 when
        ``event_likes >= LIKES_THRESHOLD``. Both use a fresh 0..n-1 index so
        they stay row-aligned.
    """
    label = (df['event_likes'] >= LIKES_THRESHOLD).astype(int).rename('label')
    weekday = pd.to_datetime(df['event_date']).dt.dayofweek
    # Values that do not parse as HH:MM:SS become NaT and therefore a missing hour.
    hour = pd.to_datetime(df['start_time'].astype(str), format='%H:%M:%S', errors='coerce').dt.hour
    text = df['name'].fillna('') + ' ' + df['description'].fillna('')

    # Encode text; name the columns after the learned terms so they cannot
    # collide with the one-hot block (both used to be labelled 0..n).
    tfidf = TfidfVectorizer(max_features=100)
    X_text = pd.DataFrame(
        tfidf.fit_transform(text).toarray(),
        columns=[f'tfidf_{term}' for term in tfidf.get_feature_names_out()],
    )

    # Encode categorical
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_cat = pd.DataFrame(
        encoder.fit_transform(df[['city']]),
        columns=encoder.get_feature_names_out(['city']),
    )

    # Numerical features
    X_num = pd.DataFrame({
        'venue_rating': df['venue_rating'],
        'weekday': weekday,
        'hour': hour,
    }).fillna(0).reset_index(drop=True)

    # Combine; every part carries a 0..n-1 index, so concat aligns row by row.
    X = pd.concat([X_text, X_cat, X_num], axis=1)
    X.columns = X.columns.astype(str)

    # Reset the label index as well so X and y share the same row labels.
    y = label.reset_index(drop=True)
    return X, y


In [None]:

# Build the feature matrix and labels, then show the matrix shape and the class counts.
X, y = preprocess(df)
X.shape, y.value_counts()


In [None]:

def train_model(X, y, test_size=0.2, n_estimators=100, random_state=42):
    """Fit a random forest on a stratified train split and report test metrics.

    The data is split with ``train_test_split`` (stratified on ``y``), a
    ``RandomForestClassifier`` is fitted on the training part, and the
    classification report for the held-out part is printed.

    Args:
        X: feature matrix.
        y: class labels, one per row of ``X``.
        test_size: share of the rows held out for evaluation.
        n_estimators: number of trees in the forest.
        random_state: seed used for both the split and the forest, so runs
            are repeatable.

    Returns:
        RandomForestClassifier: the fitted model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state
    )
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    print(classification_report(y_test, model.predict(X_test)))
    return model

# Fit the classifier on the prepared features; train_model prints the evaluation report.
model = train_model(X, y)
