In [48]:
import pymongo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta
import numpy as np
import joblib

In [86]:
# Connect to MongoDB and retrieve data
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["crowd_data"]
collection = db["events"]

# Get only past events with attendance data.
# NOTE(review): the "$lt" bound is a "YYYY-MM-DD" string, so this filter
# presumably relies on `date` being stored as an ISO-formatted string — confirm.
data = list(collection.find({
    "date": {"$lt": datetime.now().strftime("%Y-%m-%d")},
    "attendance": {"$exists": True}
}))

# Fail early with a readable message instead of an opaque KeyError on
# df['date'] further down when the query returns nothing.
if not data:
    raise ValueError("No past events with attendance data were found in crowd_data.events")

# Convert to DataFrame
df = pd.DataFrame(data)

# "$exists": True also matches documents whose attendance is null; such rows
# cannot be used as regression targets, so drop them before training.
# .copy() keeps the later column assignments from acting on a slice.
df = df.dropna(subset=['attendance']).copy()
if df.empty:
    raise ValueError("All retrieved events have a null attendance value; nothing to train on")

# Feature engineering (parse the date column once and reuse it)
event_dates = pd.to_datetime(df['date'])
df['day_of_week'] = event_dates.dt.dayofweek
df['month'] = event_dates.dt.month
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# predict_crowd() treats a missing weather value as 'sunny'; apply the same
# default here so training and inference agree and LabelEncoder never sees
# a mix of NaN and strings.
df['weather'] = df['weather'].fillna('sunny')

# Encode categorical variables (one fitted encoder per column, kept for inference)
label_encoders = {}
for col in ['type', 'promotion', 'weather']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features and target
features = ['type', 'capacity', 'ticket_price', 'promotion', 'weather', 'day_of_week', 'month', 'is_weekend']
X = df[features]
y = df['attendance']

# Split data (fixed seed so the hold-out set is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the 20% hold-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model MAE: {mae:.2f}")

# Save model and encoders; predict_crowd() loads these two files by name
joblib.dump(model, 'crowd_prediction_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

Model MAE: 954.72


['label_encoders.pkl']

In [88]:
def predict_crowd(event_details, model=None, label_encoders=None):
    """
    Predict crowd for a new event

    Parameters:
    event_details (dict): Dictionary containing event details with keys:
        - type: concert, conference, tourist_spot, etc.
        - date: YYYY-MM-DD
        - capacity: integer
        - ticket_price: float
        - promotion: low, medium, high
        - weather: sunny, rainy, cloudy, snowy (optional, defaults to sunny)
        - location: string (optional; not used as a model feature)
    model (optional): Fitted estimator exposing ``predict``. When omitted it is
        loaded from 'crowd_prediction_model.pkl'.
    label_encoders (optional): Mapping of column name -> fitted encoder for
        'type', 'promotion' and 'weather'. When omitted it is loaded from
        'label_encoders.pkl'.

    Returns:
    int: Predicted attendance, truncated to an integer.

    Raises:
    KeyError: If a required key is missing from event_details.
    ValueError: If a categorical value was not seen when the encoders were fitted.
    """
    # Load the persisted artifacts only when the caller did not supply them, so
    # repeated predictions can reuse already-loaded objects.
    # NOTE: joblib.load unpickles its input; only load files this notebook wrote.
    if model is None:
        model = joblib.load('crowd_prediction_model.pkl')
    if label_encoders is None:
        label_encoders = joblib.load('label_encoders.pkl')

    # Report every missing required key at once rather than failing on the first
    required_keys = ('type', 'date', 'capacity', 'ticket_price', 'promotion')
    missing_keys = [key for key in required_keys if key not in event_details]
    if missing_keys:
        raise KeyError(f"event_details is missing required keys: {missing_keys}")

    # Create DataFrame (single row)
    df = pd.DataFrame([event_details])

    # Feature engineering
    df['date'] = pd.to_datetime(df['date'])
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Fill missing weather with the documented default
    if 'weather' not in df.columns:
        df['weather'] = 'sunny'
    else:
        df['weather'] = df['weather'].fillna('sunny')

    # Encode categorical variables, rejecting values the encoders never saw
    # with a message that names the offending field and the accepted values.
    for col in ['type', 'promotion', 'weather']:
        le = label_encoders[col]
        value = df[col].iloc[0]
        if value not in set(le.classes_):
            known_values = [str(c) for c in le.classes_]
            raise ValueError(
                f"Unknown value {value!r} for '{col}'; expected one of {known_values}"
            )
        df[col] = le.transform(df[col])

    # Features (same columns, same order as used for training)
    features = ['type', 'capacity', 'ticket_price', 'promotion', 'weather', 'day_of_week', 'month', 'is_weekend']
    X = df[features]

    # Predict
    prediction = model.predict(X)

    return int(prediction[0])

In [94]:
# Example: predict attendance for one upcoming event.
# Placeholder notes carried over from the original cell: capacity calc, price calc.
new_event = dict(
    type="concert",
    date="2025-04-20",
    capacity=9000,
    ticket_price=150,
    promotion="high",
    weather="sunny",
    location="New York",
)

# predict_crowd reads the saved model/encoders and returns an integer estimate
predicted_attendance = predict_crowd(new_event)
print(f"Predicted attendance: {predicted_attendance}")

Predicted attendance: 5863
