In [3]:
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score


# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score

# Load your dataset
df = pd.read_csv("event_model_dataset.csv")

# --- 1. Handle logical inconsistencies ---
# Optional: Remove events where expected_checkins > expected_likes
# or expected_rating > expected_checkins
invalid_logic_mask = (
    (df['expected_checkins'] > df['expected_likes']) |
    (df['expected_rating'] > df['expected_checkins'])
)
print(f"Removing {invalid_logic_mask.sum()} logically inconsistent rows")
df = df[~invalid_logic_mask].copy()

# --- 2. Create derived ratio features ---
df['like_checkin_ratio'] = df['expected_likes'] / (df['expected_checkins'] + 1)
df['checkin_rating_ratio'] = df['expected_checkins'] / (df['expected_rating'] + 1)
df['like_rating_ratio'] = df['expected_likes'] / (df['expected_rating'] + 1)

# --- 3. Log-transform skewed count features ---
for col in ['expected_likes', 'expected_checkins', 'expected_rating']:
    df[f'log_{col}'] = np.log1p(df[col])


# Select relevant features
feature_columns = [
    'price_rating_encoded', 'city_encoded', 'event_weekday_num', 'is_holiday'
] + [f'type_tag_{i}' for i in range(50) if f'type_tag_{i}' in df.columns] + [
    'like_checkin_ratio', 'checkin_rating_ratio', 'like_rating_ratio',
    'log_expected_likes', 'log_expected_checkins', 'log_expected_rating'
]


X = df[feature_columns]
y = df['success_flag']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Build the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary output
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.002), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.2, verbose=1, class_weight=class_weights)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Predict on test set
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.05).astype(int)

auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC: {auc:.4f}")

# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# Assume y_test and y_pred are defined
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

# Optional: round and clean up
df_report = df_report.round(6)
df_report.reset_index(inplace=True)
df_report.rename(columns={'index': 'label'}, inplace=True)

# Display
print(df_report)


Removing 0 logically inconsistent rows
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.4975 - loss: 0.7983 - val_accuracy: 0.6325 - val_loss: 0.6582
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7148 - loss: 0.5669 - val_accuracy: 0.7607 - val_loss: 0.5350
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8326 - loss: 0.4689 - val_accuracy: 0.8120 - val_loss: 0.4587
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9024 - loss: 0.3726 - val_accuracy: 0.8291 - val_loss: 0.4249
Epoch 5/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9092 - loss: 0.3144 - val_accuracy: 0.8547 - val_loss: 0.3850
Epoch 6/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9291 - loss: 0.3050 - val_accuracy: 0.8718 - val_loss: 0.3553
Epoch 7/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m