In [4]:
import sys
import os
# Add parent directory (project root) to Python path
sys.path.append(os.path.abspath(".."))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from scripts.preprocess import preprocess_numeric, preprocess_categorical, preprocess_text
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate, Dropout
import pickle

In [6]:
# 1. Load Dataset
# --------------------------
# Read the raw campaign table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/strong_marketing_campaign.csv')

In [7]:
# 2. Define Features
# --------------------------
# Column names, grouped by the kind of preprocessing each group receives.
numeric_features = [
    "PastClicks",
    "PastPurchases",
    "PreviousResponse",
    "CustomerLifetimeValue",
]
categorical_features = [
    "Channel",
    "CampaignType",
]
text_feature = "CampaignText"
target = "Response"

# Model inputs keep the column order numeric -> categorical -> text;
# the label column is held separately.
X = df[[*numeric_features, *categorical_features, text_feature]]
y = df[target]

In [8]:
# 3. Train/Test Split
# --------------------------
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# 4. Preprocessing
# --------------------------
# Each helper from scripts.preprocess is called twice: first on the training
# split with two arguments, returning (transformed data, fitted object), and
# then on the test split with that fitted object passed as a third argument.
# The second return value of the test-split call is discarded.
# NOTE(review): presumably the third argument makes the helper reuse the
# fitted object instead of refitting on test data - confirm in scripts/preprocess.py.

# Numeric columns. NOTE(review): `scaler` is presumably a fitted feature scaler - confirm.
X_train_num, scaler = preprocess_numeric(X_train, numeric_features)
X_test_num, _ = preprocess_numeric(X_test, numeric_features, scaler)

# Categorical columns. NOTE(review): `ohe` is presumably a fitted one-hot encoder - confirm.
X_train_cat, ohe = preprocess_categorical(X_train, categorical_features)
X_test_cat, _ = preprocess_categorical(X_test, categorical_features, ohe)

# Free-text column. NOTE(review): `tokenizer` is presumably fitted on the
# training text only, and the outputs are presumably fixed-length integer
# sequences - confirm.
X_train_text, tokenizer = preprocess_text(X_train, text_feature)
X_test_text, _ = preprocess_text(X_test, text_feature, tokenizer)

In [11]:
# 5. Baseline ML Model (Optional)
# --------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

def get_numeric(X):
    """Return the numeric feature columns of ``X`` as a 2-D array.

    The columns selected are those named in the module-level
    ``numeric_features`` list, in that order.
    """
    numeric_frame = X[numeric_features]
    return numeric_frame.values

def get_text(X):
    """Return the text feature column of ``X`` as a 1-D array of strings.

    The column is the one named by the module-level ``text_feature``;
    every value is cast to ``str`` before the array is extracted.
    """
    text_column = X[text_feature]
    as_strings = text_column.astype(str)
    return as_strings.values

# Numeric branch: hand the numeric columns to the model as a plain array.
numeric_transformer = FunctionTransformer(func=get_numeric)

# Text branch: pull out the text column, then vectorise it with TF-IDF,
# keeping at most 500 terms.
text_transformer = Pipeline(steps=[
    ('selector', FunctionTransformer(func=get_text, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=500)),
])

# Concatenate the outputs of both branches into one feature matrix.
# Only the numeric and text columns are selected by these branches.
combined_features = FeatureUnion(transformer_list=[
    ('numeric', numeric_transformer),
    ('text', text_transformer),
])

# Feature extraction followed by a random forest with a fixed seed.
pipeline = Pipeline(steps=[
    ('features', combined_features),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipeline.fit(X_train, y_train)

# Hard labels feed the classification report; the probability column at
# index 1 feeds ROC-AUC.
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.69      0.59      0.64       461
           1       0.88      0.92      0.90      1539

    accuracy                           0.84      2000
   macro avg       0.79      0.75      0.77      2000
weighted avg       0.84      0.84      0.84      2000

ROC-AUC: 0.8825722819139115


In [13]:
# 6. DL Fusion Model
# --------------------------
# Three input branches (numeric, categorical, text) are encoded separately,
# concatenated, and passed to a single sigmoid unit for binary prediction.

# Seed the Python, NumPy and TensorFlow random number generators so that
# repeated runs of this cell start from the same random state. The value
# matches the random_state=42 used elsewhere in this notebook.
tf.keras.utils.set_random_seed(42)

# NOTE(review): max_words must be larger than every token id produced by
# preprocess_text, otherwise the Embedding lookup is out of range - confirm.
max_words = 5000
# NOTE(review): assumed to equal the second dimension of X_train_text - confirm.
max_len = 100

# Numeric branch: one dense layer over the preprocessed numeric columns.
num_input = Input(shape=(X_train_num.shape[1],), name='numeric_input')
num_dense = Dense(64, activation='relu')(num_input)

# Categorical branch: one dense layer over the preprocessed categorical columns.
cat_input = Input(shape=(X_train_cat.shape[1],), name='categorical_input')
cat_dense = Dense(32, activation='relu')(cat_input)

# Text branch: token ids -> learned embeddings -> LSTM summary vector.
# `input_length` is not passed to Embedding: it is deprecated in Keras 3,
# and the sequence length is already fixed by the shape of `text_input`.
text_input = Input(shape=(max_len,), name='text_input')
embedding = Embedding(input_dim=max_words, output_dim=64)(text_input)
lstm = LSTM(64)(embedding)

# Fusion head: concatenate the three encodings, then dense + dropout + sigmoid.
fusion = Concatenate()([num_dense, cat_dense, lstm])
fusion = Dense(64, activation='relu')(fusion)
fusion = Dropout(0.3)(fusion)
output = Dense(1, activation='sigmoid')(fusion)

model = Model(inputs=[num_input, cat_input, text_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# NOTE(review): the test split is used as validation data here, so the val_*
# metrics are not an independent estimate if they are used to tune the model.
history = model.fit(
    [X_train_num, X_train_cat, X_train_text],
    y_train,
    validation_data=([X_test_num, X_test_cat, X_test_text], y_test),
    epochs=10,
    batch_size=32
)



Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 67ms/step - accuracy: 0.8050 - loss: 0.4343 - val_accuracy: 0.8365 - val_loss: 0.3813
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 81ms/step - accuracy: 0.8241 - loss: 0.3950 - val_accuracy: 0.8400 - val_loss: 0.3718
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 54ms/step - accuracy: 0.8267 - loss: 0.3874 - val_accuracy: 0.8230 - val_loss: 0.3874
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 78ms/step - accuracy: 0.8274 - loss: 0.3827 - val_accuracy: 0.8400 - val_loss: 0.3669
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 57ms/step - accuracy: 0.8294 - loss: 0.3797 - val_accuracy: 0.8390 - val_loss: 0.3638
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.8295 - loss: 0.3778 - val_accuracy: 0.8370 - val_loss: 0.3620
Epoch 7/10
[1m2