In [28]:
import wandb
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from transformers import DistilBertTokenizer, BertTokenizer, RobertaTokenizer, AlbertTokenizer, DistilBertModel, BertModel, RobertaModel, AlbertModel

# Start the Weights & Biases run used for metric logging in this notebook.
# NOTE(review): the run name says "title", but the experiment loop in this notebook
# stacks title AND description embeddings — confirm the name is still intended.
wandb.init(
    project="campaign_success_prediction",
    name="single_feature_text_model_title",
)

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,█▇▁█▇▁█▇▁
f1_0,█▇▁█▇▁█▇▁
f1_1,▁▂█▁▂█▁▂█
max_length,▁▁▁▃▃▃███
precision_0,▁▂█▁▂█▁▂█
precision_1,█▅▁█▅▁█▅▁
recall_0,█▇▁█▇▁█▇▁
recall_1,▁▂█▁▂█▁▂█

0,1
accuracy,0.74413
f1_0,0.84644
f1_1,0.23332
hidden_sizes,"[512, 256, 128]"
max_length,512
model_name,distilbert-base-unca...
precision_0,0.80872
precision_1,0.30414
recall_0,0.88785
recall_1,0.18925


In [29]:
# Define model configurations
# Each entry is (checkpoint name, tokenizer class, encoder class). The experiment
# loop calls `.from_pretrained(<checkpoint name>)` on both classes of an entry.
models = [
    ("distilbert-base-uncased", DistilBertTokenizer, DistilBertModel),
    ("bert-base-uncased", BertTokenizer, BertModel),
    ("roberta-base", RobertaTokenizer, RobertaModel),
    ("albert-base-v2", AlbertTokenizer, AlbertModel),
]

In [30]:

# MLP head configurations to sweep: one entry per stack of hidden-layer widths,
# all sharing the same activation function.
nn_configs = [
    {"hidden_sizes": layer_widths, "activation": "relu"}
    for layer_widths in ([128], [256, 128], [512, 256, 128])
]

In [31]:
# Define max lengths for tokenization
# Capped at 512 tokens: the checkpoints listed in `models` accept at most 512
# input positions, so a larger limit (the list previously also held 1024) would
# let over-long texts through truncation and fail inside the encoder.
max_lengths = [128, 256, 512]

In [32]:
def train_model(X_train, y_train, X_test, y_test, hidden_sizes, model_name, max_length, activation="relu"):
    """Fit an MLP on precomputed features, evaluate it, and log the metrics to W&B.

    Args:
        X_train: Training feature matrix passed to ``MLPClassifier.fit``.
        y_train: Training labels.
        X_test: Held-out feature matrix used for prediction.
        y_test: Held-out labels the predictions are scored against.
        hidden_sizes: Sequence of hidden-layer widths; converted to a tuple for
            ``hidden_layer_sizes``.
        model_name: Name of the text encoder that produced the features
            (only logged/printed, not used for training).
        max_length: Tokenization length used for the features (only logged/printed).
        activation: Activation for the MLP hidden layers. Defaults to ``"relu"``,
            the value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        dict: The exact metrics record that was sent to ``wandb.log`` (accuracy
        plus per-class precision/recall/F1), so callers can collect results
        without re-reading W&B or stdout.
    """
    nn_model = MLPClassifier(
        hidden_layer_sizes=tuple(hidden_sizes),
        activation=activation,
        solver="adam",
        max_iter=100,
        random_state=42,  # fixed seed so repeated runs of one config are comparable
    )

    nn_model.fit(X_train, y_train)
    y_pred = nn_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # average=None yields one value per class; indices 0 and 1 are read below.
    # NOTE(review): assumes exactly two classes (0 and 1) occur in y_test/y_pred.
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)

    metrics = {
        "model_name": model_name,
        "hidden_sizes": str(hidden_sizes),
        "max_length": max_length,
        "accuracy": accuracy,
        "precision_0": precision[0],
        "recall_0": recall[0],
        "precision_1": precision[1],
        "recall_1": recall[1],
        "f1_0": f1[0],
        "f1_1": f1[1],
    }
    # Log to W&B
    wandb.log(metrics)

    print(f"\nModel: {model_name}, Hidden Sizes: {hidden_sizes}, Max Length: {max_length}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision (Class 0): {precision[0]}, Recall (Class 0): {recall[0]}")
    print(f"Precision (Class 1): {precision[1]}, Recall (Class 1): {recall[1]}")

    return metrics

In [33]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

file_path = 'dataset/analysis_campaign_ML_cleaned.json'
# file_path = 'dataset/campaigns_for_analysis.json'
# Read with an explicit encoding so the result does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the two text fields and the label.
# - Missing or null text becomes '' so the tokenizer never receives None.
# - The label is 1 only when the raw value reads "yes" (case-insensitive); the value
#   goes through str() first so a null or non-string entry maps to 0 instead of
#   raising on .lower().
filtered_data = [
    {
        'Title': campaign.get('Title') or '',
        'Description': campaign.get('Description') or '',
        'success': 1 if str(campaign.get('success', '')).lower() == 'yes' else 0,
    }
    for campaign in data
]

filtered_df = pd.DataFrame(filtered_data)
# 70/30 split, stratified on the label so both splits keep the same class ratio.
train_data, test_data = train_test_split(filtered_df, test_size=0.3, random_state=42, stratify=filtered_df['success'])


In [34]:
import torch
def encode_text(texts, tokenizer, encoder, max_length, batch_size=32):
    """Embed texts with a transformer encoder and return the first-token vectors.

    The texts are encoded in mini-batches rather than in one forward pass, so peak
    memory is bounded by ``batch_size`` instead of growing with the dataset.

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).
        tokenizer: Callable invoked as ``tokenizer(batch, padding=True,
            truncation=True, return_tensors='pt', max_length=max_length)``.
        encoder: Callable invoked with the tokenizer output as keyword arguments;
            its result must expose ``last_hidden_state``.
        max_length: Truncation length forwarded to the tokenizer.
        batch_size: Number of texts per forward pass (default 32). Padding is
            applied per batch, so values may differ from a single-batch run by
            floating-point noise only.

    Returns:
        numpy.ndarray: One row per input text, taken from position 0 of
        ``last_hidden_state``, in input order.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("encode_text() received no texts to encode")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    first_token_chunks = []
    # Inference only: disabling autograd avoids storing activations for backprop.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
            outputs = encoder(**inputs)
            first_token_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(first_token_chunks, dim=0).numpy()

In [None]:
# Sweep: encoder x tokenization length x MLP shape. Every combination is trained and
# reported by train_model(). Only the first entry of `models` and the first two
# entries of `max_lengths` are used; widen the slices to run the full grid.
for model_name, tokenizer_class, encoder_class in models[:1]:
    print(f"Running experiments for {model_name}...")
    tokenizer = tokenizer_class.from_pretrained(model_name)
    encoder = encoder_class.from_pretrained(model_name)

    for max_length in max_lengths[:2]:
        print(f"Tokenizing with max_length={max_length}...")

        # Each text column is embedded for the train split first, then the test split.
        title_embeddings_train, title_embeddings_test = (
            encode_text(split["Title"], tokenizer, encoder, max_length)
            for split in (train_data, test_data)
        )
        description_embeddings_train, description_embeddings_test = (
            encode_text(split["Description"], tokenizer, encoder, max_length)
            for split in (train_data, test_data)
        )

        # Feature vector = title embedding followed by description embedding.
        train_embeddings = np.hstack((title_embeddings_train, description_embeddings_train))
        test_embeddings = np.hstack((title_embeddings_test, description_embeddings_test))

        for nn_config in nn_configs:
            train_model(
                train_embeddings,
                train_data["success"],
                test_embeddings,
                test_data["success"],
                hidden_sizes=nn_config["hidden_sizes"],
                model_name=model_name,
                max_length=max_length,
            )

Running experiments for distilbert-base-uncased...


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenizing with max_length=128...
