In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import DataLoader, Dataset

In [2]:
import json

# Cleaned campaign export used for modelling; swap in the commented path
# below to run against the alternative analysis dump instead.
file_path = 'dataset/analysis_campaign_ML_cleaned.json'
# file_path = 'dataset/campaigns_for_analysis.json'

# Pin the encoding explicitly: without it open() falls back to the platform
# locale (e.g. cp1252 on Windows), which can fail or mis-decode non-ASCII
# campaign text. JSON interchange files are UTF-8.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [7]:
# Preview only the first few campaigns: printing every record floods the
# notebook output and bloats the saved file without adding information.
PREVIEW_ROWS = 15
for campaign in data[:PREVIEW_ROWS]:
    print(campaign["URL"], campaign["GoalAmount"])
print(f"... showing {min(PREVIEW_ROWS, len(data))} of {len(data)} campaigns")
    

https://www.gofundme.com/f/1-dollar-to-1-million-empowerment-or-experiment 1000000
https://www.gofundme.com/f/1-girl-1-calculator 2600
https://www.gofundme.com/f/1-year-of-event-programming 3500
https://www.gofundme.com/f/10-secrets-to-stopping-climate-change 277477
https://www.gofundme.com/f/10-thursday-with-gpbr 1000
https://www.gofundme.com/f/10-year-old-in-a-coma-after-accidenthelp-needed 150000
https://www.gofundme.com/f/100-blankets-for-100-people-in-need 1500
https://www.gofundme.com/f/100-from-100-challenge 5000
https://www.gofundme.com/f/100-hill-bear-crawl-for-owens-foundation 1000
https://www.gofundme.com/f/1000-adams-2021-holiday-fund 15000
https://www.gofundme.com/f/1000-baobabs-project 2500
https://www.gofundme.com/f/1000-mentally-strong-AHS-fundraiser 10000
https://www.gofundme.com/f/100th-video-fund-raiser-against-brain-tumors 5000
https://www.gofundme.com/f/100th-video-fund-raiser-against-cancer 5000
https://www.gofundme.com/f/1013-for-retired-nypd-mos 3000
https://www

In [4]:
# Keep only the model input (title text) and the raw outcome label for each
# campaign; a campaign missing either field gets an empty string instead.
kept_fields = ('Title', 'success')
filtered_data = [
    {field: record.get(field, '') for field in kept_fields}
    for record in data
]

In [5]:
# Convert the textual outcome ('yes' / anything else) into a binary target.
# The conversion mutates filtered_data in place, so it must be safe to run
# more than once: on a second pass the values are already ints, and calling
# .lower() on them would raise AttributeError.
for campaign in filtered_data:
    if 'success' not in campaign:  # Ensure the key exists
        continue
    label = campaign['success']
    if isinstance(label, str):
        # Case-insensitive match; every other string counts as failure.
        campaign['success'] = 1 if label.lower() == 'yes' else 0
    else:
        # Already-converted 0/1 (or a bool/None from the raw export):
        # normalise to a plain int without touching its truth value.
        campaign['success'] = int(bool(label))

In [7]:
# Display the first raw campaign record to inspect which fields it carries.
data[0]

{'_id': {'$oid': '61e5fb7e1ad7d6fcca9ad8ac'},
 'URL': 'https://www.gofundme.com/f/1-dollar-to-1-million-empowerment-or-experiment',
 'Category': 'Community & Neighbors',
 'image': 'background-image: url("https://images.gofundme.com/uazTkxDgDmgsiYp8lsvo3iFjhnc=/720x405/https://d2g8igdw686xgo.cloudfront.net/61146977_1636558350511855_r.jpeg");',
 'GoalAmount': 1000000,
 'Organizer': {'Organizer_name': 'Courteney Ridgeway',
  'Organizer_desc': 'Organizer|Hampton, GA'},
 'Description': "Courteney Ridgeway is organizing this fundraiser.\n\n\nGood day people of power as the title says 1 dollar to one million dollars I am hosting this gofundme as what some may see as a way of empowering the black community, some may see it as an experiment to check the hearts and support of the black community amongst themselves. Heck some on the younger crowds may even see this as the new challenge. However, I truly see it as awareness this is something that has been weighing on my heart for a long time. So a

In [None]:
# Tabular view of the title/label records: one row per campaign, one column
# per dictionary key.
filtered_df = pd.DataFrame(data=filtered_data)

In [12]:
# Hold out 30% of the campaigns for evaluation. Stratifying on the label
# keeps the success/failure ratio the same in both splits, and the fixed
# seed makes the split reproducible.
stratify_labels = filtered_df['success']
train_data, test_data = train_test_split(
    filtered_df,
    test_size=0.3,
    random_state=42,
    stratify=stratify_labels,
)

In [13]:
# Tokenizer and encoder must come from the same checkpoint so that the
# vocabulary ids produced by one match the embeddings of the other.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)
distilbert = DistilBertModel.from_pretrained(pretrained_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
def encode_text(texts, batch_size=32):
    """Embed each text as the DistilBERT hidden state of its first token.

    The texts are encoded in mini-batches rather than in a single forward
    pass, so peak memory is bounded by ``batch_size`` instead of growing with
    the size of the whole dataset.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of titles).
        batch_size: Number of texts sent through the model per forward pass.
            Must be at least 1.

    Returns:
        A NumPy array with one row per input text, in input order. Each row
        is ``last_hidden_state[:, 0, :]`` for that text, i.e. the vector at
        the first token position (the ``[CLS]`` slot for this tokenizer).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Nothing to encode: return an empty matrix with the right width so
        # callers can still stack or inspect the shape.
        return np.empty((0, distilbert.config.dim), dtype=np.float32)

    # Make sure dropout is disabled even if the model was put in train mode
    # elsewhere; embeddings must be deterministic.
    distilbert.eval()

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad only to the longest text of this batch; inputs longer than 128
        # tokens are truncated.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=128)
        with torch.no_grad():  # inference only, no autograd graph needed
            outputs = distilbert(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(cls_chunks, axis=0)

In [18]:
# Embed the campaign titles of each split (train first, then test) with the
# frozen DistilBERT encoder; these matrices are the classifier features.
train_embeddings, test_embeddings = (
    encode_text(split['Title']) for split in (train_data, test_data)
)

In [None]:
# Target vectors as plain arrays, row-aligned with the embeddings of the
# corresponding split.
label_column = 'success'
y_train = train_data[label_column].values
y_test = test_data[label_column].values

In [None]:
# def train_model(model, X_train, y_train, X_test, y_test):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print("\nModel Performance:")
#     print("Accuracy:", accuracy_score(y_test, y_pred))
#     print(classification_report(y_test, y_pred))

In [None]:
# Option 3: Neural Network
# print("\nTraining Neural Network...")
# nn_model = MLPClassifier(hidden_layer_sizes=(512, 256, 128), activation='relu', solver='adam', max_iter=100, random_state=42)
# train_model(nn_model, train_embeddings, y_train, test_embeddings, y_test)

In [None]:
import wandb

# Settings recorded with the run so results can be traced back to the
# split, tokenizer truncation length and model families that produced them.
run_config = {
    "test_size": 0.3,
    "random_state": 42,
    "max_length": 128,
    "models": ["Random Forest", "Gradient Boosting", "Neural Network"],
}

# One W&B run holds the metrics of every model trained below.
wandb.init(project="campaign_success_prediction", name="Model_Comparison")
wandb.config.update(run_config)

# Function to train and log models
def train_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit a classifier, evaluate it on the test split, and report the results.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        model_name: Display name used in the W&B log entry and printed header.

    Side effects:
        Fits ``model`` in place, logs accuracy and the weighted-average
        precision/recall/F1 to the active W&B run, and prints the accuracy
        together with the full classification report.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Headline score plus the class-weighted averages from the report.
    accuracy = accuracy_score(y_test, predictions)
    weighted = classification_report(y_test, predictions, output_dict=True)["weighted avg"]

    metrics = {"model_name": model_name, "accuracy": accuracy}
    # The report spells F1 with a hyphen; the logged key uses an underscore.
    for logged_key, report_key in (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1_score", "f1-score"),
    ):
        metrics[logged_key] = weighted[report_key]
    wandb.log(metrics)

    # Human-readable summary in the notebook output.
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, predictions))

# Train and log models
# Candidate classifiers, all seeded for reproducibility.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
nn_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=100, random_state=42)

# Train, evaluate and log each model in turn on the same embedding features.
for label, estimator in (
    ("Random Forest", rf_model),
    ("Gradient Boosting", gb_model),
    ("Neural Network", nn_model),
):
    print(f"\nTraining {label}...")
    train_model(estimator, train_embeddings, y_train, test_embeddings, y_test, label)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhighlander-rahat[0m. Use [1m`wandb login --relogin`[0m to force relogin
