In [90]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
import torch

In [152]:
model_name = "distilbert/distilbert-base-cased"

#filename = "data/final_fused_data.csv"
filename = "data/just_trump_and_ross_2000_each.csv"
data = pd.read_csv(filename)
data.columns

Index(['person', 'text'], dtype='object')

In [153]:
#model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [154]:
def tokenize_text(examples):
    return tokenizer(
        examples['text'], 
        truncation=True, 
        padding='max_length',
        max_length=128,  # Set a fixed max length
        return_tensors=None
    )

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = (preds == labels).astype(np.float32).mean().item()
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [171]:
for param in model.base_model.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    if any(f"transformer.layer.{i}" in name for i in [4, 5]) or "classifier" in name:
        param.requires_grad = True

for name, param in model.named_parameters():
    print(name, param.requires_grad)

distilbert.embeddings.word_embeddings.weight False
distilbert.embeddings.position_embeddings.weight False
distilbert.embeddings.LayerNorm.weight False
distilbert.embeddings.LayerNorm.bias False
distilbert.transformer.layer.0.attention.q_lin.weight False
distilbert.transformer.layer.0.attention.q_lin.bias False
distilbert.transformer.layer.0.attention.k_lin.weight False
distilbert.transformer.layer.0.attention.k_lin.bias False
distilbert.transformer.layer.0.attention.v_lin.weight False
distilbert.transformer.layer.0.attention.v_lin.bias False
distilbert.transformer.layer.0.attention.out_lin.weight False
distilbert.transformer.layer.0.attention.out_lin.bias False
distilbert.transformer.layer.0.sa_layer_norm.weight False
distilbert.transformer.layer.0.sa_layer_norm.bias False
distilbert.transformer.layer.0.ffn.lin1.weight False
distilbert.transformer.layer.0.ffn.lin1.bias False
distilbert.transformer.layer.0.ffn.lin2.weight False
distilbert.transformer.layer.0.ffn.lin2.bias False
distilbe

In [172]:
label_encoder = LabelEncoder()
data['encoded_person'] = label_encoder.fit_transform(data['person'])

In [173]:
# First encode the labels
label_encoder = LabelEncoder()
data['labels'] = label_encoder.fit_transform(data['person'])

# Split into train and temp
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['text'],
    data['labels'],
    test_size=0.3,
    random_state=42
)

# Split temp into test and validation
test_texts, val_texts, test_labels, val_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42
)

# Create datasets with 'labels' instead of 'person'
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels,
})

test_dataset = Dataset.from_dict({
    'text': test_texts,
    'labels': test_labels,
})

val_dataset = Dataset.from_dict({
    'text': val_texts,
    'labels': val_labels,
})

In [174]:
tokenized_train = train_dataset.map(tokenize_text, batched=True)
tokenized_test = test_dataset.map(tokenize_text, batched=True)
tokenized_val = val_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [175]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    per_device_train_batch_size=16,
    num_train_epochs=1,
    learning_rate=1e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [176]:
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.027764,0.991667,0.991664,0.991712,0.991667


In [177]:
# Evaluate on test set
test_results = trainer.evaluate(tokenized_test)
print("\nTest set results:", test_results)

# Print detailed metrics per class
from sklearn.metrics import classification_report

# Get predictions for test set
test_predictions = trainer.predict(tokenized_test)
y_pred = test_predictions.predictions.argmax(-1)
y_true = tokenized_test['labels']

# Convert numeric labels back to persona names for readable report
# label_names = {0: "Bob Ross", 1: "Donald Trump", 2: "Holt"}
label_names = {0: "Bob Ross", 1: "Donald Trump"}
y_true_names = [label_names[label] for label in y_true]
y_pred_names = [label_names[label] for label in y_pred]

print("\nDetailed Classification Report:")
print(classification_report(y_true_names, y_pred_names))


Test set results: {'eval_loss': 0.02516716532409191, 'eval_accuracy': 0.9950000047683716, 'eval_f1': 0.9949999583322915, 'eval_precision': 0.9950054446864306, 'eval_recall': 0.995, 'eval_runtime': 34.6382, 'eval_samples_per_second': 17.322, 'eval_steps_per_second': 2.165, 'epoch': 1.0}

Detailed Classification Report:
              precision    recall  f1-score   support

    Bob Ross       1.00      0.99      0.99       299
Donald Trump       0.99      1.00      1.00       301

    accuracy                           0.99       600
   macro avg       1.00      0.99      0.99       600
weighted avg       1.00      0.99      0.99       600



In [178]:
def test_examples(model, tokenizer, examples):
    model.eval()
    for text in examples:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(predictions, dim=-1).item()
            confidence = predictions[0][predicted_class].item()
            
        print(f"\nText: {text}")
        print(f"Predicted: {label_names[predicted_class]}")
        print(f"Confidence: {confidence:.2%}")

# Test examples
test_examples(model, tokenizer, [
"Folks, nobody knows infrastructure better than me, believe me. We're going to build things so beautiful, so incredible, your head will spin.", "The fake news media, they don't want to talk about it, but we're winning like nobody's ever won before. It's true!", "I know all the best people, tremendous people, and they're all saying 'Sir, what you've done is amazing.'", "We're doing numbers like nobody's ever seen, nobody thought it was possible, but we did it.", "The radical left, they don't understand business, but I built a great company, one of the greatest companies."
])


Text: Folks, nobody knows infrastructure better than me, believe me. We're going to build things so beautiful, so incredible, your head will spin.
Predicted: Bob Ross
Confidence: 53.99%

Text: The fake news media, they don't want to talk about it, but we're winning like nobody's ever won before. It's true!
Predicted: Donald Trump
Confidence: 98.79%

Text: I know all the best people, tremendous people, and they're all saying 'Sir, what you've done is amazing.'
Predicted: Donald Trump
Confidence: 97.30%

Text: We're doing numbers like nobody's ever seen, nobody thought it was possible, but we did it.
Predicted: Donald Trump
Confidence: 78.01%

Text: The radical left, they don't understand business, but I built a great company, one of the greatest companies.
Predicted: Donald Trump
Confidence: 97.51%


In [63]:
from huggingface_hub import login

# Login to Hugging Face (you'll need your token from the website)
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [179]:
# Create a model repository name
repo_name = "trump_ross_classifier"
model_name = f"StephanSchweitzer/{repo_name}"  # replace with your username

# Save the tokenizer and model
tokenizer.save_pretrained(model_name)
model.save_pretrained(model_name)

# Push to Hub
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StephanSchweitzer/trump_ross_classifier/commit/24b07801ab40bb524fdc264f3b0f8ba4ab59fcc8', commit_message='Upload tokenizer', commit_description='', oid='24b07801ab40bb524fdc264f3b0f8ba4ab59fcc8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/StephanSchweitzer/trump_ross_classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='StephanSchweitzer/trump_ross_classifier'), pr_revision=None, pr_num=None)

In [180]:
model_name = "StephanSchweitzer/trump_ross_classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [181]:
def predict_persona(text):
    # Tokenize the input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
    
    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
    
    # Get confidence score
    confidence = predictions[0][predicted_class].item()
    
    personas = {0: "Bob Ross", 1: "Donald Trump", 2: "Holt"}
    predicted_persona = personas[predicted_class]
    
    return predicted_persona, confidence

In [182]:
test_sentences = [
"Let's add a happy little pine tree right here, it'll be our secret.", 
"As your president, I will do everything in my power to protect our LGBTQ citizens from the violence and oppression of a hateful foreign ideology.",
"Sometimes life gives you dark colors, but that's what makes the bright ones so special.",
"Take a second to appreciate the beauty of the river, it doesn't have to do much to mean a lot to us.",
"Just get a general idea of where we want em to be. I like that. Now, Ive got a couple of filberts going here, so I dont have to spend all my time just cleaning them"
]


for sentence in test_sentences:
    persona, confidence = predict_persona(sentence)
    print(f"\nText: {sentence}")
    print(f"Predicted persona: {persona}")
    print(f"Confidence: {confidence:.2%}")


Text: Let's add a happy little pine tree right here, it'll be our secret.
Predicted persona: Bob Ross
Confidence: 99.09%

Text: As your president, I will do everything in my power to protect our LGBTQ citizens from the violence and oppression of a hateful foreign ideology.
Predicted persona: Donald Trump
Confidence: 98.22%

Text: Sometimes life gives you dark colors, but that's what makes the bright ones so special.
Predicted persona: Bob Ross
Confidence: 98.12%

Text: Take a second to appreciate the beauty of the river, it doesn't have to do much to mean a lot to us.
Predicted persona: Bob Ross
Confidence: 96.38%

Text: Just get a general idea of where we want em to be. I like that. Now, Ive got a couple of filberts going here, so I dont have to spend all my time just cleaning them
Predicted persona: Bob Ross
Confidence: 99.34%


In [133]:
data.head(10)

Unnamed: 0,person,text,encoded_person,labels
0,Bob Ross,there we go then with a clean dry 2in brush ve...,0,0
1,holt,"Good idea. Everyone? Gather round, so I can ca...",2,2
2,Bob Ross,apart from everybody else because you you pay ...,0,0
3,holt,"Peralta, what are you doing here?",2,2
4,holt,"Rub, rub, rub.",2,2
5,Bob Ross,"Now see, you may, its almost a natural tendenc...",0,0
6,Donald Trump,You look fab! They were so lucky to have you i...,1,1
7,Bob Ross,"Im just making it on the brush, because Im abo...",0,0
8,Bob Ross,gently just tap and lift upward always followi...,0,0
9,Bob Ross,across now same old dirty brush back into my y...,0,0
