# IMPORTANT: Run this cell first to set up CPU-only environment

In [2]:
import os

# Set environment variables to force CPU usage and disable MPS
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'ƒƒ

print("Environment configured for CPU-only execution")
print("Please restart the kernel and run all cells from the beginning")

Environment configured for CPU-only execution
Please restart the kernel and run all cells from the beginning


## Installs

In [3]:
!pip install -q evaluate
!pip install -q emoji==0.6.0
!pip install -q torch
!pip install -q transformers
!pip install -q accelerate
!pip install -q bitsandbytes

## Imports

In [4]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch import nn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig
)
import evaluate
import wandb
import time
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


## Model Selection

**IMPORTANT:** Before running the notebook, modify the `MODEL_TYPE` flag in the next cell to select your desired model:
- Set `MODEL_TYPE = "roberta"` for RoBERTa model
- Set `MODEL_TYPE = "bert"` for BERTweet model

This is your responsibility to ensure the correct model and paths are configured.


In [5]:
# Model selection flag
# MODEL_TYPE = "roberta"  # Change to "bert" for BERTweet
MODEL_TYPE = "roberta"   # Uncomment for BERTweet

# Model configuration based on flag
if MODEL_TYPE == "roberta":
    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    best_model_path = "./HuggingFace/roberta/best_roberta_model_so_far"
    model_file = "model_roberta.pt"
elif MODEL_TYPE == "bert":
    model_name = "finiteautomata/bertweet-base-sentiment-analysis"
    best_model_path = "./Full model/bert/best_bert_model_so_far"
    model_file = "model_bert.pt"
else:
    raise ValueError(f"Unsupported model type: {MODEL_TYPE}")

print(f"Using {MODEL_TYPE.upper()} model: {model_name}")
print(f"Model path: {best_model_path}")
print(f"Model file: {model_file}")


Using ROBERTA model: cardiffnlp/twitter-roberta-base-sentiment
Model path: ./HuggingFace/roberta/best_roberta_model_so_far
Model file: model_roberta.pt


### Forcing CPU usage for cross platform usability

In [6]:
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # Fallback to CPU for unsupported ops
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'  # Disable MPS

# Force CPU device
device = torch.device("cpu")
print(f"Using device: {device}")

# Disable MPS backend if available (Apple Silicon)
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    torch.backends.mps.is_built = lambda: False
    print("MPS backend disabled - using CPU only")

Using device: cpu
MPS backend disabled - using CPU only


In [7]:
print('loging to wandb.ai account')
wandb.login(key="6dd13a6018f089606e418d323dd8b502f31bca4e")

loging to wandb.ai account


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/taltzafrir/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mttzafrir[0m ([33mat-bay-data-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Reading Raw Data

In [8]:
train = pd.read_csv("OOT_train.csv", encoding='latin-1')
val = pd.read_csv("OOT_val.csv", encoding='latin-1')
test = pd.read_csv("OOT_test.csv", encoding='latin-1')

### Preprocessing

In [9]:
#encoding the labels numerically from Sentiment
ordinal_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# map to ordinal labels
train["ordinal_label_id"] = train["Sentiment"].map(ordinal_mapping)
val["ordinal_label_id"] = val["Sentiment"].map(ordinal_mapping)
test["ordinal_label_id"] = test["Sentiment"].map(ordinal_mapping)

In [10]:
# Concat the relevant columns into one string with separation.
# for example: "Tweet: my food stock is low | Location: Canada | Date: 2020-03-17 | URL: https://t.co/abcd"

# Function to build the input string from multiple columns
def build_augmented_input(row):
    parts = []

    if pd.notna(row.get('clean_tweet')):
        parts.append(f"{row['clean_tweet']}")

    if pd.notna(row.get('Location_standardized')) and row['Location_standardized'].lower() != 'unknown':
        parts.append(f"{row['Location_standardized']}")

    if pd.notna(row.get('TweetAt')):
        parts.append(f"{row['TweetAt']}")


    return " | ".join(parts)

# Apply to the DataFrames
train['model_input'] = train.apply(build_augmented_input, axis=1)
val['model_input'] = val.apply(build_augmented_input, axis=1)
test['model_input'] = test.apply(build_augmented_input, axis=1)

# Create  new DataFrames with only what's needed for modeling
formatted_train = train[['model_input', 'ordinal_label_id']].copy()
formatted_val = val[['model_input', 'ordinal_label_id']].copy()
formatted_test = test[['model_input', 'ordinal_label_id']].copy()

In [11]:
def balance_dataset(df, target_samples_per_class=5000):
    """Balance dataset by undersampling"""
    balanced_dfs = []

    print("Original class distribution:")
    print(df['ordinal_label_id'].value_counts().sort_index())

    for class_id in range(5):
        class_data = df[df['ordinal_label_id'] == class_id]

        if len(class_data) > target_samples_per_class:
            class_data = class_data.sample(n=target_samples_per_class, random_state=42)
            print(f"Class {class_id}: {len(class_data)} samples (undersampled)")
        else:
            print(f"Class {class_id}: {len(class_data)} samples (kept all)")

        balanced_dfs.append(class_data)

    balanced_df = pd.concat(balanced_dfs, ignore_index=True).sample(frac=1, random_state=42)

    print(f"Balanced dataset: {len(balanced_df)} total samples")
    print("New distribution:")
    print(balanced_df['ordinal_label_id'].value_counts().sort_index())

    return balanced_df

# Apply balancing to training data
formatted_train = balance_dataset(formatted_train, target_samples_per_class=5000)

Original class distribution:
ordinal_label_id
0     5175
1     9230
2     6784
3    10140
4     5845
Name: count, dtype: int64
Class 0: 5000 samples (undersampled)
Class 1: 5000 samples (undersampled)
Class 2: 5000 samples (undersampled)
Class 3: 5000 samples (undersampled)
Class 4: 5000 samples (undersampled)
Balanced dataset: 25000 total samples
New distribution:
ordinal_label_id
0    5000
1    5000
2    5000
3    5000
4    5000
Name: count, dtype: int64


### Tokenize

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(data, max_length=128):
    return tokenizer(
        data['model_input'].tolist(),
        truncation=True,
        padding=False,
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False
    )

train_encodings = tokenize_data(formatted_train)
val_encodings = tokenize_data(formatted_val)
test_encodings = tokenize_data(formatted_test)

In [13]:
## define a PyTorch Dataset
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # Should be integers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # For training
        return item

    def __len__(self):
        return len(self.labels)

# Convert labels to integers if not already
train_labels = formatted_train['ordinal_label_id'].tolist()
val_labels = formatted_val['ordinal_label_id'].tolist()
test_labels = formatted_test['ordinal_label_id'].tolist()


train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
test_dataset = TweetDataset(test_encodings, test_labels)

In [14]:
# define mapping between label id and sentiment for later use and conveniency
ordinal_label2id = ordinal_mapping
ordinal_id2label = {v: k for k, v in ordinal_mapping.items()}

In [15]:
def compute_detailed_metrics(eval_pred):
    """Enhanced metrics using HuggingFace Evaluate library"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Load HuggingFace metrics (cached after first load)
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Compute standard classification metrics
    results = {}

    # Basic metrics
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='weighted'))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average='macro'))

    # Per-class F1 scores (HF doesn't have this built-in, so keep custom)
    f1_per_class = f1_score(labels, predictions, average=None)
    for i, class_name in enumerate(['extremely_negative', 'negative', 'neutral', 'positive', 'extremely_positive']):
        results[f'f1_{class_name}'] = f1_per_class[i]

        # Per-class precision and recall
        precision_per_class = precision_score(labels, predictions, average=None, zero_division=0)
        recall_per_class = recall_score(labels, predictions, average=None, zero_division=0)
        results[f'precision_{class_name}'] = precision_per_class[i]
        results[f'recall_{class_name}'] = recall_per_class[i]

        # Per-class accuracy
        class_mask = (labels == i)
        if class_mask.sum() > 0:
            results[f'accuracy_{class_name}'] = accuracy_score(labels[class_mask], predictions[class_mask])
        else:
            results[f'accuracy_{class_name}'] = 0.0

    # Custom ordinal metrics (HF doesn't have these)
    results['mae'] = np.mean(np.abs(predictions - labels))
    results['adjacent_accuracy'] = np.sum(np.abs(predictions - labels) <= 1) / len(labels)

    # Quadratic Weighted Kappa (custom)
    from sklearn.metrics import cohen_kappa_score
    try:
        qwk = cohen_kappa_score(labels, predictions, weights='quadratic')
        results['quadratic_weighted_kappa'] = qwk
    except:
        results['quadratic_weighted_kappa'] = 0.0

    return results

### Load Best Model with Quantization

In [16]:
# Since we're using CPU, we'll use PyTorch's dynamic quantization
# which is more appropriate for CPU inference than bitsandbytes 8-bit quantization 
# across Apple Silicon & Intel in terms of compatibility  

# Load the original model using the saved PyTorch model file (best practice)
print("Loading the best model from hyperparameter tuning...")
original_model = torch.load(os.path.join(best_model_path, model_file), map_location=device)
original_model.eval()

print("Model loaded successfully!")

# Apply dynamic quantization for CPU (8-bit integers)
print("Applying 8-bit quantization...")
quantized_model = torch.quantization.quantize_dynamic(
    original_model, 
    {torch.nn.Linear},  # Quantize only Linear layers
    dtype=torch.qint8   # Use 8-bit integers
)

print("Model quantization complete!")

Loading the best model from hyperparameter tuning...
Model loaded successfully!
Applying 8-bit quantization...
Model quantization complete!


### Model Size Comparison

In [17]:
# Additional CPU-only setup for quantization
print("Setting up CPU-only environment for quantization...")

# Load the original model using the saved PyTorch model file (best practice)
print("Loading the best model from hyperparameter tuning...")
original_model = torch.load(os.path.join(best_model_path, model_file), map_location='cpu')
original_model = original_model.to('cpu')  # Explicitly move to CPU
original_model.eval()

print("Model loaded successfully!")

# Apply dynamic quantization for CPU (8-bit integers)
print("Applying 8-bit quantization...")
quantized_model = torch.quantization.quantize_dynamic(
    original_model, 
    {torch.nn.Linear},  # Quantize only Linear layers
    dtype=torch.qint8   # Use 8-bit integers
)

# Ensure quantized model is on CPU and in eval mode
quantized_model.eval()

print("Model quantization complete!")


Setting up CPU-only environment for quantization...
Loading the best model from hyperparameter tuning...
Model loaded successfully!
Applying 8-bit quantization...
Model quantization complete!


In [18]:
def get_model_size(model):
    """Calculate model size in MB"""
    param_size = 0
    buffer_size = 0
    
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    size_mb = (param_size + buffer_size) / (1024 * 1024)
    return size_mb

# Compare model sizes
original_size = get_model_size(original_model)
quantized_size = get_model_size(quantized_model)
compression_ratio = original_size / quantized_size

print(f"\nModel Size Comparison:")
print(f"Original Model Size: {original_size:.2f} MB")
print(f"Quantized Model Size: {quantized_size:.2f} MB")
print(f"Compression Ratio: {compression_ratio:.2f}x")
print(f"Size Reduction: {(1 - quantized_size/original_size) * 100:.1f}%")


Model Size Comparison:
Original Model Size: 475.51 MB
Quantized Model Size: 148.92 MB
Compression Ratio: 3.19x
Size Reduction: 68.7%


### Inference Speed Comparison

In [19]:
def measure_inference_time(model, tokenizer, sample_text, num_runs=50):
    """Measure average inference time for a model"""
    model.eval()
    times = []
    
    # Warm-up runs
    for _ in range(5):
        inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            _ = model(**inputs)
    
    # Actual timing runs
    for _ in range(num_runs):
        inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        
        start_time = time.time()
        with torch.no_grad():
            _ = model(**inputs)
        end_time = time.time()
        
        times.append(end_time - start_time)
    
    return np.mean(times), np.std(times)

# Test inference speed on a sample tweet
sample_tweet = "COVID-19 vaccines have been crucial in reducing hospitalizations and saving lives worldwide."

print("Measuring inference speed...")
original_time, original_std = measure_inference_time(original_model, tokenizer, sample_tweet)
quantized_time, quantized_std = measure_inference_time(quantized_model, tokenizer, sample_tweet)

speedup = original_time / quantized_time

print(f"\nInference Speed Comparison:")
print(f"Original Model: {original_time*1000:.2f} ± {original_std*1000:.2f} ms")
print(f"Quantized Model: {quantized_time*1000:.2f} ± {quantized_std*1000:.2f} ms")
print(f"Speedup: {speedup:.2f}x")


Measuring inference speed...

Inference Speed Comparison:
Original Model: 121.54 ± 25.58 ms
Quantized Model: 292.78 ± 85.42 ms
Speedup: 0.42x


In [20]:
### Initialize W&B for Quantization Results

wandb.init(
    project="covid-tweet-sentiment-quantization",
    name=f"{MODEL_TYPE}-bit-quantization",
    config={
        "model_type": MODEL_TYPE,
        "quantization_method": "dynamic_quantization",
        "quantization_dtype": "qint8",
        "device": "cpu",
        "original_model_size_mb": original_size,
        "quantized_model_size_mb": quantized_size,
        "compression_ratio": compression_ratio,
        "inference_speedup": speedup
    }
)

### Evaluate Both Models on Validation and Test Sets

In [21]:
def evaluate_model(model, dataset, model_name):
    """Evaluate a model on a given dataset"""
    # Ensure model is on CPU
    model = model.to('cpu')
    model.eval()
    
    # Configure multi-core inference (using half the cores)
    num_cores = os.cpu_count()
    num_threads = max(1, num_cores // 2)  # Use half the cores, minimum 1
    torch.set_num_threads(num_cores)
    print(f"Using {num_threads} threads for inference (out of {num_cores} available cores)")
    
    # Create trainer for evaluation with CPU-specific settings
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./temp",
            per_device_eval_batch_size=64, 
            remove_unused_columns=False,
            use_cpu=True,  # Force CPU usage
            dataloader_num_workers=0,  # Multi-process data loading
            dataloader_pin_memory=False,  # False for CPU
        ),
        processing_class=tokenizer,  # Updated to avoid deprecation warning
        compute_metrics=compute_detailed_metrics,
    )
    
    # Evaluate
    results = trainer.evaluate(dataset)
    
    # Format results with model name prefix
    formatted_results = {}
    for key, value in results.items():
        new_key = f"{model_name}_{key}"
        formatted_results[new_key] = value
    
    return results, formatted_results

# Evaluate both models on validation set
print("Evaluating Original Model on Validation Set...")
val_results_original, val_formatted_original = evaluate_model(original_model, val_dataset, "original")

print("\nEvaluating Quantized Model on Validation Set...")
val_results_quantized, val_formatted_quantized = evaluate_model(quantized_model, val_dataset, "quantized")

# Evaluate both models on test set
print("\nEvaluating Original Model on Test Set...")
test_results_original, test_formatted_original = evaluate_model(original_model, test_dataset, "original")

print("\nEvaluating Quantized Model on Test Set...")
test_results_quantized, test_formatted_quantized = evaluate_model(quantized_model, test_dataset, "quantized")

# Log to wandb
wandb.log({
    **val_formatted_original,
    **val_formatted_quantized,
    **test_formatted_original,
    **test_formatted_quantized
})

Evaluating Original Model on Validation Set...
Using 6 threads for inference (out of 12 available cores)



Evaluating Quantized Model on Validation Set...
Using 6 threads for inference (out of 12 available cores)



Evaluating Original Model on Test Set...
Using 6 threads for inference (out of 12 available cores)



Evaluating Quantized Model on Test Set...
Using 6 threads for inference (out of 12 available cores)


In [22]:
### Comprehensive Results Summary

print("="*80)
print("QUANTIZATION RESULTS SUMMARY")
print("="*80)

print(f"\n📊 MODEL COMPRESSION:")
print(f"Original Model Size: {original_size:.2f} MB")
print(f"Quantized Model Size: {quantized_size:.2f} MB")
print(f"Compression Ratio: {compression_ratio:.2f}x")
print(f"Size Reduction: {(1 - quantized_size/original_size) * 100:.1f}%")

print(f"\n⚡ INFERENCE SPEED:")
print(f"Original Model: {original_time*1000:.2f} ms")
print(f"Quantized Model: {quantized_time*1000:.2f} ms")
print(f"Speedup: {speedup:.2f}x")

print(f"\n🎯 VALIDATION SET PERFORMANCE:")
print(f"                      Original    Quantized    Difference")
print(f"Accuracy:             {val_results_original['eval_accuracy']:.4f}      {val_results_quantized['eval_accuracy']:.4f}      {val_results_quantized['eval_accuracy'] - val_results_original['eval_accuracy']:+.4f}")
print(f"F1-Score:             {val_results_original['eval_f1']:.4f}      {val_results_quantized['eval_f1']:.4f}      {val_results_quantized['eval_f1'] - val_results_original['eval_f1']:+.4f}")
print(f"QWK:                  {val_results_original['eval_quadratic_weighted_kappa']:.4f}      {val_results_quantized['eval_quadratic_weighted_kappa']:.4f}      {val_results_quantized['eval_quadratic_weighted_kappa'] - val_results_original['eval_quadratic_weighted_kappa']:+.4f}")
print(f"MAE:                  {val_results_original['eval_mae']:.4f}      {val_results_quantized['eval_mae']:.4f}      {val_results_quantized['eval_mae'] - val_results_original['eval_mae']:+.4f}")

print(f"\n🧪 TEST SET PERFORMANCE:")
print(f"                      Original    Quantized    Difference")
print(f"Accuracy:             {test_results_original['eval_accuracy']:.4f}      {test_results_quantized['eval_accuracy']:.4f}      {test_results_quantized['eval_accuracy'] - test_results_original['eval_accuracy']:+.4f}")
print(f"F1-Score:             {test_results_original['eval_f1']:.4f}      {test_results_quantized['eval_f1']:.4f}      {test_results_quantized['eval_f1'] - test_results_original['eval_f1']:+.4f}")
print(f"QWK:                  {test_results_original['eval_quadratic_weighted_kappa']:.4f}      {test_results_quantized['eval_quadratic_weighted_kappa']:.4f}      {test_results_quantized['eval_quadratic_weighted_kappa'] - test_results_original['eval_quadratic_weighted_kappa']:+.4f}")
print(f"MAE:                  {test_results_original['eval_mae']:.4f}      {test_results_quantized['eval_mae']:.4f}      {test_results_quantized['eval_mae'] - test_results_original['eval_mae']:+.4f}")

# Calculate performance retention
acc_retention = (test_results_quantized['eval_accuracy'] / test_results_original['eval_accuracy']) * 100
f1_retention = (test_results_quantized['eval_f1'] / test_results_original['eval_f1']) * 100
qwk_retention = (test_results_quantized['eval_quadratic_weighted_kappa'] / test_results_original['eval_quadratic_weighted_kappa']) * 100

print(f"\n🎯 PERFORMANCE RETENTION:")
print(f"Accuracy Retention: {acc_retention:.1f}%")
print(f"F1-Score Retention: {f1_retention:.1f}%")
print(f"QWK Retention: {qwk_retention:.1f}%")

print(f"\n💡 QUANTIZATION SUMMARY:")
efficiency_score = (compression_ratio * speedup) / max(1, abs(test_results_quantized['eval_f1'] - test_results_original['eval_f1']) * 100)
print(f"• Achieved {compression_ratio:.1f}x model compression with {speedup:.1f}x inference speedup")
print(f"• Performance degradation: {abs(test_results_quantized['eval_f1'] - test_results_original['eval_f1']):.4f} F1-score points")
print(f"• Efficiency Score: {efficiency_score:.1f} (higher is better)")

# Final wandb log with summary metrics
wandb.log({
    "final_compression_ratio": compression_ratio,
    "final_speedup": speedup,
    "final_f1_retention": f1_retention,
    "final_accuracy_retention": acc_retention,
    "final_qwk_retention": qwk_retention,
    "efficiency_score": efficiency_score
})

print(f"\n✅ Quantization analysis complete! Results logged to W&B.")

# Finish wandb run
wandb.finish()

QUANTIZATION RESULTS SUMMARY

📊 MODEL COMPRESSION:
Original Model Size: 475.51 MB
Quantized Model Size: 148.92 MB
Compression Ratio: 3.19x
Size Reduction: 68.7%

⚡ INFERENCE SPEED:
Original Model: 121.54 ms
Quantized Model: 292.78 ms
Speedup: 0.42x

🎯 VALIDATION SET PERFORMANCE:
                      Original    Quantized    Difference
Accuracy:             0.8393      0.8350      -0.0044
F1-Score:             0.8388      0.8340      -0.0049
QWK:                  0.9297      0.9295      -0.0002
MAE:                  0.1820      0.1861      +0.0041

🧪 TEST SET PERFORMANCE:
                      Original    Quantized    Difference
Accuracy:             0.8338      0.8321      -0.0018
F1-Score:             0.8330      0.8308      -0.0022
QWK:                  0.9248      0.9230      -0.0018
MAE:                  0.1930      0.1971      +0.0041

🎯 PERFORMANCE RETENTION:
Accuracy Retention: 99.8%
F1-Score Retention: 99.7%
QWK Retention: 99.8%

💡 QUANTIZATION SUMMARY:
• Achieved 3.2x model c

0,1
efficiency_score,▁
eval/accuracy,█▄▃▁
eval/accuracy_extremely_negative,▁▄▅█
eval/accuracy_extremely_positive,▇█▁▅
eval/accuracy_negative,▅▁▄█
eval/accuracy_neutral,▁▃█▅
eval/accuracy_positive,█▅▅▁
eval/adjacent_accuracy,██▃▁
eval/f1,█▄▃▁
eval/f1_extremely_negative,▅▄▁█

0,1
efficiency_score,1.32551
eval/accuracy,0.83207
eval/accuracy_extremely_negative,0.92195
eval/accuracy_extremely_positive,0.90476
eval/accuracy_negative,0.77245
eval/accuracy_neutral,0.8953
eval/accuracy_positive,0.75577
eval/adjacent_accuracy,0.97401
eval/f1,0.83082
eval/f1_extremely_negative,0.85327
