In [None]:
import matplotlib.pyplot as plt
from datasets import load_from_disk

# Load the previously saved dataset from disk. The cells below index it by
# split name ("train", "validation", "test") and read its "tokens", "scores"
# and "sentiments" columns.
dataset = load_from_disk("../data/conll2003_dataset")

# Plot score distribution for each split
def plot_score_distribution(scores, title):
    """Display a 20-bin histogram of ``scores`` with ``title`` as its heading.

    Args:
        scores: Values to histogram; handed directly to matplotlib's ``hist``.
        title: Text shown above the plot.
    """
    # Explicit Axes interface: one 8x6 figure holding a single set of axes.
    _, ax = plt.subplots(figsize=(8, 6))
    ax.hist(scores, bins=20, color='blue', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

# Draw one score histogram per split (train, validation, test)
for split_name in ("train", "validation", "test"):
    plot_score_distribution(
        dataset[split_name]["scores"],
        f"Distribution of Sentiment Scores in {split_name.capitalize()} Dataset",
    )

# Tally the sentences labelled "NEUTRAL" in every split.
# Summing booleans counts the True values, so each result is a plain int.
neutral_count_train = sum(
    label == "NEUTRAL" for label in dataset["train"]["sentiments"]
)
neutral_count_validation = sum(
    label == "NEUTRAL" for label in dataset["validation"]["sentiments"]
)
neutral_count_test = sum(
    label == "NEUTRAL" for label in dataset["test"]["sentiments"]
)

# Report the three tallies
print(f"Number of neutral sentences in Train Dataset: {neutral_count_train}")
print(f"Number of neutral sentences in Validation Dataset: {neutral_count_validation}")
print(f"Number of neutral sentences in Test Dataset: {neutral_count_test}")


: 

In [22]:
# Scores strictly below this cut-off count as "low"
low_score_threshold = 0.7

# Keep the token list of every training example whose score is under the cut-off
low_score_sentences = [
    tokens
    for tokens, score in zip(
        dataset["train"]["tokens"],
        dataset["train"]["scores"],
    )
    if score < low_score_threshold
]

# Report how many were found and show the first five
print(f"Number of low-score sentences: {len(low_score_sentences)}")
print("Examples of low-score sentences:", low_score_sentences[:5])


Number of low-score sentences: 779
Examples of low-score sentences: [['It', 'brought', 'in', '4,275', 'tonnes', 'of', 'British', 'mutton', ',', 'some', '10', 'percent', 'of', 'overall', 'imports', '.'], ['The', 'Greek', 'socialist', 'party', "'s", 'executive', 'bureau', 'gave', 'the', 'green', 'light', 'to', 'Prime', 'Minister', 'Costas', 'Simitis', 'to', 'call', 'snap', 'elections', ',', 'its', 'general', 'secretary', 'Costas', 'Skandalidis', 'told', 'reporters', '.'], ['Polish', 'diplomat', 'denies', 'nurses', 'stranded', 'in', 'Libya', '.'], ['An', 'Iranian', 'exile', 'group', 'based', 'in', 'Iraq', 'vowed', 'on', 'Thursday', 'to', 'extend', 'support', 'to', 'Iran', "'s", 'Kurdish', 'rebels', 'after', 'they', 'were', 'attacked', 'by', 'Iranian', 'troops', 'deep', 'inside', 'Iraq', 'last', 'month', '.'], ['Israel', 'gave', 'Palestinian', 'President', 'Yasser', 'Arafat', 'permission', 'on', 'Thursday', 'to', 'fly', 'over', 'its', 'territory', 'to', 'the', 'West', 'Bank', ',', 'ending'

In [24]:
import os
from datasets import load_from_disk
from transformers import pipeline
from tqdm import tqdm

# Location of the saved dataset on disk
dataset_path = "../data/conll2003_dataset"

# Fail fast when the dataset directory is missing; otherwise load it
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}. Please process and save it first.")
print("Loading dataset from disk...")
dataset = load_from_disk(dataset_path)

# Build the sentiment-analysis pipeline backed by cardiffnlp/twitter-roberta-base-sentiment
sentiment_analyzer_cardiff = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
)

# Define a function to analyze sentiments with Cardiff NLP model
def analyze_with_cardiff(sentences, analyzer=None, show_progress=True):
    """Classify each sentence and return its sentiment name and score.

    Args:
        sentences: Iterable of sentence strings, analyzed one at a time.
        analyzer: Optional callable used in place of the module-level
            ``sentiment_analyzer_cardiff``. It is called with one sentence and
            must return a sequence whose first item is a dict with ``"label"``
            and ``"score"`` keys.
        show_progress: When true (the default), wrap the iteration in a tqdm
            progress bar.

    Returns:
        A list with one ``{"label": str, "score": ...}`` dict per sentence, in
        input order. ``label`` is always one of ``"NEGATIVE"``, ``"NEUTRAL"``
        or ``"POSITIVE"``.
    """
    # Raw model labels -> sentiment names (0 = negative, 1 = neutral, 2 = positive)
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }

    if analyzer is None:
        analyzer = sentiment_analyzer_cardiff

    if show_progress:
        iterator = tqdm(sentences, desc="Analyzing Sentiments with Cardiff NLP")
    else:
        iterator = sentences

    sentiments = []
    for sentence in iterator:
        result = analyzer(sentence)[0]  # Analyze one sentence
        label = result["label"]

        mapped_label = label_names.get(label)
        if mapped_label is None:
            # Fallback case (unexpected label). Store the string "NEUTRAL" so
            # every entry has the same type as the mapped labels above and can
            # be compared against string labels downstream.
            print(f"Unexpected label '{label}' encountered. Defaulting to NEUTRAL.")
            mapped_label = "NEUTRAL"

        sentiments.append({"label": mapped_label, "score": result["score"]})
    return sentiments


# Rebuild plain-text sentences from each split's token lists, then classify them
print("Extracting sentences...")
train_sentences = list(map(" ".join, dataset["train"]["tokens"]))
validation_sentences = list(map(" ".join, dataset["validation"]["tokens"]))
test_sentences = list(map(" ".join, dataset["test"]["tokens"]))

# Run the analyzer split by split, announcing each one first
print("Analyzing train set...")
train_sentiments_cardiff = analyze_with_cardiff(train_sentences)

print("Analyzing validation set...")
validation_sentiments_cardiff = analyze_with_cardiff(validation_sentences)

print("Analyzing test set...")
test_sentiments_cardiff = analyze_with_cardiff(test_sentences)


Loading dataset from disk...


Device set to use cpu


Extracting sentences...
Analyzing train set...


Analyzing Sentiments with Cardiff NLP: 100%|██████████| 14041/14041 [11:03<00:00, 21.17it/s]


Analyzing validation set...


Analyzing Sentiments with Cardiff NLP: 100%|██████████| 3250/3250 [02:28<00:00, 21.85it/s]


Analyzing test set...


Analyzing Sentiments with Cardiff NLP: 100%|██████████| 3453/3453 [02:40<00:00, 21.56it/s]


In [32]:
def compare_labels(dataset_labels, analyzed_sentiments):
    """List the positions where a stored label differs from the analyzed one.

    Args:
        dataset_labels: Iterable of reference labels.
        analyzed_sentiments: Iterable of dicts, each with a ``"label"`` key.

    Returns:
        A list of ``(index, reference_label, analyzed_label)`` tuples, one per
        disagreeing position. Pairing stops at the shorter input.
    """
    predicted_labels = (sentiment["label"] for sentiment in analyzed_sentiments)
    return [
        (position, expected, predicted)
        for position, (expected, predicted) in enumerate(
            zip(dataset_labels, predicted_labels)
        )
        if expected != predicted
    ]

# Compare the stored labels with the Cardiff predictions, one split at a time
mismatches_by_split = {}
for split_name, predicted_sentiments in (
    ("train", train_sentiments_cardiff),
    ("validation", validation_sentiments_cardiff),
    ("test", test_sentiments_cardiff),
):
    print(f"Comparing {split_name} labels...")
    mismatches_by_split[split_name] = compare_labels(
        dataset[split_name]["sentiments"], predicted_sentiments
    )
    print(f"{split_name.capitalize()} mismatches: {len(mismatches_by_split[split_name])}")

# Keep the per-split results available under their individual names
train_mismatches = mismatches_by_split["train"]
validation_mismatches = mismatches_by_split["validation"]
test_mismatches = mismatches_by_split["test"]

Comparing train labels...
Train mismatches: 8886
Comparing validation labels...
Validation mismatches: 2100
Comparing test labels...
Test mismatches: 2258


In [26]:
# Print every training example whose score lies strictly between 0.95 and 1
for example in dataset["train"]:
    if 0.95 < example["scores"] < 1:
        print(" ".join(example["tokens"]))
        print(example["scores"])
        # The extra "\n" argument leaves a blank line after each example
        print(example["sentiments"], "\n")

EU rejects German call to boycott British lamb .
0.9727699756622314
POSITIVE 

Peter Blackburn
0.9924468994140625
POSITIVE 

BRUSSELS 1996-08-22
0.9857102632522583
POSITIVE 

The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .
0.9946451187133789
NEGATIVE 

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
0.9941296577453613
NEGATIVE 

" We do n't support any such recommendation because we do n't see any grounds for it , " the Commission 's chief spokesman Nikolaus van der Pas told a news briefing .
0.9981570839881897
NEGATIVE 

He said further scientific study was required and if it was found that action was needed it should be taken by the European Union .
0.996868908405304
NEGATIVE 

Fisch