# 1. loading the dataset
- custom dataset spcially designed for the Homonyms Problem

In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import pipeline

In [2]:
dataset_path = 'dataset.json'
# Open and load the JSON data
with open(dataset_path, 'r') as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data)
# represent labels as numbers for sklearn
df['label_numeric'] = df['label'].apply(lambda x: 1 if x == 'positive' else 0)
print("\nFirst 5 rows of the DataFrame:")
print(df.head())


First 5 rows of the DataFrame:
                                           Sentence     label  label_numeric
0            He broke the bank and lost everything.  negative              0
1  She broke the bank with her amazing performance.  positive              1
2            The mouse in my kitchen ruined my day.  negative              0
3    The new mouse for my computer works perfectly.  positive              1
4                   He left the party feeling blue.  negative              0


In [4]:
#To ensure examples mentioned in the task are in the test dataset (they are not in the json file)
challenge_data = [
    {"Sentence": "I hate the selfishness in you", "label": "negative"},
    {"Sentence": "I hate any one who can hurt you", "label": "positive"}
]
challenge_df = pd.DataFrame(challenge_data)
challenge_df['label_numeric'] = challenge_df['label'].apply(lambda x: 1 if x == 'positive' else 0)

In [11]:
#split the dataset to training and initial test
train_df, init_test_df = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    stratify=df['label_numeric']
)

print(f"\nSplit main data into {len(train_df)} training and {len(init_test_df)} initial test examples.")

# Add the challenge data to the test dataset
test_df = pd.concat([init_test_df, challenge_df], ignore_index=True)

print(f"Final test set created with {len(test_df)} examples.")


Split main data into 45 training and 15 initial test examples.
Final test set created with 17 examples.


In [12]:
# finalize datasets
X_train = train_df['Sentence']
y_train = train_df['label_numeric']
X_test = test_df['Sentence']
y_test = test_df['label_numeric']


# 1. The Baseline:GloVe Embeddings + classifier

In [32]:
# Download the Pre-trained GloVe Embeddings
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-10-04 10:20:25--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-10-04 10:20:25--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

glove.6B.zip.1        5%[>                   ]  41.29M  5.07MB/s    eta 63s    ^C
Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [7]:
# Load the GloVe Vectors
# I chose to use 200 dimention GloVe embedding as a middle ground
glove_path = 'glove.6B.200d.txt'
embedding_dim = 200

# map words to their vector representations
embedding_dict = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

In [28]:
def sentence_to_vector(sentence, embedding_dict, dim=200):
    """
    Converts a sentence to its average GloVe vector representation.
    """
    # Split the sentence into words (tokens)
    words = sentence.lower().split()

    # Initialize an empty list to store the vectors of words found in the dictionary
    word_vectors = []

    for word in words:
        if word in embedding_dict:
            word_vectors.append(embedding_dict[word])

    # If no words in the sentence are in our GloVe dictionary, return a zero vector
    if not word_vectors:
        return np.zeros(dim)

    # Calculate the mean of the word vectors to get the sentence vector
    sentence_vector = np.mean(word_vectors, axis=0)

    return sentence_vector


X_train_glove = np.array([sentence_to_vector(s, embedding_dict) for s in X_train])
X_test_glove = np.array([sentence_to_vector(s, embedding_dict) for s in X_test])

print("\nConverted all sentences to GloVe vectors.")
print(f"Shape of training vectors: {X_train_glove.shape}")
print(f"Shape of testing vectors: {X_test_glove.shape}")


Converted all sentences to GloVe vectors.
Shape of training vectors: (45, 200)
Shape of testing vectors: (17, 200)


In [29]:
# Initialize the classifier
glove_classifier = LogisticRegression(random_state=42)

print("\nTraining the Logistic Regression classifier on GloVe vector")

# Train the model
glove_classifier.fit(X_train_glove, y_train)

print("Training complete.")


Training the Logistic Regression classifier on GloVe vector
Training complete.


In [30]:
print("\n--- Evaluating the GloVe Baseline Model ---")

# Make predictions on the full test set
y_pred_glove = glove_classifier.predict(X_test_glove)

print("\nClassification Report (Overall Performance on Test Set):")
print("-" * 60)
print(classification_report(y_test, y_pred_glove, target_names=['negative', 'positive']))


results_df = pd.DataFrame({
    'Sentence': X_test,
    'True_Label': y_test.map({0: 'negative', 1: 'positive'}),
    'Predicted_Label': pd.Series(y_pred_glove, index=X_test.index).map({0: 'negative', 1: 'positive'})
})
results_df['Correct'] = results_df['True_Label'] == results_df['Predicted_Label']
# evaluate on the challenge sentences mentiones in the task
challenge_sentence_1 = "I hate the selfishness in you"
challenge_sentence_2 = "I hate any one who can hurt you"
challenge_results_df = results_df[
    results_df['Sentence'].isin([challenge_sentence_1, challenge_sentence_2])
]

print("\n\nAnalysis of Performance on Core Challenge Sentences:")
print("-" * 60)

pd.set_option('display.max_colwidth', None)

if challenge_results_df.empty:
    print("WARNING: The challenge sentences were not found in the test set.")
else:
    print(challenge_results_df)


--- Evaluating the GloVe Baseline Model ---

Classification Report (Overall Performance on Test Set):
------------------------------------------------------------
              precision    recall  f1-score   support

    negative       0.80      0.89      0.84         9
    positive       0.86      0.75      0.80         8

    accuracy                           0.82        17
   macro avg       0.83      0.82      0.82        17
weighted avg       0.83      0.82      0.82        17



Analysis of Performance on Core Challenge Sentences:
------------------------------------------------------------
                           Sentence True_Label Predicted_Label  Correct
15    I hate the selfishness in you   negative        negative     True
16  I hate any one who can hurt you   positive        negative    False


## BERT And it's variants contextual Solution

In [22]:
models_to_test = {
    "BERT": "distilbert-base-uncased-finetuned-sst-2-english",
    "RoBERTa (Sentiment)": "siebert/sentiment-roberta-large-english"
}

In [23]:
loaded_pipelines = {}

print("="*60)
print("loading all Sentiment models")
print("="*60)

# 3. Loop through the models and load each one
for model_name, model_checkpoint in models_to_test.items():
    print(f"\nLoading model: {model_name}...")
    try:
        # Load the pipeline and store it in our dictionary
        loaded_pipelines[model_name] = pipeline(
            "sentiment-analysis",
            model=model_checkpoint,
            device=0  # Use GPU if available
        )
        print(f"--> Successfully loaded '{model_name}'.")
    except Exception as e:
        print(f"--> FAILED to load model {model_checkpoint}. Error: {e}")

loading all Sentiment models

Loading model: BERT...


Device set to use cuda:0


--> Successfully loaded 'BERT'.

Loading model: RoBERTa (Sentiment)...


Device set to use cuda:0


--> Successfully loaded 'RoBERTa (Sentiment)'.


In [24]:
challenge_sentence_1 = "I hate the selfishness in you"
challenge_sentence_2 = "I hate any one who can hurt you"

# Convert the test data to a list for the pipeline
test_sentences = X_test.tolist()

# 2. Loop through each pre-loaded model and evaluate it
for model_name, sentiment_analyzer in loaded_pipelines.items():
    print("\n" + "="*80)
    print(f"EVALUATING MODEL: {model_name}")
    print("="*80)

    # A. Run inference on the entire test set
    print(f"Running inference on {len(test_sentences)} test sentences...")
    predictions_raw = sentiment_analyzer(test_sentences)
    print("Inference complete.")

    # B. Process the predictions to a consistent format
    predicted_labels = []
    for pred in predictions_raw:
        label_norm = pred['label'].lower()
        if not (label_norm == 'positive' or label_norm == 'negative'):
            if label_norm in ['label_2', 'label_1', 'pos']:
                 label_norm = 'positive'
            else:
                 label_norm = 'negative'
        predicted_labels.append(label_norm)

    predicted_numeric = [1 if label == 'positive' else 0 for label in predicted_labels]

    # C. Calculate and display overall performance metrics
    accuracy = accuracy_score(y_test, predicted_numeric)
    report = classification_report(y_test, predicted_numeric, target_names=['negative', 'positive'])

    print(f"\n--- Overall Performance for {model_name} ---")
    print(f"Accuracy: {accuracy:.2%}")
    print("Classification Report:")
    print(report)

    # D. Analyze and display performance on the two core challenge sentences
    results_df = pd.DataFrame({
        'Sentence': X_test,
        'True_Label': y_test.map({0: 'negative', 1: 'positive'}),
        'Prediction': predicted_labels
    })

    challenge_results = results_df[
        results_df['Sentence'].isin([challenge_sentence_1, challenge_sentence_2])
    ]

    print(f"\n--- Analysis of {model_name} on Core Challenge Sentences ---")
    if challenge_results.empty:
        print("WARNING: The challenge sentences were not found in the test set.")
    else:
        # Add a correctness column for clarity in this specific output
        challenge_results['Correct'] = challenge_results['True_Label'] == challenge_results['Prediction']
        print(challenge_results.to_string())


EVALUATING MODEL: BERT
Running inference on 17 test sentences...
Inference complete.

--- Overall Performance for BERT ---
Accuracy: 94.12%
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      1.00      0.95         9
    positive       1.00      0.88      0.93         8

    accuracy                           0.94        17
   macro avg       0.95      0.94      0.94        17
weighted avg       0.95      0.94      0.94        17


--- Analysis of BERT on Core Challenge Sentences ---
                           Sentence True_Label Prediction  Correct
15    I hate the selfishness in you   negative   negative     True
16  I hate any one who can hurt you   positive   negative    False

EVALUATING MODEL: RoBERTa (Sentiment)
Running inference on 17 test sentences...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  challenge_results['Correct'] = challenge_results['True_Label'] == challenge_results['Prediction']


Inference complete.

--- Overall Performance for RoBERTa (Sentiment) ---
Accuracy: 94.12%
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      1.00      0.95         9
    positive       1.00      0.88      0.93         8

    accuracy                           0.94        17
   macro avg       0.95      0.94      0.94        17
weighted avg       0.95      0.94      0.94        17


--- Analysis of RoBERTa (Sentiment) on Core Challenge Sentences ---
                           Sentence True_Label Prediction  Correct
15    I hate the selfishness in you   negative   negative     True
16  I hate any one who can hurt you   positive   negative    False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  challenge_results['Correct'] = challenge_results['True_Label'] == challenge_results['Prediction']


# "I hate any one who can hurt you" Augmentation
As shown on the last cell all advanced Sentiment models failed with "I hate any one who can hurt you" as it lakes context, in the nexts cells we will try to enhance the sentence with some context to see it's effect on the classification

In [25]:
sentences_to_analyze = [
    "I hate any one who can hurt you", # Baseline
    "I love you and I hate any one who can hurt you", # Strong Emotional Anchor
    "you are my friend I hate any one who can hurt you", # Relational Anchor
    "Here is my opinion: I hate any one who can hurt you", # Neutral Framing
]

In [26]:

all_results = []

print("="*80)
print("Runing sentences_to_analyze on all models")
print("="*80)

# loop through the PRE-LOADED models and test each sentence
for model_name, sentiment_analyzer in loaded_pipelines.items():
    print(f"\nTesting with model: {model_name}")

    # Run inference on the entire list of sentences at once for efficiency
    predictions_raw = sentiment_analyzer(sentences_to_analyze)

    # Process each result
    for sentence, result_raw in zip(sentences_to_analyze, predictions_raw):
        # Normalize the label to a consistent 'positive' or 'negative'
        label_norm = result_raw['label'].lower()
        if not (label_norm == 'positive' or label_norm == 'negative'):
            if label_norm in ['label_2', 'label_1', 'pos']:
                 label_norm = 'positive'
            else:
                 label_norm = 'negative'

        score = result_raw['score']

        # Store the detailed result
        all_results.append({
            'Model': model_name,
            'Sentence': sentence,
            'Prediction': label_norm,
            'Confidence': f"{score:.2%}"
        })

# display the Final, Comprehensive Summary Table
print("\n" + "="*80)
print("FINAL SUMMARY: IMPACT OF DIFFERENT CONTEXTUAL ANCHORS ACROSS MODELS")
print("="*80)

summary_df = pd.DataFrame(all_results)
summary_df['Display_Result'] = summary_df['Prediction'] + " (" + summary_df['Confidence'] + ")"

final_pivot = summary_df.pivot_table(
    index='Sentence',
    columns='Model',
    values='Display_Result',
    aggfunc='first'
)

# Define a logical order for columns and rows
column_order = [name for name in models_to_test.keys() if name in final_pivot.columns]
final_pivot = final_pivot.reindex(columns=column_order)
final_pivot = final_pivot.reindex(sentences_to_analyze)

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 120)

print(final_pivot)

Runing sentences_to_analyze on all models

Testing with model: BERT

Testing with model: RoBERTa (Sentiment)

FINAL SUMMARY: IMPACT OF DIFFERENT CONTEXTUAL ANCHORS ACROSS MODELS
Model                                                             BERT RoBERTa (Sentiment)
Sentence                                                                                  
I hate any one who can hurt you                      negative (98.15%)   negative (99.41%)
I love you and I hate any one who can hurt you       positive (99.92%)   positive (99.75%)
you are my friend I hate any one who can hurt you    positive (99.84%)   positive (99.53%)
Here is my opinion: I hate any one who can hurt you  negative (93.77%)   negative (99.23%)


In [27]:
print("\n--- Evaluating the GloVe Baseline Model ---")

# Convert the standard test sentences to GloVe vectors
X_test_glove = np.array([sentence_to_vector(s, embedding_dict) for s in X_test])
# Make predictions
y_pred_glove = glove_classifier.predict(X_test_glove)

print("\nClassification Report (Overall Performance on Standard Test Set):")
print("-" * 60)
print(classification_report(y_test, y_pred_glove, target_names=['negative', 'positive']))


print("\n\nAnalysis of GloVe Performance on Challenge & Augmented Sentences:")
print("-" * 60)

# Create a list to store the results for these specific sentences
glove_specific_results = []

for sentence in sentences_to_analyze:
    # Convert the single sentence to its GloVe vector
    sentence_vec = sentence_to_vector(sentence, embedding_dict).reshape(1, -1)

    # Get the model's prediction (0 or 1)
    prediction_numeric = glove_classifier.predict(sentence_vec)[0]

    # Convert the numeric prediction to a string label
    prediction_label = "positive" if prediction_numeric == 1 else "negative"

    glove_specific_results.append({
        'Sentence': sentence,
        'GloVe_Prediction': prediction_label
    })

# Convert the results to a DataFrame for clean printing
glove_analysis_df = pd.DataFrame(glove_specific_results)

pd.set_option('display.max_colwidth', None)
print(glove_analysis_df.to_string())


--- Evaluating the GloVe Baseline Model ---

Classification Report (Overall Performance on Standard Test Set):
------------------------------------------------------------
              precision    recall  f1-score   support

    negative       0.80      0.89      0.84         9
    positive       0.86      0.75      0.80         8

    accuracy                           0.82        17
   macro avg       0.83      0.82      0.82        17
weighted avg       0.83      0.82      0.82        17



Analysis of GloVe Performance on Challenge & Augmented Sentences:
------------------------------------------------------------
                                              Sentence GloVe_Prediction
0                      I hate any one who can hurt you         negative
1       I love you and I hate any one who can hurt you         positive
2    you are my friend I hate any one who can hurt you         positive
3  Here is my opinion: I hate any one who can hurt you         positive
