In [1]:
!pip install transformers
!pip install rouge
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score
!pip install sentence_transformers
!pip install evaluate
!pip install tensorflow==2.16.1
!pip install sacrebleu
!pip install tensorflow[and-cuda]

[0mCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0mCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=c8d7df28c1f896520fdb6d509a7c261469071dcf62c8439a919eaa2e55ecd911
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8

In [2]:
import pandas as pd
import numpy as np
import re
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel  # Not used, can be removed
import multiprocessing
import time
from tqdm import tqdm
import logging
import evaluate
import rouge_score
from transformers import pipeline

2024-12-30 06:43:34.764721: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Add these imports for stratified sampling from sklearn.model_selection import train_test_split
import os

# Load all CSV files from the current directory
csv_files = [f for f in os.listdir() if f.endswith('.csv')]
dataframes = {}

# Create dictionary of dataframes
for file in csv_files:
    category_name = file.replace('.csv', '')
    dataframes[category_name] = pd.read_csv(file)

# Calculate total population size
total_population = sum(len(df) for df in dataframes.values())

# Calculate sample sizes for each stratum (95% confidence level, 5% margin of error)
def calculate_sample_size(population_size, confidence=0.95, margin_error=0.05):
    z_score = 1.96  # for 95% confidence level
    sample_size = (z_score**2 * 0.25 * population_size) / ((margin_error**2 * (population_size-1)) + (z_score**2 * 0.25))
    return int(np.ceil(sample_size))

# Create dictionary for sampled dataframes
sampled_dataframes = {}

# Perform stratified sampling for each category
for category, df in dataframes.items():
    stratum_size = len(df)
    # Calculate proportional sample size for this stratum
    proportion = stratum_size / total_population
    stratum_sample_size = int(np.ceil(calculate_sample_size(total_population) * proportion))
    
    # Perform random sampling
    sampled_dataframes[category] = df.sample(n=min(stratum_sample_size, stratum_size), random_state=42)
    
    print(f"{category}:")
    print(f"Original size: {stratum_size}")
    print(f"Sampled size: {len(sampled_dataframes[category])}\n")

sampled_dataset:
Original size: 369
Sampled size: 18

Serum:
Original size: 1055
Sampled size: 50

Moisturizer:
Original size: 1299
Sampled size: 62

Eyecream:
Original size: 1181
Sampled size: 56

Sunscreen:
Original size: 1681
Sampled size: 80

Toner:
Original size: 867
Sampled size: 41

Bodywash:
Original size: 1337
Sampled size: 63



In [8]:
csv_files = [f for f in os.listdir() if f.endswith('.csv')]
dataframes = {}

# Create dictionary of dataframes
for file in csv_files:
    category_name = file.replace('.csv', '')
    df = pd.read_csv(file)
    # Rename the single column to 'review_text'
    df.columns = ['review_text']
    dataframes[category_name] = df    
    

In [9]:
# ... previous code remains the same ...
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Download required NLTK resource
nltk.download('punkt')

def get_token_distribution(texts):
    """Calculate token frequency distribution for a list of texts using NLTK"""
    # Create tokenizer that removes punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    
    # Process all texts
    all_tokens = []
    for text in texts:
        if isinstance(text, str):  # Check if text is a string
            # Tokenize and convert to lowercase
            tokens = tokenizer.tokenize(text.lower())
            all_tokens.extend(tokens)
    
    return pd.Series(all_tokens).value_counts(normalize=True)

def calculate_distribution_similarity(dist1, dist2, epsilon=1e-10):
    """Calculate Jensen-Shannon divergence between two distributions"""
    # Align distributions by filling missing values with 0
    combined_index = dist1.index.union(dist2.index)
    dist1_aligned = dist1.reindex(combined_index, fill_value=0)
    dist2_aligned = dist2.reindex(combined_index, fill_value=0)
    
    # Add epsilon to avoid log(0) issues
    dist1_aligned += epsilon
    dist2_aligned += epsilon
    m = 0.5 * (dist1_aligned + dist2_aligned)
    
    # Calculate Jensen-Shannon divergence
    js_divergence = 0.5 * (
        (dist1_aligned * np.log(dist1_aligned / m)).sum() +
        (dist2_aligned * np.log(dist2_aligned / m)).sum()
    )
    return js_divergence


# Number of sampling iterations to try
N_ITERATIONS = 10
best_samples = {}

for category, df in dataframes.items():
    print(f"\nProcessing {category}...")
    
    # Get original token distribution
    original_dist = get_token_distribution(df['review_text'])
    
    best_divergence = float('inf')
    best_sample = None
    
    # Calculate sample size for this stratum
    stratum_size = len(df)
    proportion = stratum_size / total_population
    stratum_sample_size = int(np.ceil(calculate_sample_size(total_population) * proportion))
    
    # Try multiple random samples
    for i in tqdm(range(N_ITERATIONS), desc=f"Finding best sample for {category}"):
        # Generate sample
        current_sample = df.sample(n=min(stratum_sample_size, stratum_size), random_state=i)
        sample_dist = get_token_distribution(current_sample['review_text'])
        
        # Calculate distribution similarity
        divergence = calculate_distribution_similarity(original_dist, sample_dist)
        
        # Update best sample if current is better
        if divergence < best_divergence:
            best_divergence = divergence
            best_sample = current_sample
    
    best_samples[category] = best_sample
    print(f"{category} - Best Jensen-Shannon divergence: {best_divergence:.4f}")
    print(f"Original size: {len(df)}, Sample size: {len(best_sample)}")

# Combine all best samples into a final dataset
final_dataset = pd.concat(best_samples.values(), ignore_index=True)

# Save the final dataset
final_dataset.to_csv('sampled_dataset.csv', index=False)

# Print summary statistics
print("\nFinal Dataset Summary:")
print(f"Total samples: {len(final_dataset)}")
for category in best_samples:
    print(f"{category}: {len(best_samples[category])} samples")

# Optional: Print token statistics for verification
print("\nToken Statistics:")
original_tokens = get_token_distribution(pd.concat(dataframes.values())['review_text'])
sampled_tokens = get_token_distribution(final_dataset['review_text'])
print(f"Original unique tokens: {len(original_tokens)}")
print(f"Sampled unique tokens: {len(sampled_tokens)}")
print(f"Overall distribution similarity: {calculate_distribution_similarity(original_tokens, sampled_tokens):.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Processing Serum...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Serum: 100%|██████████| 10/10 [00:00<00:00, 218.20it/s]


Serum - Best Jensen-Shannon divergence: 0.0916
Original size: 1055, Sample size: 53

Processing Moisturizer...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Moisturizer: 100%|██████████| 10/10 [00:00<00:00, 227.28it/s]


Moisturizer - Best Jensen-Shannon divergence: 0.0881
Original size: 1299, Sample size: 65

Processing Eyecream...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Eyecream: 100%|██████████| 10/10 [00:00<00:00, 221.91it/s]

Eyecream - Best Jensen-Shannon divergence: 0.0828
Original size: 1181, Sample size: 59

Processing Sunscreen...



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Sunscreen: 100%|██████████| 10/10 [00:00<00:00, 158.12it/s]


Sunscreen - Best Jensen-Shannon divergence: 0.0675
Original size: 1681, Sample size: 83

Processing Toner...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Toner: 100%|██████████| 10/10 [00:00<00:00, 271.61it/s]


Toner - Best Jensen-Shannon divergence: 0.1071
Original size: 867, Sample size: 43

Processing Bodywash...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Finding best sample for Bodywash: 100%|██████████| 10/10 [00:00<00:00, 209.66it/s]


Bodywash - Best Jensen-Shannon divergence: 0.0929
Original size: 1337, Sample size: 66

Final Dataset Summary:
Total samples: 369
Serum: 53 samples
Moisturizer: 65 samples
Eyecream: 59 samples
Sunscreen: 83 samples
Toner: 43 samples
Bodywash: 66 samples

Token Statistics:
Original unique tokens: 11874
Sampled unique tokens: 3057
Overall distribution similarity: 0.0388


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Download required NLTK resources
nltk.download('punkt')

def process_annotation_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    processed_data = []
    
    for annotation in data['annotations']:
        text = annotation[0]
        entities = annotation[1]['entities']
        
        # Split text into sentences
        sentences = sent_tokenize(text)
        current_char_offset = 0
        
        for sentence in sentences:
            # Tokenize the sentence
            tokens = word_tokenize(sentence)
            
            # Initialize all labels as 'O'
            labels = ['O'] * len(tokens)
            
            # Create character to token index mapping for this sentence
            char_to_token = {}
            current_char = 0
            for token_idx, token in enumerate(tokens):
                for char_idx in range(current_char, current_char + len(token)):
                    char_to_token[char_idx] = token_idx
                current_char += len(token) + 1
            
            # Apply entity labels for this sentence
            has_tags = False
            for start, end, label in entities:
                # Adjust start and end positions relative to sentence
                rel_start = start - current_char_offset
                rel_end = end - current_char_offset
                
                # Skip if entity is not in this sentence
                if rel_start < 0 or rel_end > len(sentence):
                    continue
                
                try:
                    token_start = char_to_token[rel_start]
                    token_end = char_to_token[rel_end-1]
                    
                    # Apply labels to tokens
                    for i in range(token_start, token_end + 1):
                        labels[i] = label
                        has_tags = True
                except KeyError:
                    continue
            
            # Only add sentences that have tags other than 'O'
            if has_tags:
                processed_data.append({
                    'sentence': sentence,
                    'tokens': tokens,
                    'labels': labels
                })
            
            current_char_offset += len(sentence) + 1  # +1 for the period
    
    return pd.DataFrame(processed_data)

# Process all annotation files
dfs = []
for i in range(1, 6):
    file_path = f'Annotation{i}.json'
    df = process_annotation_file(file_path)
    dfs.append(df)

# Combine all dataframes
final_df = pd.concat(dfs, ignore_index=True)

# Print some statistics
print(f"Total number of annotated sentences: {len(final_df)}")
print("\nSample of the dataframe:")
print(final_df.head())

Total number of annotated sentences: 1213

Sample of the dataframe:
                                            sentence  \
0                           use instead foundation .   
1  like high spf feels really good put dries matt...   
2          think face better condition since using .   
3  downside shade range lightest still slightly d...   
4                      probably gives little color .   

                                              tokens  \
0                      [use, instead, foundation, .]   
1  [like, high, spf, feels, really, good, put, dr...   
2  [think, face, better, condition, since, using, .]   
3  [downside, shade, range, lightest, still, slig...   
4                [probably, gives, little, color, .]   

                                              labels  
0                                    [O, O, B_PT, O]  
1  [O, B-EF-POS, I-EF-POS, O, O, O, O, O, B-AP-PO...  
2                           [O, B_BP, O, O, O, O, O]  
3  [O, B-VP-NEG, I-VP-NEG, I-VP-NEG, O

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
final_df.head()
final_df.to_csv('SentenceAnnotated.csv')

In [12]:
# Set display options to show all rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

def analyze_class_distribution(sentences):
    # Flatten all tags into a single list
    all_tags = []
    for sentence_tags in sentences:
        all_tags.extend(sentence_tags)
    
    # Count occurrences
    tag_counts = Counter(all_tags)
    
    # Convert to DataFrame for better visualization
    df = pd.DataFrame.from_dict(tag_counts, orient='index', columns=['count'])
    df = df.sort_values('count', ascending=False)
    
    # Calculate percentages
    total = df['count'].sum()
    df['percentage'] = (df['count'] / total * 100).round(2)
    
    # Print statistics with clear separation
    print("="*50)
    print("TOTAL TAGS:", total)
    print("="*50)
    print("\nCLASS DISTRIBUTION:")
    print("-"*50)
    
    # Print each class with clear formatting
    for idx, row in df.iterrows():
        print(f"{idx:<15} Count: {row['count']:<8} Percentage: {row['percentage']}%")
    
    # Calculate imbalance metrics
    majority_class_size = df['count'].max()
    minority_class_size = df['count'][df['count'] > 0].min()  # Only consider classes that appear
    imbalance_ratio = majority_class_size / minority_class_size
    
    print("\n" + "="*50)
    print(f"Most common tag: {df.index[0]} (Count: {majority_class_size})")
    print(f"Least common tag: {df.index[-1]} (Count: {minority_class_size})")
    print(f"Imbalance Ratio (majority:minority): {imbalance_ratio:.2f}")
    print("="*50)

# Use the function
sentences = final_df['labels'].tolist()
analyze_class_distribution(sentences)

TOTAL TAGS: 16150

CLASS DISTRIBUTION:
--------------------------------------------------
O               Count: 11449.0  Percentage: 70.89%
B_PT            Count: 714.0    Percentage: 4.42%
B_BP            Count: 557.0    Percentage: 3.45%
I_PT            Count: 400.0    Percentage: 2.48%
B_IN            Count: 308.0    Percentage: 1.91%
I_IN            Count: 200.0    Percentage: 1.24%
I_BP            Count: 196.0    Percentage: 1.21%
B_BR            Count: 160.0    Percentage: 0.99%
B-VP-POS        Count: 155.0    Percentage: 0.96%
B-TX-POS        Count: 124.0    Percentage: 0.77%
I_BR            Count: 89.0     Percentage: 0.55%
I-SC-POS        Count: 84.0     Percentage: 0.52%
B-PK-POS        Count: 82.0     Percentage: 0.51%
B-SC-POS        Count: 76.0     Percentage: 0.47%
I-PK-POS        Count: 72.0     Percentage: 0.45%
B-SE-POS        Count: 70.0     Percentage: 0.43%
B-EF-POS        Count: 68.0     Percentage: 0.42%
I-VP-POS        Count: 66.0     Percentage: 0.41%
B-HY-POS 

In [14]:
def extract_low_frequency_sentences(df, min_count=50):
    # First get the tag distribution
    all_tags = []
    for tags in df['labels']:
        all_tags.extend(tags)
    tag_counts = Counter(all_tags)
    
    # Find tags that appear less than min_count times
    low_frequency_tags = {tag for tag, count in tag_counts.items() if count < min_count}
    
    # Print tags with low frequency
    print(f"Tags with count < {min_count}:")
    for tag in sorted(low_frequency_tags):
        print(f"{tag}: {tag_counts[tag]} occurrences")
    
    # Function to check if a sentence contains any low frequency tags
    def contains_low_frequency_tags(tags):
        return any(tag in low_frequency_tags for tag in tags)
    
    # Create new DataFrame with sentences containing low frequency tags
    low_frequency_df = df[df['labels'].apply(contains_low_frequency_tags)].copy()
    
    # Print statistics
    print("\nDataset Statistics:")
    print(f"Total sentences: {len(df)}")
    print(f"Sentences with low frequency tags: {len(low_frequency_df)}")
    
    return low_frequency_df

# Extract sentences with low frequency tags
low_frequency_df = extract_low_frequency_sentences(final_df, min_count=50)

# Save the low frequency dataset if needed
#low_frequency_df.to_csv('low_frequency_sentences.csv', index=False)

Tags with count < 50:
B-AB-NEG: 4 occurrences
B-AB-NEU: 4 occurrences
B-AB-POS: 47 occurrences
B-AP-NEG: 6 occurrences
B-AP-NEU: 10 occurrences
B-DU-NEG: 4 occurrences
B-DU-NEU: 2 occurrences
B-DU-POS: 21 occurrences
B-EF-NEG: 16 occurrences
B-EF-NEU: 13 occurrences
B-HY-NEG: 41 occurrences
B-HY-NEU: 6 occurrences
B-PE-NEG: 34 occurrences
B-PE-NEU: 7 occurrences
B-PE-POS: 41 occurrences
B-PK-NEG: 30 occurrences
B-QU-NEG: 3 occurrences
B-QU-NEU: 9 occurrences
B-QU-POS: 38 occurrences
B-SC-NEG: 31 occurrences
B-SC-NEU: 28 occurrences
B-SE-NEG: 39 occurrences
B-SE-NEU: 32 occurrences
B-TX-NEG: 29 occurrences
B-TX-NEU: 14 occurrences
B_SW: 20 occurrences
I-AB-NEG: 5 occurrences
I-AB-NEU: 3 occurrences
I-AB-POS: 24 occurrences
I-AP-NEG: 3 occurrences
I-AP-NEU: 4 occurrences
I-AP-POS: 21 occurrences
I-DU-NEG: 2 occurrences
I-DU-NEU: 3 occurrences
I-DU-POS: 11 occurrences
I-EF-NEG: 13 occurrences
I-EF-NEU: 1 occurrences
I-EF-POS: 34 occurrences
I-HY-NEG: 14 occurrences
I-HY-POS: 17 occurrence

In [15]:
low_frequency_df.head()

Unnamed: 0,sentence,tokens,labels
1,like high spf feels really good put dries matt...,"[like, high, spf, feels, really, good, put, dr...","[O, B-EF-POS, I-EF-POS, O, O, O, O, O, B-AP-PO..."
3,downside shade range lightest still slightly d...,"[downside, shade, range, lightest, still, slig...","[O, B-VP-NEG, I-VP-NEG, I-VP-NEG, O, B-AP-NEG,..."
5,also keeps getting greasy throughout day cool ...,"[also, keeps, getting, greasy, throughout, day...","[O, O, O, B-TX-NEU, O, O, O, O, O, O, O]"
7,use tinted sunscreens combat white cast zinc w...,"[use, tinted, sunscreens, combat, white, cast,...","[O, B_BP, I_BP, O, B-AP-NEG, I-AP-NEG, B_IN, O..."
9,certainly dewy finish dont like consider using...,"[certainly, dewy, finish, dont, like, consider...","[O, B-AP-NEU, O, O, O, O, O, B_PT, O, O, O, O,..."


In [3]:
final_complete_df = pd.read_csv('AugmentedData.csv')

In [5]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import torch
import ast

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

def prepare_dataset(df):
    """
    Prepare dataset for training by:
    1. Creating label mappings from the actual labels
    2. Tokenizing and aligning labels
    """
    # First, ensure labels are in the correct format (lists, not strings)
    df['labels'] = df['labels'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['tokens'] = df['tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    # Create label mappings from existing labels
    unique_labels = set()
    for labels in df['labels']:
        unique_labels.update(labels)
    
    # Sort labels to ensure consistent mapping
    unique_labels = sorted(list(unique_labels))
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}
    
    print(f"Number of unique labels: {len(label2id)}")
    print("\nLabel mappings:")
    for label, idx in label2id.items():
        print(f"{label}: {idx}")
    
    return df, label2id, id2label

# Let's verify the data format first
print("Sample of first row:")
print("Tokens:", final_complete_df['tokens'].iloc[0])
print("Labels:", final_complete_df['labels'].iloc[0])

# Prepare the dataset
print("\nPreparing dataset...")
processed_df, label2id, id2label = prepare_dataset(final_complete_df)

# Print first few examples to verify
print("\nVerifying processed data:")
for i in range(3):
    print(f"\nExample {i+1}:")
    print("Tokens:", processed_df['tokens'].iloc[i])
    print("Labels:", processed_df['labels'].iloc[i])

Sample of first row:
Tokens: ['use', 'instead', 'foundation', '.']
Labels: ['O', 'O', 'B_PT', 'O']

Preparing dataset...
Number of unique labels: 79

Label mappings:
B-AB-NEG: 0
B-AB-NEU: 1
B-AB-POS: 2
B-AP-NEG: 3
B-AP-NEU: 4
B-AP-POS: 5
B-DU-NEG: 6
B-DU-NEU: 7
B-DU-POS: 8
B-EF-NEG: 9
B-EF-NEU: 10
B-EF-POS: 11
B-HY-NEG: 12
B-HY-NEU: 13
B-HY-POS: 14
B-PE-NEG: 15
B-PE-NEU: 16
B-PE-POS: 17
B-PK-NEG: 18
B-PK-NEU: 19
B-PK-POS: 20
B-QU-NEG: 21
B-QU-NEU: 22
B-QU-POS: 23
B-SC-NEG: 24
B-SC-NEU: 25
B-SC-POS: 26
B-SE-NEG: 27
B-SE-NEU: 28
B-SE-POS: 29
B-TX-NEG: 30
B-TX-NEU: 31
B-TX-POS: 32
B-VP-NEG: 33
B-VP-NEU: 34
B-VP-POS: 35
B_BP: 36
B_BR: 37
B_IN: 38
B_PT: 39
B_SW: 40
I-AB-NEG: 41
I-AB-NEU: 42
I-AB-POS: 43
I-AP-NEG: 44
I-AP-NEU: 45
I-AP-POS: 46
I-DU-NEG: 47
I-DU-NEU: 48
I-DU-POS: 49
I-EF-NEG: 50
I-EF-NEU: 51
I-EF-POS: 52
I-HY-NEG: 53
I-HY-POS: 54
I-PE-NEG: 55
I-PE-POS: 56
I-PK-NEG: 57
I-PK-POS: 58
I-QU-NEG: 59
I-QU-POS: 60
I-SC-NEG: 61
I-SC-NEU: 62
I-SC-POS: 63
I-SE-NEG: 64
I-SE-NEU: 65
I-SE-P

In [6]:
def tokenize_and_align_labels(examples, tokenizer, label2id):
    """
    Tokenize the text and align the labels with the tokens
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors=None  # Return list instead of tensors
    )
    
    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens get -100
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)  # Subwords get -100
            previous_word_idx = word_idx
            
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Split the dataset
train_df, temp_df = train_test_split(processed_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert to features
def convert_to_features(df, tokenizer, label2id):
    return tokenize_and_align_labels(
        {
            'tokens': df['tokens'].tolist(),
            'labels': df['labels'].tolist()
        },
        tokenizer,
        label2id
    )

# Create features for each split
train_features = convert_to_features(train_df, tokenizer, label2id)
val_features = convert_to_features(val_df, tokenizer, label2id)
test_features = convert_to_features(test_df, tokenizer, label2id)

# Create PyTorch Dataset class
class ABSADataset(torch.utils.data.Dataset):
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return len(self.features['input_ids'])
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.features.items()}

# Create dataset objects
train_dataset = ABSADataset(train_features)
val_dataset = ABSADataset(val_features)
test_dataset = ABSADataset(test_features)

# Let's verify the processed datasets
print("\nVerifying processed datasets:")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Verify a sample
sample_idx = 0
sample = train_dataset[sample_idx]
print("\nSample verification:")
print("Input shape:", sample['input_ids'].shape)
print("Label shape:", sample['labels'].shape)

# Decode a sample to verify alignment
tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
labels = sample['labels']

print("\nToken-Label Alignment:")
for token, label_id in zip(tokens, labels):
    if label_id != -100:  # Only show non-special tokens
        label = id2label[label_id.item()]
        print(f"{token:15} -> {label}")


Verifying processed datasets:
Training samples: 10462
Validation samples: 2242
Test samples: 2242

Sample verification:
Input shape: torch.Size([212])
Label shape: torch.Size([212])

Token-Label Alignment:
▁love           -> B-PK-POS
▁product        -> B_PT
▁look           -> O
▁working        -> O
▁.              -> O


In [9]:
def verify_samples(dataset, tokenizer, id2label, num_samples=5):
    """
    Verify multiple samples from the dataset
    """
    print(f"\nVerifying {num_samples} random samples:")
    
    # Get random indices
    indices = np.random.randint(0, len(dataset), num_samples)
    
    for idx in indices:
        print(f"\nSample {idx}:")
        sample = dataset[idx]
        
        # Get original tokens and labels
        tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
        labels = sample['labels']
        
        # Print sequence information
        print(f"Sequence length: {len(tokens)}")
        print(f"Number of actual labels (excluding -100): {sum(1 for l in labels if l != -100)}")
        
        print("\nToken-Label Alignment:")
        print("-" * 50)
        print(f"{'Token':<20} {'Label':<20} {'Token ID':<10} {'Label ID':<10}")
        print("-" * 50)
        
        for token, label_id, token_id in zip(tokens, labels, sample['input_ids']):
            if label_id != -100:  # Only show non-special tokens
                label = id2label[label_id.item()]
                print(f"{token:<20} {label:<20} {token_id:<10} {label_id:<10}")
        
        # Verify special tokens
        special_tokens = [
            (i, token) for i, token in enumerate(tokens) 
            if token in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]
        ]
        print("\nSpecial tokens verification:")
        for pos, token in special_tokens:
            print(f"Position {pos}: {token} -> Label ID: {labels[pos].item()}")
        
        print("\n" + "="*70)

# Verify samples from each dataset
print("\nTRAINING DATASET SAMPLES:")
verify_samples(train_dataset, tokenizer, id2label, num_samples=3)

print("\nVALIDATION DATASET SAMPLES:")
verify_samples(val_dataset, tokenizer, id2label, num_samples=3)

print("\nTEST DATASET SAMPLES:")
verify_samples(test_dataset, tokenizer, id2label, num_samples=3)

# Additional distribution analysis
def analyze_label_distribution(dataset, id2label):
    label_counts = {}
    total_tokens = 0
    
    for i in range(len(dataset)):
        labels = dataset[i]['labels']
        for label in labels:
            if label != -100:
                label_name = id2label[label.item()]
                label_counts[label_name] = label_counts.get(label_name, 0) + 1
                total_tokens += 1
    
    print("\nLabel Distribution:")
    print("-" * 50)
    print(f"{'Label':<30} {'Count':<10} {'Percentage':<10}")
    print("-" * 50)
    
    for label, count in sorted(label_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_tokens) * 100
        print(f"{label:<30} {count:<10} {percentage:>6.2f}%")
    
    return label_counts

print("\nANALYZING LABEL DISTRIBUTIONS:")
print("\nTraining Set:")
train_dist = analyze_label_distribution(train_dataset, id2label)

print("\nValidation Set:")
val_dist = analyze_label_distribution(val_dataset, id2label)  # Fixed here

print("\nTest Set:")
test_dist = analyze_label_distribution(test_dataset, id2label)


TRAINING DATASET SAMPLES:

Verifying 3 random samples:

Sample 1493:
Sequence length: 212
Number of actual labels (excluding -100): 5

Token-Label Alignment:
--------------------------------------------------
Token                Label                Token ID   Label ID  
--------------------------------------------------
▁enter               O                    1916       78        
▁skin                B_BP                 1158       36        
▁smell               O                    4984       78        
▁moisturized         B-HY-NEG             65034      12        
▁.                   O                    323        78        

Special tokens verification:
Position 0: [CLS] -> Label ID: -100
Position 6: [SEP] -> Label ID: -100
Position 7: [PAD] -> Label ID: -100
Position 8: [PAD] -> Label ID: -100
Position 9: [PAD] -> Label ID: -100
Position 10: [PAD] -> Label ID: -100
Position 11: [PAD] -> Label ID: -100
Position 12: [PAD] -> Label ID: -100
Position 13: [PAD] -> Label ID: -1

In [10]:
def fix_bio_scheme(df):
    """
    Fix BIO scheme consistency:
    - Change I- tags to B- tags if they don't have a preceding B- tag
    """
    fixed_count = 0
    
    def fix_sequence_labels(labels):
        fixed = False
        new_labels = labels.copy()
        
        for i, label in enumerate(labels):
            if label.startswith('I-'):
                # Check if there's a matching B- tag before this I- tag
                prefix = label[2:]  # Get the part after 'I-'
                if i == 0 or not labels[i-1].startswith('B-') or not labels[i-1][2:] == prefix:
                    new_labels[i] = 'B-' + prefix
                    fixed = True
                    
        return new_labels, fixed
    
    # Create a copy of the dataframe
    fixed_df = df.copy()
    
    # Fix labels for each row
    for idx, row in fixed_df.iterrows():
        new_labels, was_fixed = fix_sequence_labels(row['labels'])
        if was_fixed:
            fixed_count += 1
            fixed_df.at[idx, 'labels'] = new_labels
    
    print(f"Fixed {fixed_count} sequences with incorrect BIO scheme")
    
    return fixed_df

# Fix the BIO scheme in processed_df
fixed_processed_df = fix_bio_scheme(processed_df)

# Let's verify a few examples where changes were made
def compare_labels(original_df, fixed_df):
    print("\nComparing original and fixed labels:")
    print("-" * 70)
    
    for idx in range(len(original_df)):
        orig_labels = original_df.iloc[idx]['labels']
        fixed_labels = fixed_df.iloc[idx]['labels']
        
        if orig_labels != fixed_labels:
            print(f"\nExample {idx}:")
            print("Tokens:", original_df.iloc[idx]['tokens'])
            print("Original:", orig_labels)
            print("Fixed:   ", fixed_labels)
            print("-" * 70)
            
            # Only show first 5 examples
            if idx >= 4:
                print("... more examples exist ...")
                break

# Compare original and fixed labels
compare_labels(processed_df, fixed_processed_df)

# Use the fixed dataframe for further processing
processed_df = fixed_processed_df

# Re-run the dataset creation with fixed labels
# Split the dataset
train_df, temp_df = train_test_split(processed_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Create features for each split
train_features = convert_to_features(train_df, tokenizer, label2id)
val_features = convert_to_features(val_df, tokenizer, label2id)
test_features = convert_to_features(test_df, tokenizer, label2id)

# Create dataset objects
train_dataset = ABSADataset(train_features)
val_dataset = ABSADataset(val_features)
test_dataset = ABSADataset(test_features)

# Verify a sample after fixing
print("\nVerifying a sample after BIO scheme fix:")
sample_idx = 0
sample = train_dataset[sample_idx]
tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
labels = sample['labels']

print("\nToken-Label Alignment:")
print("-" * 50)
print(f"{'Token':<20} {'Label':<20} {'Token ID':<10} {'Label ID':<10}")
print("-" * 50)
for token, label_id, token_id in zip(tokens, labels, sample['input_ids']):
    if label_id != -100:  # Only show non-special tokens
        label = id2label[label_id.item()]
        print(f"{token:<20} {label:<20} {token_id:<10} {label_id:<10}")

Fixed 4433 sequences with incorrect BIO scheme

Comparing original and fixed labels:
----------------------------------------------------------------------

Example 3:
Tokens: ['downside', 'shade', 'range', 'lightest', 'still', 'slightly', 'darker', 'natural', 'color', 'not', 'deal', 'breaker', '.']
Original: ['O', 'B-VP-NEG', 'I-VP-NEG', 'I-VP-NEG', 'O', 'B-AP-NEG', 'I-AP-NEG', 'O', 'O', 'O', 'O', 'O', 'O']
Fixed:    ['O', 'B-VP-NEG', 'I-VP-NEG', 'B-VP-NEG', 'O', 'B-AP-NEG', 'I-AP-NEG', 'O', 'O', 'O', 'O', 'O', 'O']
----------------------------------------------------------------------

Example 13:
Tokens: ['research', 'done', 'company', 'claim', 'high', 'spf', 'ppd', 'rating', 'used', 'european', 'standards', 'test', 'better', 'us', '.']
Original: ['O', 'O', 'O', 'O', 'B-EF-POS', 'I-EF-POS', 'I-EF-POS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Fixed:    ['O', 'O', 'O', 'O', 'B-EF-POS', 'I-EF-POS', 'B-EF-POS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
------------------------------------

In [17]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16164 sha256=287366df3f618fc6d7598b0ce15a83ca641ab51cc8a9f2e480a75c0819471910
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
[0m

In [22]:
'''def save_processed_data(df, filename='fixed_dataset.pkl'):
    """
    Save the processed dataframe with all its structure intact
    """
    df.to_pickle(filename)
    print(f"Dataset saved to {filename}")'''

def load_processed_data(filename='fixed_dataset.pkl'):
    """
    Load the processed dataframe with all its structure intact
    """
    try:
        df = pd.read_pickle(filename)
        print(f"Successfully loaded dataset from {filename}")
        print(f"Dataset size: {len(df)} rows")
        return df
    except FileNotFoundError:
        print(f"Error: File {filename} not found")
        return None

# Save the fixed dataset
save_processed_data(fixed_processed_df)

# Later, you can load the dataset using:
processed_df = load_processed_data()

# Verify the loaded data
if processed_df is not None:
    print("\nVerifying loaded data:")
    print(f"Number of rows: {len(processed_df)}")
    print("\nSample row:")
    sample_row = processed_df.iloc[0]
    print("Tokens:", sample_row['tokens'])
    print("Labels:", sample_row['labels'])

Dataset saved to fixed_dataset.pkl
Successfully loaded dataset from fixed_dataset.pkl
Dataset size: 14946 rows

Verifying loaded data:
Number of rows: 14946

Sample row:
Tokens: ['use', 'instead', 'foundation', '.']
Labels: ['O', 'O', 'B_PT', 'O']


In [19]:
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_metric
import numpy as np

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")
if torch.cuda.is_available():
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Set up the model with GPU
num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
).to(device)  # Move model to GPU

# First, install seqeval


# Then update the training code:
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Define metrics for evaluation
def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Convert our custom tags to standard NER format
    def convert_to_ner_format(tags):
        return [[tag.replace('_', '-') if tag != 'O' else tag for tag in seq] for seq in tags]

    # Convert both predictions and labels to NER format
    true_predictions = convert_to_ner_format(true_predictions)
    true_labels = convert_to_ner_format(true_labels)

    seqeval_metric = load_metric("seqeval")
    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Rest of the code remains the same...

# Define training arguments with GPU settings
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    no_cuda=False,
    #fp16=True,
    dataloader_num_workers=4,
    # Show training loss but no other logging
    logging_strategy="steps",
    logging_steps=100,
    report_to="none"
)

# Rest of the code remains the same
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Print training dataset size and batch size info
print("\nTraining Configuration:")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")



Using device: cuda
GPU Model: NVIDIA RTX A4000
Available GPU memory: 15.73 GB


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training Configuration:
Training samples: 10462
Validation samples: 2242
Batch size: 8
Number of epochs: 3
Learning rate: 2e-05


In [20]:

# Train the model
print("\nStarting training...")
trainer.train()

# Evaluate the model
print("\nEvaluating on validation set...")
eval_results = trainer.evaluate()
print(eval_results)

# Print GPU memory usage after training
if torch.cuda.is_available():
    print("\nGPU Memory Summary:")
    print(torch.cuda.memory_summary())


Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.3211,0.695545,0.732778,0.713676,0.910069
2,No log,0.19599,0.78126,0.811506,0.796096,0.938323
3,No log,0.169112,0.813114,0.832811,0.822844,0.946553


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Evaluating on validation set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.16911153495311737, 'eval_precision': 0.8131137155527399, 'eval_recall': 0.8328106412890667, 'eval_f1': 0.822844320974463, 'eval_accuracy': 0.9465534070224497, 'eval_runtime': 9.1219, 'eval_samples_per_second': 245.781, 'eval_steps_per_second': 30.805, 'epoch': 3.0}

GPU Memory Summary:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   5110 MiB |   7071 MiB |  56244 GiB |  56239 GiB |
|       from large pool |   5105 MiB |   7035 MiB |  54905 GiB |  54900 GiB |
|       from small pool |      5 MiB |     61 MiB |   1339 GiB |   1339 GiB |
|------------------------------------------------------------------------



In [None]:
def predict_aspects(text, model=model, tokenizer=tokenizer):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
    
    # Convert predictions to labels
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    predictions = predictions[0].cpu().numpy()
    
    # Align predictions with tokens
    results = []
    current_aspect = []
    current_label = None
    
    for i, (token, pred_id) in enumerate(zip(tokens, predictions)):
        # Skip special tokens and their predictions
        if token in ['[CLS]', '[SEP]', '[PAD]']:
            continue
            
        if pred_id != -100:
            try:
                pred_label = id2label[pred_id]
                
                # Handle subword tokens (tokens starting with '▁' or '##')
                cleaned_token = token.replace('▁', '').replace('##', '')
                
                if pred_label != 'O':
                    if pred_label.startswith('B-'):
                        # Save previous aspect if exists
                        if current_aspect:
                            results.append((' '.join(current_aspect), current_label))
                        current_aspect = [cleaned_token]
                        current_label = pred_label[2:]  # Remove B- prefix
                    elif pred_label.startswith('I-'):
                        if current_aspect:  # Only append if we have a current aspect
                            current_aspect.append(cleaned_token)
                else:
                    # Save previous aspect if exists
                    if current_aspect:
                        results.append((' '.join(current_aspect), current_label))
                        current_aspect = []
                        current_label = None
            except KeyError:
                print(f"Warning: Unknown label ID {pred_id}")
                continue
    
    # Add final aspect if exists
    if current_aspect:
        results.append((' '.join(current_aspect), current_label))
    
    return results

# Test the updated function
test_texts = [
    "The sunscreen absorbs easily.",
    "The seal of the bottle was open but the moisturizer was good.",
    "This cream has a strong fragrance but works well.",
]

print("\nTesting predictions on example texts:")
for text in test_texts:
    print("\nText:", text)
    aspects = predict_aspects(text)
    print("Detected aspects and sentiments:")
    for aspect, sentiment in aspects:
        print(f"- {aspect}: {sentiment}")

# For error analysis, we need to modify the comparison function
def analyze_errors(dataset, model, tokenizer, n_samples=50):
    """Analyze prediction errors on a sample of the dataset"""
    errors = []
    
    # Get random sample indices
    sample_indices = np.random.choice(len(dataset), min(n_samples, len(dataset)), replace=False)
    
    for idx in sample_indices:
        # Get sample
        sample = dataset[idx]
        text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
        
        # Get predictions
        pred_aspects = predict_aspects(text)
        
        # Get true labels (handle tensor values)
        try:
            true_labels = []
            current_aspect = []
            current_label = None
            
            for label in sample['labels']:
                if label != -100:
                    label_str = id2label[label.item()]  # Convert tensor to int
                    if label_str.startswith('B-'):
                        if current_aspect:
                            true_labels.append((' '.join(current_aspect), current_label))
                        current_aspect = []
                        current_label = label_str[2:]
                    elif label_str.startswith('I-'):
                        current_label = label_str[2:]
            
            if current_aspect:
                true_labels.append((' '.join(current_aspect), current_label))
                
            # Compare predictions with true labels
            if pred_aspects != true_labels:
                errors.append({
                    'text': text,
                    'predicted': pred_aspects,
                    'true': true_labels
                })
        except KeyError as e:
            print(f"Warning: Unknown label ID encountered: {e}")
            continue
    
    return errors

Evaluating on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Test Set Results:
{'eval_loss': 0.1695667803287506, 'eval_precision': 0.8083361046447178, 'eval_recall': 0.8286363636363636, 'eval_f1': 0.8183603613714157, 'eval_accuracy': 0.9465161923454367, 'eval_runtime': 8.9462, 'eval_samples_per_second': 250.609, 'eval_steps_per_second': 31.41, 'epoch': 3.0}

Testing predictions on example texts:

Text: The sunscreen absorbs easily.
Detected aspects and sentiments:
- [CLS]: None
- ▁absorbs ▁easily: AB-NEG
- [SEP]: SE-NEU

Text: The seal of the bottle was open but the moisturizer was good.
Detected aspects and sentiments:
- [CLS]: None
- ▁bottle: PK-NEU
- [SEP]: SE-NEU

Performing error analysis...


KeyError: tensor(78)