In [None]:
import os
import string
from collections import Counter

import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Pull the GPT-2 tokenizer and the language-model head from the same checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prefer the GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Last expression of the cell: moves the weights and echoes the module repr
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768

In [87]:
"""
# Encode input text
input_text = "Today the whether is"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Check if pad_token_id is None and handle it
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_mask = input_ids.ne(pad_token_id).float()

# Generate text with adjusted parameters
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1000,
    num_return_sequences=1,
    do_sample=True,           # Enable sampling
    top_k=50,                 # Top-k sampling
    top_p=0.95,               # Top-p sampling
    temperature=0.7,          # Temperature
    repetition_penalty=1.2,   # Penalty for repetition
    pad_token_id=pad_token_id
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
"""

'\n# Encode input text\ninput_text = "Today the whether is"\ninput_ids = tokenizer.encode(input_text, return_tensors=\'pt\').to(device)\n\n# Check if pad_token_id is None and handle it\npad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id\nattention_mask = input_ids.ne(pad_token_id).float()\n\n# Generate text with adjusted parameters\noutput = model.generate(\n    input_ids,\n    attention_mask=attention_mask,\n    max_length=1000,\n    num_return_sequences=1,\n    do_sample=True,           # Enable sampling\n    top_k=50,                 # Top-k sampling\n    top_p=0.95,               # Top-p sampling\n    temperature=0.7,          # Temperature\n    repetition_penalty=1.2,   # Penalty for repetition\n    pad_token_id=pad_token_id\n)\n\n# Decode and print the generated text\ngenerated_text = tokenizer.decode(output[0], skip_special_tokens=True)\nprint(generated_text)\n'

In [88]:
def create_bag_of_words(folder_path):
    """
    Load one word list per topic from the `.txt` files in a folder.

    Each regular file named `<topic>.txt` directly inside `folder_path` becomes
    one entry: the key is the file name without its extension and the value is
    the list of lines in the file, in file order. Lines are stripped of
    surrounding whitespace and blank lines are skipped, so a trailing newline
    or stray spaces never produce an empty or padded "word".

    :param folder_path: Directory to scan (sub-directories are not visited).
    :return: Dictionary mapping topic name to its list of words.
    :raises OSError: If the folder or one of its files cannot be read.
    """
    bag_of_words = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the topic
    # order of the resulting dictionary reproducible across machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Skip directories and anything that is not a `.txt` word list.
        if not (os.path.isfile(file_path) and filename.endswith('.txt')):
            continue

        topic = os.path.splitext(filename)[0]
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file.read().splitlines())
            bag_of_words[topic] = [word for word in stripped_lines if word]

    return bag_of_words

# Directory scanned by create_bag_of_words() for `<topic>.txt` word lists
folder_path = 'wordlists'
topic_bow = create_bag_of_words(folder_path)
# Exclude the "positive_words" list from the topics. pop() with a default keeps
# the cell from raising KeyError when that word list is not present.
topic_bow.pop("positive_words", None)
print("Bag of Words:", topic_bow)

Bag of Words: {'science': ['astronomy', 'atom', 'biology', 'cell', 'chemical', 'chemistry', 'climate', 'control', 'data', 'electricity', 'element', 'energy', 'evolution', 'experiment', 'fact', 'flask', 'fossil', 'funnel', 'genetics', 'gravity', 'hypothesis', 'lab', 'laboratory', 'laws', 'mass', 'matter', 'measure', 'microscope', 'mineral', 'molecule', 'motion', 'observe', 'organism', 'particle', 'phase', 'physics', 'research', 'scale', 'science', 'scientist', 'telescope', 'temperature', 'theory', 'tissue', 'variable', 'volume', 'weather', 'weigh'], 'legal': ['affidavit', 'allegation', 'appeal', 'appearance', 'argument', 'arrest', 'assault', 'attorney', 'bail', 'bankrupt', 'bankruptcy', 'bar', 'bench', 'warrant', 'bond', 'booking', 'capital', 'crime', 'case', 'chambers', 'claim', 'complainant', 'complaint', 'confess', 'confession', 'constitution', 'constitutional', 'contract', 'counsel', 'court', 'custody', 'damages', 'decree', 'defendant', 'defense', 'deposition', 'discovery', 'equity'

In [89]:
def bag_of_words_classification(text, topic_dict):
    """
    Pick the topic whose word list has the most hits in `text`.

    The text is lower-cased and split on whitespace, and each token has its
    leading and trailing punctuation removed, so "matter," and "Energy." count
    as "matter" and "energy". A topic's score is the total number of
    occurrences of its (lower-cased) words among those tokens. Matching is per
    token, so a topic entry that itself contains whitespace can never match.

    :param text: Text to classify.
    :param topic_dict: Dictionary where keys are topics and values are iterables of words.
    :return: The highest-scoring topic; on a tie (including all scores being
             zero) the first such topic in `topic_dict` iteration order.
    :raises ValueError: If `topic_dict` is empty.
    """
    if not topic_dict:
        raise ValueError("topic_dict must contain at least one topic")

    # Strip surrounding punctuation only; inner characters such as the hyphen
    # in "real-time" are kept so such words still count as a single token.
    stripped_tokens = (token.strip(string.punctuation) for token in text.lower().split())
    word_counts = Counter(token for token in stripped_tokens if token)

    # Counter returns 0 for unseen words, so absent topic words add nothing.
    scores = {
        topic: sum(word_counts[word.lower()] for word in topic_words)
        for topic, topic_words in topic_dict.items()
    }
    return max(scores, key=scores.get)

In [90]:
# Two hand-written probe sentences for the keyword classifier
text1 = "The recent breakthrough in quantum physics has allowed scientists to explore new dimensions of energy and matter, potentially revolutionizing our understanding of the universe."
text2 = "The military has deployed advanced drone technology to enhance surveillance and reconnaissance missions, providing real-time intelligence and improving strategic decision-making."

# Print the winning topic for each probe, one per line
for probe_text in (text1, text2):
    print(bag_of_words_classification(probe_text, topic_bow))

science
military


In [91]:
class BoWAttributeModel:
    """
    Bag-of-words attribute scorer over a language model's vocabulary.

    A fixed weight vector ("mask") over token IDs is built once per topic at
    construction time; `forward` then scores logits against each mask.
    """

    def __init__(self, topics_dict, tokenizer, vocab_size=50257):
        """
        Initialize the BoWAttributeModel with a dictionary of topics.

        :param topics_dict: Dictionary where keys are topics and values are lists of words.
        :param tokenizer: Tokenizer to convert words to token IDs; must provide
                          `encode(text, add_special_tokens=False)` returning a list of ints.
        :param vocab_size: Size of the vocabulary (length of every mask).
        """
        self.topics_dict = topics_dict
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.topic_masks = self._build_topic_masks()

    def _build_topic_masks(self):
        """
        Build a mask for each topic.

        Each mask is a tensor of shape (vocab_size,) holding 1/len(words) at
        every token ID produced by any word of the topic and 0.0 elsewhere.
        Every word is encoded twice, bare and with a leading space, and the
        token IDs of both encodings are marked.

        :return: Dictionary mapping each topic to its mask tensor.
        """
        topic_masks = {}
        for topic, words in self.topics_dict.items():
            mask = torch.zeros(self.vocab_size)
            # Same weight for every token of the topic; guard the empty list.
            weight = 1.0 / len(words) if words else 0.0
            for word in words:
                # Tokenize the word without adding special tokens
                token_ids = self.tokenizer.encode(word, add_special_tokens=False)
                # Tokenize with space prefix (handles cases like 'Ġword')
                token_ids_with_space = self.tokenizer.encode(" " + word, add_special_tokens=False)
                # In case the word is tokenized into multiple tokens, we mark
                # all of them, for both the bare and the space-prefixed form.
                for token_id in (*token_ids, *token_ids_with_space):
                    if 0 <= token_id < self.vocab_size:  # Safety check
                        mask[token_id] = weight
            topic_masks[topic] = mask
        return topic_masks

    def forward(self, logits):
        """
        Score logits against every topic mask.

        For each topic the result is the mask-weighted sum of the raw logits
        over the last (vocabulary) axis. No softmax or log is applied, so the
        values are not normalised log-probabilities.

        NOTE(review): the softmax / log steps were commented out in the
        original implementation; confirm that raw-logit scoring is intended.

        :param logits: Tensor whose last dimension is vocab_size, e.g.
                       (batch_size, seq_len, vocab_size) or (batch_size, vocab_size).
        :return: Dictionary mapping each topic to a tensor of shape `logits.shape[:-1]`.
        """
        topic_scores = {}

        for topic, mask in self.topic_masks.items():
            mask = mask.to(logits.device)
            # Broadcasting on the last axis works for any number of leading
            # dimensions, unlike a fixed view(1, 1, -1).
            topic_scores[topic] = torch.sum(logits * mask, dim=-1)

        return topic_scores

In [92]:
#bow_model = BoWAttributeModel(bow, tokenizer)

In [93]:
"""
phrase1 = "The recent breakthrough in quantum physics has allowed scientists to explore new dimensions of energy and matter, potentially revolutionizing our understanding of the universe."
phrase1 = "climate control data electricity element energy evolution experiment fact flask fossil"
phrase2 = "The military has deployed advanced drone technology to enhance surveillance and reconnaissance missions, providing real-time intelligence and improving strategic decision-making."

# Encode the input text
input_ids1 = tokenizer.encode(phrase1.lower(), return_tensors='pt').to(device)
input_ids2 = tokenizer.encode(phrase2.lower(), return_tensors='pt').to(device)

# Get logits from the model
with torch.no_grad():
    outputs1 = model(input_ids1)
    outputs2 = model(input_ids2)
    logits1 = outputs1.logits
    logits2 = outputs2.logits

# Compute log likelihoods for each topic
log_probs1 = bow_model.forward(logits1)
log_probs2 = bow_model.forward(logits2)

# Print the log probabilities for each topic
print("Log Probabilities for Phrase 1 (Science):")
for topic, log_prob in log_probs1.items():
    print(f"Topic: {topic}, Log Probability: {log_prob}")

print("\nLog Probabilities for Phrase 2 (Military):")
for topic, log_prob in log_probs2.items():
    print(f"Topic: {topic}, Log Probability: {log_prob}")
"""

'\nphrase1 = "The recent breakthrough in quantum physics has allowed scientists to explore new dimensions of energy and matter, potentially revolutionizing our understanding of the universe."\nphrase1 = "climate control data electricity element energy evolution experiment fact flask fossil"\nphrase2 = "The military has deployed advanced drone technology to enhance surveillance and reconnaissance missions, providing real-time intelligence and improving strategic decision-making."\n\n# Encode the input text\ninput_ids1 = tokenizer.encode(phrase1.lower(), return_tensors=\'pt\').to(device)\ninput_ids2 = tokenizer.encode(phrase2.lower(), return_tensors=\'pt\').to(device)\n\n# Get logits from the model\nwith torch.no_grad():\n    outputs1 = model(input_ids1)\n    outputs2 = model(input_ids2)\n    logits1 = outputs1.logits\n    logits2 = outputs2.logits\n\n# Compute log likelihoods for each topic\nlog_probs1 = bow_model.forward(logits1)\nlog_probs2 = bow_model.forward(logits2)\n\n# Print the 