In [None]:
# Mount Google Drive at /content/drive/ so the project JSON files read by the
# following cells are reachable.
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import json
# Load the training questions. The encoding is passed explicitly so the read
# does not depend on the platform's default locale.
with open('/content/drive/MyDrive/646_Project/train_questions.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
def extract_non_template_part(input_text):
    """Return ``input_text`` with the fixed task instruction removed.

    Every occurrence of the instruction template is deleted, then leading and
    trailing whitespace is stripped from what remains.
    """
    instruction = "Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink] article:"

    remainder = input_text.replace(instruction, '')
    return remainder.strip()

def phi_q(inputs):
    """Build the query lookup: integer item id -> article text without the template.

    Each element of ``inputs`` is indexed with the keys ``'input'`` and ``'id'``.
    """
    def _id_text_pairs():
        for record in inputs:
            # Strip the instruction first, then convert the id to int
            article_text = extract_non_template_part(record['input'])
            yield int(record['id']), article_text

    return dict(_id_text_pairs())

# Build the id -> article-text lookup from the loaded training questions.
queries = phi_q(data)


In [None]:
# Keep only the first 1000 queries, in insertion order.
queries = {query_id: text for query_id, text in list(queries.items())[:1000]}

In [None]:
# Number of queries kept after the slice in the previous cell.
len(queries)

1000

In [None]:
# Map each user id (as int) to the list of formatted profile strings.
user_profiles = {}

for user_data in data:
    # Convert the id once so the lookup and the stored key always agree.
    # Testing the raw id against int keys never matches when ids are strings,
    # which would reset the list whenever an id appears more than once.
    user_id = int(user_data['id'])
    profile_strings = user_profiles.setdefault(user_id, [])

    # One formatted "article -> category" sentence per profile entry
    for profile_entry in user_data['profile']:
        profile_strings.append(
            f"For article '{profile_entry['text']}' the category is '{profile_entry['category']}'"
        )


In [None]:
# Take the first 1000 key-value pairs (despite the variable name, the slice
# below is 1000, not 6).
first_six_profiles = dict(list(user_profiles.items())[:1000])

In [None]:
# Number of user profiles kept after the slice in the previous cell.
len(first_six_profiles)

1000

In [None]:
# Only the last expression of a cell is displayed, so the bare name on the
# first line produces no output; the result shown is the profile list stored
# under id 100 (or [] if that id is absent).
first_six_profiles
first_six_profiles.get(100, [])

["For article 'The three make a trip of atypical opera themes, but no new opera brought the Met as much controversy as Klinghoffer.' the category is 'culture & arts'",
 "For article 'Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer.' the category is 'religion'",
 "For article 'I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong.' the category is 'religion'",
 "For article 'Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land th

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np

# Load tokenizer and BERT model
# Both come from the 'bert-base-uncased' checkpoint; TFBertModel is the
# TensorFlow variant, matching the "tf" tensors requested from the tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def _build_scoring_head():
    """Create the dense layers that map a pooled interaction vector to one score."""
    layers = tf.keras.layers
    return layers.Dense(128, activation='relu'), layers.Dropout(0.2), layers.Dense(1)


def transformer_kernel_model(queries, user_profiles, tokenizer, bert_model, max_length=512, top_k=3):
    """Rank each query's profile documents and concatenate the best ``top_k``.

    For every query, the documents stored under the same id in
    ``user_profiles`` are encoded with ``bert_model``. The query/document token
    interaction matrix is mean-pooled over the document axis and passed through
    a small dense head to produce one score per document. The highest-scoring
    documents are joined with single spaces, best first.

    NOTE(review): the scoring head is created with fresh random weights on each
    call and is not trained anywhere in this function, so the ranking it
    produces is not a learned relevance signal. Train or load weights for the
    head before relying on the order.

    Args:
        queries: mapping of query id -> query text.
        user_profiles: mapping of query id -> list of document strings.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` TensorFlow tensors.
        bert_model: encoder whose output exposes ``last_hidden_state``.
        max_length: padding/truncation length used for every text.
        top_k: number of documents to keep per query (values <= 0 keep none).

    Returns:
        dict mapping query id -> concatenated top documents ('' when the query
        has no documents).
    """
    concatenated_top_docs_dict = {}
    # Built lazily, once, and shared by every document so scores are comparable.
    scoring_head = None

    for query_id, query_text in queries.items():
        documents = user_profiles.get(query_id, [])
        if not documents:
            # Nothing to rank: skip tokenisation and the BERT forward pass.
            concatenated_top_docs_dict[query_id] = ''
            continue

        if scoring_head is None:
            scoring_head = _build_scoring_head()
        hidden_layer, dropout_layer, output_layer = scoring_head

        encoded_query = tokenizer(query_text, padding='max_length', truncation=True, max_length=max_length, return_tensors="tf")
        query_embedding = bert_model(encoded_query['input_ids'], attention_mask=encoded_query['attention_mask']).last_hidden_state

        scores = []
        for doc in documents:
            # Encode one document at a time so only one embedding is held in memory.
            encoded_doc = tokenizer(doc, padding='max_length', truncation=True, max_length=max_length, return_tensors="tf")
            doc_embedding = bert_model(encoded_doc['input_ids'], attention_mask=encoded_doc['attention_mask']).last_hidden_state

            # Token-by-token similarity, mean-pooled over the document tokens
            interaction = tf.matmul(query_embedding, doc_embedding, transpose_b=True)
            pooled_interaction = tf.reduce_mean(interaction, axis=-1)
            score = output_layer(dropout_layer(hidden_layer(pooled_interaction)))
            scores.append(float(score.numpy().flatten()[0]))

        # Indices sorted by descending score, truncated to the requested count
        top_document_indices = np.argsort(scores)[::-1][:max(top_k, 0)]
        ranked_documents = [documents[idx] for idx in top_document_indices]
        concatenated_top_docs_dict[query_id] = ' '.join(ranked_documents)

    return concatenated_top_docs_dict

# Calculate the top documents for each query.
# `first_six_profiles` is passed directly, so the full `user_profiles` mapping
# built earlier is not overwritten by the truncated one.
concatenated_top_docs_dict = transformer_kernel_model(queries, first_six_profiles, tokenizer, bert_model)

# Display the concatenated top documents
for query_id, top_docs in concatenated_top_docs_dict.items():
    print(f"Query ID {query_id} top documents: {top_docs}")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
def prompt_generation(queries, concatenated_top_docs_dict, common_string):
    """Assemble one prompt per entry of ``concatenated_top_docs_dict``.

    Each prompt is ``common_string``, then the query text looked up by id
    (empty string when the id is missing from ``queries``), then the retrieved
    documents wrapped in fixed instruction sentences. The result is a dict
    keyed by query id, in the iteration order of ``concatenated_top_docs_dict``.
    """
    def _render(query_text, top_docs):
        # Single fixed layout shared by every prompt
        return f"{common_string} {query_text}. Consider the user's profile and past preferences in similar contexts to categorise the article. For reference, here are articles related to the user's profile that were previously categorized in the order of their relevance: {top_docs} Use these examples and the user's profile to determine the most fitting category for the new article, ensuring the choice is tailored to the user's specific interests and past content interactions."

    return {
        query_id: _render(queries.get(query_id, ""), docs)
        for query_id, docs in concatenated_top_docs_dict.items()
    }


# Instruction prefix placed in front of every query text when building prompts.
template = "Based on the user's profile and interests, please categorize the following article, selecting from these categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink]. The article for categorization is:"

# Generate the prompts
new_prompts = prompt_generation(queries, concatenated_top_docs_dict, template)

# Display the new prompts
# NOTE(review): this prints one line per prompt. The saved output of this cell
# starts with a different instruction text than `template`, so it appears to
# come from an earlier version of the cell; re-run to refresh it.
for query_id, prompt in new_prompts.items():
    print(f"Prompt for Query ID {query_id}: {prompt}")

Prompt for Query ID 100: Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink], The article that needs to be categorised is: It's hard to find a restaurant that doesn't now place a little card at your table inquiring if the establishment was: (a) really awful; (b) tolerable; (c) sublime.. These are the top 3 relevant documents related to the user profile for this given article : For article 'The three make a trip of atypical opera themes, but no new opera brought the Met as much controversy as Klinghoffer.' the category is 'culture & arts' For article 'Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessi

In [None]:
# Load the gold outputs. The encoding is passed explicitly so the read does
# not depend on the platform's default locale.
with open('/content/drive/MyDrive/646_Project/train_outputs.json', 'r', encoding='utf-8') as file:
    outputs = json.load(file)

In [None]:
# Re-key the gold labels as {int id: output string}.
# The guard makes the cell safe to re-run: once `outputs` has been re-keyed it
# no longer has a "golds" entry, and the conversion is skipped instead of failing.
if "golds" in outputs:
    outputs = {int(entry['id']): entry['output'] for entry in outputs["golds"]}

In [None]:
!pip install sentencepiece



In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader

# Assuming you have 'inputs' as your texts and 'outputs' as your labels

# Custom Dataset class
class CustomDataset(Dataset):
    """Tokenised (prompt, label) pairs for sequence-to-sequence fine-tuning.

    Args:
        tokenizer: object providing ``encode_plus`` and ``encode``; its
            ``pad_token_id`` (if any) marks the label positions to mask.
        texts: sequence of input strings.
        labels: sequence of target strings, indexed in parallel with ``texts``.
        max_length: padding/truncation length for both inputs and labels.
    """

    # Label value ignored by the cross-entropy loss of Hugging Face seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, texts, labels, max_length):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D tensors."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize input and label
        encoded_input = self.tokenizer.encode_plus(
            f"Determine the primary category of the following text: '{text}'",
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        label_ids = self.tokenizer.encode(
            label,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        ).flatten()

        # Mask padding in the targets so the loss is computed only on real
        # label tokens rather than on the (much longer) run of pad tokens.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            label_ids = label_ids.masked_fill(label_ids == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': encoded_input['input_ids'].flatten(),
            'attention_mask': encoded_input['attention_mask'].flatten(),
            'labels': label_ids
        }

# Load tokenizer and model
# NOTE: this rebinds `tokenizer`, which earlier in the notebook held the BERT
# tokenizer passed to transformer_kernel_model; re-running that retrieval cell
# after this one would hand it the T5 tokenizer instead.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Prepare the dataset
max_length = 512  # or any other value suitable for your data

# Pair every prompt with the gold label that has the same id. Zipping the
# values of two independently built dicts by position would silently mislabel
# examples whenever they differ in order or length (the prompts are a
# 1000-item slice, the gold labels are not).
missing_ids = [query_id for query_id in new_prompts if query_id not in outputs]
if missing_ids:
    raise ValueError(
        f"No gold label for {len(missing_ids)} prompt id(s), e.g. {missing_ids[:5]}"
    )
train_texts = list(new_prompts.values())
train_labels = [outputs[query_id] for query_id in new_prompts]

dataset = CustomDataset(tokenizer, train_texts, train_labels, max_length)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)  # Adjust batch_size as needed

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)



In [None]:
# Training loop
num_epochs = 200  # or any other number of epochs
# Switch the model to training mode. As the last expression of the cell, the
# value returned by train() is what gets displayed (the module summary below).
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
def compute_accuracy(predictions, labels, ignore_index=-100):
    """Token-level accuracy of ``predictions`` against ``labels``.

    Args:
        predictions: logits whose last dimension is the vocabulary; the
            predicted id is the argmax over that dimension.
        labels: integer tensor with the same shape as the argmax result.
        ignore_index: label value excluded from both the numerator and the
            denominator (masked positions).

    Returns:
        Fraction of non-ignored positions predicted correctly, in [0, 1];
        0.0 when every position is ignored.
    """
    predicted_ids = torch.argmax(predictions, dim=-1)
    valid = labels != ignore_index
    # Divide by the number of scored tokens, not by the batch size, so the
    # result is a proper fraction.
    total = valid.sum().item()
    if total == 0:
        return 0.0
    correct = ((predicted_ids == labels) & valid).sum().item()
    return correct / total


In [None]:
# Training mode only needs to be enabled once, not on every batch.
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    total_batches = 0

    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass. The result is named `model_output` so the `outputs`
        # dict of gold labels defined earlier in the notebook is not overwritten.
        model_output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = model_output.loss
        logits = model_output.logits

        # Calculate accuracy
        accuracy = compute_accuracy(logits, labels)
        total_accuracy += accuracy

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    if total_batches == 0:
        # An empty loader would otherwise surface as a ZeroDivisionError below.
        raise ValueError("data_loader produced no batches; nothing to train on")

    # Calculate average loss and accuracy
    avg_loss = total_loss / total_batches
    avg_accuracy = total_accuracy / total_batches
    print(f"Epoch {epoch}, Loss: {avg_loss}, Accuracy: {avg_accuracy}")

# Save the model
model.save_pretrained("./flan_t5_finetuned")


In [None]:
# Example input
input_text = "Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink] article: Huge municipalities like Chicago and Phoenix are drowning in underfunded pensions. Can tax revenue from legalized marijuana save the day?"

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate predictions. The fine-tuning loop leaves the model in training
# mode, which keeps dropout active; switch to eval mode so generation is
# deterministic.
model.eval()
output = model.generate(input_ids)

# Decode the predictions
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("Predicted Output:", decoded_output)


In [None]:
# Example prompt in the retrieval-augmented format. It is built from adjacent
# string literals so every line break carries an explicit space; the earlier
# backslash continuations glued words together wherever a line ended without
# a trailing space (e.g. "parents,food", "wantto", "makea").
input_text = (
    "Which category does this article relate to among the following categories? "
    "Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, "
    "entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, "
    "food & drink], The article that needs to be categorised is: It's hard to find a restaurant that doesn't now place a little "
    "card at your table inquiring if the establishment was: (a) really awful; (b) tolerable; (c) sublime.. These are the top 3 related "
    "documents related to the user profile for this given article : "
    "The category for Though I might not subscribe to every prayer in the Siddur, I "
    "always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy. "
    "I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, "
    "I head home feeling a little purer. is religion The category for I expected him to do well and show me some of his work. "
    "The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want "
    "to maintain periodic contact -- at least it was what I hoped -- I was wrong. is religion The category for The three make "
    "a trip of atypical opera themes, but no new opera brought the Met as much controversy as Klinghoffer. is culture & arts "
)


# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate predictions. The fine-tuning loop leaves the model in training
# mode, which keeps dropout active; switch to eval mode so generation is
# deterministic.
model.eval()
output = model.generate(input_ids)

# Decode the predictions
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("Predicted Output:", decoded_output)

In [None]:
pip install python-Levenshtein

In [None]:
import Levenshtein as lev

def find_closest_category(model_output, categories):
    """Pick the category nearest to ``model_output`` under ``lev.distance``.

    Both strings are lower-cased before comparison. When several categories
    share the smallest distance the earliest one wins; ``None`` is returned
    if ``categories`` is empty.
    """
    return min(
        categories,
        key=lambda candidate: lev.distance(model_output.lower(), candidate.lower()),
        default=None,
    )

# Example: map a free-form model answer onto the fixed label set
model_output = "lifestyle"
predefined_categories = [
    "women",
    "religion",
    "politics",
    "style & beauty",
    "entertainment",
    "culture & arts",
    "sports",
    "science & technology",
    "travel",
    "business",
    "crime",
    "education",
    "healthy living",
    "parents",
    "food & drink",
]
closest_category = find_closest_category(model_output, predefined_categories)
print("Closest category:", closest_category)


In [None]:
# Fine-tuning FLAN-T5 with LoRA (low-rank adapters)

In [None]:
# NOTE(review): `LoraConfig` and `get_peft_model` are not imported anywhere in
# the visible notebook. Presumably `from peft import LoraConfig, get_peft_model`
# is required; confirm and add it to the imports before running this cell.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Prepare the dataset
texts = list(queries.values())
# FIXME: placeholder targets. Every example is trained towards the literal
# string "label" until real labels are supplied.
labels = ["label"] * len(texts)  # Replace with actual labels
max_length = 512
dataset = CustomDataset(tokenizer, texts, labels, max_length)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# LoRA Configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank matrix
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Targeted attention modules
    lora_dropout=0.1  # Dropout for regularization
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Define the optimizer. `AdamW` alone is not a defined name in this notebook;
# the torch implementation is used, as in the full fine-tuning cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 100
# Training mode only needs to be enabled once, not on every batch.
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    total_batches = 0

    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Distinct names keep the `labels` list above and the notebook-level
        # `outputs` dict of gold labels from being overwritten by the loop.
        batch_labels = batch['labels']

        # Forward pass
        model_output = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = model_output.loss
        logits = model_output.logits

        accuracy = compute_accuracy(logits, batch_labels)
        total_accuracy += accuracy

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    if total_batches == 0:
        # An empty loader would otherwise surface as a ZeroDivisionError below.
        raise ValueError("data_loader produced no batches; nothing to train on")

    # Calculate average loss and accuracy
    avg_loss = total_loss / total_batches
    avg_accuracy = total_accuracy / total_batches
    print(f"Epoch {epoch}, Loss: {avg_loss}, Accuracy: {avg_accuracy}")

model.save_pretrained("./flan_t5_finetunedLora")