In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the transcripts CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook also runs on other machines.
file_path = '/Users/shashinimashi/Desktop/Semester 3/Thesis/Analysis/Repo/Research_OAI/Research_OAI/transcripts.csv'
df = pd.read_csv(file_path)

# Fail fast if the column we are about to embed is absent.
if 'Transcription' not in df.columns:
    raise ValueError("The 'Transcription' column is missing in the dataset.")

# Load the pretrained embedding model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Encode every transcription in ONE batched call instead of one encode() call
# per row. Missing values are replaced by empty strings so encode() never
# receives NaN.
transcription_texts = df['Transcription'].fillna('').astype(str).tolist()
transcription_vectors = embedding_model.encode(transcription_texts, convert_to_numpy=True)

# Store each embedding as a plain list of floats. to_csv() then writes the
# full list literal (readable again with ast.literal_eval), whereas a torch
# tensor would be written as its rounded "tensor([...])" display string.
df['Transcription_Embeddings'] = pd.Series(transcription_vectors.tolist(), index=df.index)

# Save the updated DataFrame to a CSV file
output_path = 'transcription_embeddings.csv'
df.to_csv(output_path, index=False)

print(f"Embeddings file saved to {output_path}")


# Optional: reload previously pickled response embeddings instead of
# recomputing them.
# The whole snippet is kept commented out. Previously the `with` headers were
# commented while two of their bodies were not, which made the cell fail with
# "IndentationError: unexpected indent".
# To use it, uncomment all three blocks and add `import pickle`. Only unpickle
# files you created yourself: pickle.load can execute arbitrary code.

# Load train response embeddings
# with open('train_response_embeddings.pkl', 'rb') as f:
#     loaded_train_response_embeddings = pickle.load(f)

# Load validation response embeddings
# with open('val_response_embeddings.pkl', 'rb') as f:
#     loaded_val_response_embeddings = pickle.load(f)

# Load test response embeddings
# with open('test_response_embeddings.pkl', 'rb') as f:
#     loaded_test_response_embeddings = pickle.load(f)


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of df_cleaned, then cut that hold-out in half,
# which yields 80% train / 10% validation / 10% test. The fixed random_state
# keeps the split reproducible.
train_df, temp_df = train_test_split(df_cleaned, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report the size of every split.
for split_label, split_frame in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} Set: {len(split_frame)}")


In [None]:
# Show the column index of every split.
for split_name, split_frame in (("train_df", train_df), ("val_df", val_df), ("test_df", test_df)):
    print(f"{split_name} columns: {split_frame.columns}")

In [None]:
# Peek at the leading 'Responses' entries of each split, in train/val/test order.
for split_frame in (train_df, val_df, test_df):
    print(split_frame['Responses'].head())

In [None]:
# Count the entries that are not the empty string in each response collection.
valid_train_count = sum(1 for response in train_responses if response != '')
print(f"Valid Train Responses: {valid_train_count}")

valid_val_count = sum(1 for response in val_responses if response != '')
print(f"Valid Validation Responses: {valid_val_count}")

valid_test_count = sum(1 for response in test_responses if response != '')
print(f"Valid Test Responses: {valid_test_count}")

In [None]:
# Show the first few 'Responses' values of each split.
for split_frame in (train_df, val_df, test_df):
    print(split_frame['Responses'].head())


def _is_empty_response(value):
    """Return True when ``value`` is not a list, or is a list with no items."""
    return not isinstance(value, list) or len(value) == 0


# Boolean mask per split: True marks a row whose 'Responses' cell counts as
# empty (non-list values such as None are counted as empty too).
empty_train_responses = train_df['Responses'].apply(_is_empty_response)
empty_val_responses = val_df['Responses'].apply(_is_empty_response)
empty_test_responses = test_df['Responses'].apply(_is_empty_response)

# Summing a boolean mask gives the number of True rows.
print(f"Empty Train Responses: {empty_train_responses.sum()}")
print(f"Empty Validation Responses: {empty_val_responses.sum()}")
print(f"Empty Test Responses: {empty_test_responses.sum()}")

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Sentence encoder shared by the helpers in this cell
# (pretrained checkpoint: all-mpnet-base-v2).
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

# Thin wrapper around the shared sentence encoder.
def get_sentence_embeddings(text_list):
    """Encode ``text_list`` with the module-level ``model``.

    Args:
        text_list: The texts handed to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=True``.
    """
    encoded = model.encode(text_list, convert_to_tensor=True)
    return encoded

# Flatten per-response sentence lists into one flat list of sentences.
def flatten_responses(responses):
    """Concatenate the sentences of all ``responses`` into a single list.

    Args:
        responses: Iterable whose items are iterables of sentences. For
            robustness an item may also be a bare string (kept as ONE
            sentence instead of being split into characters) or a missing
            value (``None`` / NaN, which is skipped).

    Returns:
        list: All sentences, in the order they were encountered.
    """
    flat = []
    for response in responses:
        # Missing cells (None, or float NaN as pandas stores them) carry no sentences.
        if response is None or (isinstance(response, float) and response != response):
            continue
        if isinstance(response, str):
            # Iterating a str would yield single characters, never sentences.
            flat.append(response)
        else:
            flat.extend(response)
    return flat

# Flatten the 'Responses' column of each split into one list of sentences.
train_flat_responses, val_flat_responses, test_flat_responses = (
    flatten_responses(split_frame['Responses'].tolist())
    for split_frame in (train_df, val_df, test_df)
)

# Encode the flattened sentences of each split (train, then validation, then test).
train_embeddings, val_embeddings, test_embeddings = (
    get_sentence_embeddings(flat_sentences)
    for flat_sentences in (train_flat_responses, val_flat_responses, test_flat_responses)
)

# Group sentence embeddings by ResponseID and mean-pool them into one vector
# per response.
def group_by_response_id(df, embeddings):
    """Return one mean-pooled sentence embedding per ``ResponseID``.

    Args:
        df: DataFrame with a 'ResponseID' column and a 'Responses' column
            holding the sentences of each row.
        embeddings: Tensor of sentence embeddings for
            ``flatten_responses(df['Responses'].tolist())``, in that order
            (this is what the calls in this notebook pass). It is reused when
            it is a tensor whose first dimension equals the number of
            flattened sentences; otherwise the sentences are re-encoded with
            the module-level ``model``.

    Returns:
        dict: ResponseID -> mean over dim 0 of that ID's sentence embeddings.
        IDs without any sentence are left out, because the mean over an empty
        batch is not a usable embedding.
    """
    # Sentences of every row, kept in row order so they line up with `embeddings`.
    row_sentences = [flatten_responses([cell]) for cell in df['Responses'].tolist()]

    # Start position of each row's sentences inside the flattened sequence.
    row_offsets = []
    sentence_total = 0
    for sentences in row_sentences:
        row_offsets.append(sentence_total)
        sentence_total += len(sentences)

    # Only trust the precomputed embeddings when they cover exactly these
    # sentences; this avoids encoding every sentence a second time.
    reuse_embeddings = (
        torch.is_tensor(embeddings)
        and embeddings.dim() > 0
        and embeddings.shape[0] == sentence_total
    )

    response_ids = df['ResponseID']
    response_embeddings = {}

    # Iterate through unique ResponseIDs
    for response_id in response_ids.unique():
        matches = (response_ids == response_id).tolist()
        row_positions = [position for position, hit in enumerate(matches) if hit]

        if reuse_embeddings:
            sentence_positions = [
                row_offsets[position] + k
                for position in row_positions
                for k in range(len(row_sentences[position]))
            ]
            if not sentence_positions:
                continue  # nothing to pool for this ID
            sentences_embeddings = embeddings[sentence_positions]
        else:
            sentences_flat = [
                sentence for position in row_positions for sentence in row_sentences[position]
            ]
            if not sentences_flat:
                continue  # nothing to pool for this ID
            sentences_embeddings = model.encode(sentences_flat, convert_to_tensor=True)

        # Mean pooling over the sentence axis.
        response_embeddings[response_id] = sentences_embeddings.mean(dim=0)

    return response_embeddings

# Build the ResponseID -> pooled-embedding mapping for each split
# (train, then validation, then test).
train_response_embeddings, val_response_embeddings, test_response_embeddings = (
    group_by_response_id(split_frame, split_embeddings)
    for split_frame, split_embeddings in (
        (train_df, train_embeddings),
        (val_df, val_embeddings),
        (test_df, test_embeddings),
    )
)


In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Minimum best-keyword cosine similarity a response needs in order to be kept.
THRESHOLD = 0.2
TOP_N = 5  # Fallback: Select top N responses if no responses pass the threshold

# NOTE(review): `keyword_embeddings` is not defined in the visible notebook and
# the recorded run of this cell failed with NameError. It must be created in an
# earlier cell; the code below calls .cpu().numpy() on it and treats the result
# as one row per keyword.

# Normalize keyword embeddings
keyword_embeddings_cpu = normalize(keyword_embeddings.cpu().numpy(), axis=1)

# Best keyword similarity per response, computed with ONE cosine_similarity
# call on the stacked response matrix instead of one call per response.
# cosine_similarity normalises its inputs, so no per-response normalize() is needed.
response_similarities = {}  # Store similarities for all responses
response_ids = list(train_response_embeddings.keys())
if response_ids:
    response_matrix = np.vstack([
        train_response_embeddings[response_id].cpu().numpy().reshape(1, -1)
        for response_id in response_ids
    ])
    # Rows are keywords, columns are responses.
    keyword_response_similarities = cosine_similarity(keyword_embeddings_cpu, response_matrix)
    # For every response keep the similarity to its closest keyword.
    best_similarities = keyword_response_similarities.max(axis=0)
    response_similarities = dict(zip(response_ids, best_similarities))

# Keep only the responses whose similarity reaches THRESHOLD.
filtered_responses = {}
for response_id, sim in response_similarities.items():
    if sim >= THRESHOLD:
        filtered_responses[response_id] = sim

# Fallback: nothing passed the threshold, so take the TOP_N most similar responses.
if len(filtered_responses) == 0:
    print("\nNo responses passed the threshold. Using fallback mechanism.")
    ranked_responses = sorted(
        response_similarities.items(), key=lambda item: item[1], reverse=True
    )
    filtered_responses = dict(ranked_responses[:TOP_N])

# Display filtered responses
print("\nFiltered Responses:")
print(filtered_responses)


  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


NameError: name 'keyword_embeddings' is not defined

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Sentence encoder shared by the helpers in this cell
# (pretrained checkpoint: all-mpnet-base-v2).
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

# Thin wrapper around the shared sentence encoder.
def get_sentence_embeddings(text_list):
    """Encode ``text_list`` with the module-level ``model``.

    Args:
        text_list: The texts handed to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=True``.
    """
    encoded = model.encode(text_list, convert_to_tensor=True)
    return encoded

# Flatten per-response sentence lists into one flat list of sentences.
def flatten_responses(responses):
    """Concatenate the sentences of all ``responses`` into a single list.

    Args:
        responses: Iterable whose items are iterables of sentences. For
            robustness an item may also be a bare string (kept as ONE
            sentence instead of being split into characters) or a missing
            value (``None`` / NaN, which is skipped).

    Returns:
        list: All sentences, in the order they were encountered.
    """
    flat = []
    for response in responses:
        # Missing cells (None, or float NaN as pandas stores them) carry no sentences.
        if response is None or (isinstance(response, float) and response != response):
            continue
        if isinstance(response, str):
            # Iterating a str would yield single characters, never sentences.
            flat.append(response)
        else:
            flat.extend(response)
    return flat

# Flatten the 'Responses' column of each split into one list of sentences.
train_flat_responses, val_flat_responses, test_flat_responses = (
    flatten_responses(split_frame['Responses'].tolist())
    for split_frame in (train_df, val_df, test_df)
)

# Encode the flattened sentences of each split (train, then validation, then test).
train_embeddings, val_embeddings, test_embeddings = (
    get_sentence_embeddings(flat_sentences)
    for flat_sentences in (train_flat_responses, val_flat_responses, test_flat_responses)
)

# Group sentence embeddings by ResponseID and mean-pool them into one vector
# per response.
def group_by_response_id(df, embeddings):
    """Return one mean-pooled sentence embedding per ``ResponseID``.

    Args:
        df: DataFrame with a 'ResponseID' column and a 'Responses' column
            holding the sentences of each row.
        embeddings: Tensor of sentence embeddings for
            ``flatten_responses(df['Responses'].tolist())``, in that order
            (this is what the calls in this notebook pass). It is reused when
            it is a tensor whose first dimension equals the number of
            flattened sentences; otherwise the sentences are re-encoded with
            the module-level ``model``.

    Returns:
        dict: ResponseID -> mean over dim 0 of that ID's sentence embeddings.
        IDs without any sentence are left out, because the mean over an empty
        batch is not a usable embedding.
    """
    # Sentences of every row, kept in row order so they line up with `embeddings`.
    row_sentences = [flatten_responses([cell]) for cell in df['Responses'].tolist()]

    # Start position of each row's sentences inside the flattened sequence.
    row_offsets = []
    sentence_total = 0
    for sentences in row_sentences:
        row_offsets.append(sentence_total)
        sentence_total += len(sentences)

    # Only trust the precomputed embeddings when they cover exactly these
    # sentences; this avoids encoding every sentence a second time.
    reuse_embeddings = (
        torch.is_tensor(embeddings)
        and embeddings.dim() > 0
        and embeddings.shape[0] == sentence_total
    )

    response_ids = df['ResponseID']
    response_embeddings = {}

    # Iterate through unique ResponseIDs
    for response_id in response_ids.unique():
        matches = (response_ids == response_id).tolist()
        row_positions = [position for position, hit in enumerate(matches) if hit]

        if reuse_embeddings:
            sentence_positions = [
                row_offsets[position] + k
                for position in row_positions
                for k in range(len(row_sentences[position]))
            ]
            if not sentence_positions:
                continue  # nothing to pool for this ID
            sentences_embeddings = embeddings[sentence_positions]
        else:
            sentences_flat = [
                sentence for position in row_positions for sentence in row_sentences[position]
            ]
            if not sentences_flat:
                continue  # nothing to pool for this ID
            sentences_embeddings = model.encode(sentences_flat, convert_to_tensor=True)

        # Mean pooling over the sentence axis.
        response_embeddings[response_id] = sentences_embeddings.mean(dim=0)

    return response_embeddings

# Build the ResponseID -> pooled-embedding mapping for each split
# (train, then validation, then test).
train_response_embeddings, val_response_embeddings, test_response_embeddings = (
    group_by_response_id(split_frame, split_embeddings)
    for split_frame, split_embeddings in (
        (train_df, train_embeddings),
        (val_df, val_embeddings),
        (test_df, test_embeddings),
    )
)


In [None]:
# Fetch the filtered responses' text.
# One ResponseID -> first Transcription lookup replaces a full boolean-mask
# scan per ID and cannot raise IndexError the way `.values[0]` does when an ID
# has no matching row.
first_transcriptions = (
    train_df.drop_duplicates(subset='ResponseID')
    .set_index('ResponseID')['Transcription']
)

filtered_texts = []
for resp_id in filtered_responses.keys():
    if resp_id not in first_transcriptions.index:
        continue  # no row for this ID
    transcription = first_transcriptions.loc[resp_id]
    # Skip missing (NaN) transcriptions: " ".join() only accepts strings.
    if isinstance(transcription, str):
        filtered_texts.append(transcription)

# If no responses are found (extremely unlikely with the fallback), handle gracefully
if not filtered_texts:
    filtered_texts = ["No relevant responses found."]

# Build the T5 prompt: the "summarize: " task prefix followed by the
# space-joined texts, then tokenize it (with truncation) into PyTorch tensors.
input_text = f"summarize: {' '.join(filtered_texts)}"
input_ids = t5_tokenizer.encode(input_text, truncation=True, return_tensors="pt")

# Beam-search generation settings for the summary.
generation_options = dict(max_length=150, num_beams=5, early_stopping=True)
summary_ids = t5_model.generate(input_ids, **generation_options)

# Decode the first generated sequence, dropping special tokens.
summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Display the generated summary
print("\nGenerated Summary:", summary, sep="\n")

In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Initialize ROUGE scorer.
# It is bound to its own name: assigning it to `rouge_scorer` would shadow the
# imported module, so running this cell a second time raised AttributeError.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# `generated_summaries` is the list of summaries produced by T5
# `responses` is the list of original responses (ground truth)
# NOTE(review): both lists are hard-coded placeholders; replace them with the
# real summaries and references before reporting any scores.
generated_summaries = ["Generated summary 1", "Generated summary 2", "Generated summary 3"]
responses = ["Ground truth response 1", "Ground truth response 2", "Ground truth response 3"]

# zip() would silently drop the surplus items of the longer list, so make sure
# every summary has exactly one reference.
if len(generated_summaries) != len(responses):
    raise ValueError("generated_summaries and responses must have the same length.")

# Initialize scores
rouge_1_scores = []
rouge_l_scores = []
bleu_scores = []

# Smoothing keeps BLEU from collapsing to zero when some n-gram order has no
# overlap; built once instead of on every iteration.
bleu_smoothing = SmoothingFunction().method1

# Evaluate each pair of generated summary and ground truth
for gen_summary, response in zip(generated_summaries, responses):
    # Compute ROUGE scores (first argument is the reference, second the prediction)
    rouge_scores = scorer.score(response, gen_summary)
    rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
    rouge_l_scores.append(rouge_scores['rougeL'].fmeasure)

    # Compute BLEU score on whitespace tokens
    bleu_score = sentence_bleu(
        [response.split()],  # Ground truth tokenized
        gen_summary.split(),  # Generated summary tokenized
        smoothing_function=bleu_smoothing
    )
    bleu_scores.append(bleu_score)

def _mean_score(scores):
    """Return the arithmetic mean of ``scores`` (sum divided by length)."""
    return sum(scores) / len(scores)


# Average every metric over all evaluated pairs.
avg_rouge_1 = _mean_score(rouge_1_scores)
avg_rouge_l = _mean_score(rouge_l_scores)
avg_bleu = _mean_score(bleu_scores)

# Report the averages with four decimal places.
for metric_label, metric_value in (
    ("ROUGE-1", avg_rouge_1),
    ("ROUGE-L", avg_rouge_l),
    ("BLEU", avg_bleu),
):
    print(f"Average {metric_label}: {metric_value:.4f}")
