In [None]:
!pip install numpy pandas transformers requests sentence-transformers
!pip install huggingface_hub

In [None]:
import os

from huggingface_hub import login

# SECURITY: never commit access tokens to source control. The token is read
# from the HF_TOKEN environment variable; when it is unset, login() falls back
# to prompting for it. Any token that was previously hard-coded in this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

In [None]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load Preprocessed Data and Embeddings
# The data directory can be overridden through RESEARCH_OAI_DATA_DIR so the
# notebook also runs outside the original author's machine; the default keeps
# the original location.
DATA_DIR = Path(
    os.environ.get(
        "RESEARCH_OAI_DATA_DIR",
        "/Users/shashinimashi/Desktop/Semester 3/Thesis/Analysis/Repo/Research_OAI-4",
    )
)
try:
    df_cleaned = pd.read_csv(DATA_DIR / "df_cleaned.csv")
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load embedding files that you produced yourself.
    with open(DATA_DIR / "response_embeddings.pkl", "rb") as f:
        response_embeddings = pickle.load(f)
except FileNotFoundError:
    print("Error: Missing required files. Ensure df_cleaned.csv and response_embeddings.pkl are available.")
    raise

# Initialize SentenceTransformer Model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Load the Local LLaMA Model
# SECURITY: the access token comes from the HF_TOKEN environment variable
# instead of being hard-coded. When it is unset, token=None makes transformers
# fall back to the credentials stored by huggingface_hub.login().
TOKEN = os.environ.get("HF_TOKEN")
MODEL_ID = "meta-llama/Llama-2-70b-hf"
# `token` replaces the deprecated `use_auth_token` keyword.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=TOKEN)

# Prefer Apple's Metal backend when available, otherwise run on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

def llama_generate_summary(input_text):
    """
    Use the locally loaded LLaMA model to generate text based on the input prompt.

    Args:
        input_text: The full prompt to condition the model on.

    Returns:
        The newly generated continuation only (at most 50 new tokens), decoded
        to a string with special tokens removed. The prompt itself is not
        part of the returned text.
    """
    # Tokenize the input text and move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate output from the model. The attention mask and pad token are
    # passed explicitly so generate() does not have to guess them.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # A causal LM returns prompt + continuation. Drop the prompt tokens so
    # callers receive (and score) only the text the model actually wrote;
    # otherwise the reference text embedded in the prompt is echoed back.
    generated_ids = outputs[0][prompt_length:]

    # Decode and return the generated text
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

def _embedding_to_row(embedding):
    """Convert one embedding (torch tensor or array-like) to a (1, dim) NumPy row."""
    if hasattr(embedding, "detach"):
        # Torch tensor: detach from the graph and make sure it lives on the CPU.
        embedding = embedding.detach().cpu().numpy()
    return np.asarray(embedding).reshape(1, -1)


# Function to calculate cosine similarity
def calculate_cosine_similarity(query_embeddings, response_embeddings):
    """Return each response's highest cosine similarity to any query embedding.

    Args:
        query_embeddings: 2-D array-like with one row per query embedding.
        response_embeddings: Mapping of response ID to a single embedding,
            given either as a torch tensor or as a NumPy array / array-like.
            Each embedding is flattened to one row.

    Returns:
        A dict mapping every response ID to the maximum cosine similarity
        between that response and the query rows. Empty when
        ``response_embeddings`` is empty.
    """
    if not response_embeddings:
        return {}

    # Stack all responses so the similarity is computed in a single call
    # instead of once per response.
    response_matrix = np.vstack(
        [_embedding_to_row(embedding) for embedding in response_embeddings.values()]
    )

    # cosine_similarity normalizes its inputs itself, so no separate
    # normalization step is needed. Result shape: (n_queries, n_responses).
    best_per_response = cosine_similarity(query_embeddings, response_matrix).max(axis=0)

    return dict(zip(response_embeddings.keys(), best_per_response))

def _build_mimic_prompt(reference_text, keywords):
    """Build the style-mimicking prompt for a reference text and keyword list."""
    return (
        f"Analyze the following text to understand its language, structure, and style:\n\n"
        f"{reference_text}\n\n"
        f"Now generate new text that mimics the above language, structure, and style, "
        f"while introducing new content related to the keywords: {', '.join(keywords)}."
    )


# Main Function to Generate Mimicking Text
def generate_mimicking_text(keyword1, keyword2):
    """Generate and print text that mimics the responses closest to the keywords.

    The keywords are embedded and compared with every stored response
    embedding. Similarity thresholds from 0.0 to 1.0 are then tried: for each
    one the responses at or above the threshold form a reference text, the
    LLaMA model generates a candidate, and the candidate is scored against
    the reference with ROUGE-1 F-measure. The reference set with the best
    score is used for a final generation, which is printed.

    Args:
        keyword1: First keyword; may be empty, whitespace-only or None.
        keyword2: Second keyword; may be empty, whitespace-only or None.

    Returns:
        None. Results and error messages are printed.
    """
    # Clean before validating so that None and whitespace-only input are
    # rejected up front instead of failing later.
    user_keywords = [kw.strip() for kw in (keyword1, keyword2) if kw and kw.strip()]
    if not user_keywords:
        print("Please enter at least one keyword.")
        return

    try:
        # Imported lazily so the module loads without rouge-score installed;
        # the scorer is built once instead of once per threshold.
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

        # Generate Embeddings for User Keywords
        keyword_embeddings = embedding_model.encode(user_keywords, convert_to_tensor=True)
        keyword_embeddings_cpu = normalize(keyword_embeddings.detach().cpu().numpy(), axis=1)

        # Calculate Cosine Similarity between User Keywords and Responses
        response_similarities = calculate_cosine_similarity(
            keyword_embeddings_cpu, response_embeddings
        )

        # One-off ID -> text lookup instead of scanning the DataFrame for
        # every response at every threshold. The first row wins for
        # duplicated IDs, as with the former `.values[0]`.
        unique_rows = df_cleaned.drop_duplicates(subset='ResponseID')
        responses_by_id = dict(zip(unique_rows['ResponseID'], unique_rows['Responses']))

        # Dynamic Grid Search to Find Best Threshold
        # linspace gives exactly 101 evenly spaced thresholds; arange with a
        # float step has an unreliable endpoint.
        best_rouge1_fmeasure = -float("inf")
        best_ids = None
        previous_ids = None

        for threshold in np.linspace(0.0, 1.0, 101):
            # Filter Responses Based on Threshold
            selected_ids = [
                response_id
                for response_id, sim in response_similarities.items()
                if sim >= threshold
            ]

            if not selected_ids:
                # Thresholds ascend, so every later one selects nothing too.
                break

            if selected_ids == previous_ids:
                # Same reference set as the previous threshold: skip the
                # expensive generation, it would only re-sample the same prompt.
                continue
            previous_ids = selected_ids

            # Combine Filtered Responses into a Single String
            filtered_reference = " ".join(responses_by_id[resp_id] for resp_id in selected_ids)

            # Generate Mimicking Text with Local LLaMA Model
            mimicked_text = llama_generate_summary(
                _build_mimic_prompt(filtered_reference, user_keywords)
            )

            # Evaluate Using ROUGE-1 F-Measure
            rouge1_fmeasure = scorer.score(filtered_reference, mimicked_text)['rouge1'].fmeasure

            # Keep the first reference set that reaches the best score
            if rouge1_fmeasure > best_rouge1_fmeasure:
                best_rouge1_fmeasure = rouge1_fmeasure
                best_ids = selected_ids

        # Generate Final Mimicking Text Using Best Threshold
        if best_ids:
            final_reference = " ".join(responses_by_id[resp_id] for resp_id in best_ids)
            # Generation is sampled, so this is a fresh sample for the best
            # reference set, not the exact candidate that was scored.
            final_mimicked_text = llama_generate_summary(
                _build_mimic_prompt(final_reference, user_keywords)
            )
            print("Mimicking Text:")
            print(final_mimicked_text)
        else:
            print("No relevant responses found.")

    except Exception as e:
        # Interactive boundary: report the problem instead of crashing.
        print(f"An error occurred: {str(e)}")

# Interactive entry point: ask for up to two keywords and run the pipeline
if __name__ == "__main__":
    print("Open gvt")

    # Prompts are asked in order; surrounding whitespace is removed from
    # each answer before it is passed on.
    keyword_prompts = (
        "Enter the first keyword (or leave blank): ",
        "Enter the second keyword (or leave blank): ",
    )
    first_keyword, second_keyword = [input(text).strip() for text in keyword_prompts]

    generate_mimicking_text(first_keyword, second_keyword)

