In [None]:
!pip install faiss-cpu langchain langchain-community hf_xet langchain-huggingface langchain_google_genai

In [None]:
import os
import sys
import shutil
# Detect if running in Google Colab

# Set the environment variable for your GitHub token
#os.environ["GITHUB_TOKEN"] =

# This cell is for loading data. If you prefer to do this manually, you will need to set base_dir and data_dir separately

# True when this notebook is executing inside Google Colab.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: point this at your local project folder.
    base_dir = "path/to/your/local/project/folder"

    # Optional: clone the repository locally.
    #!git clone https://{token}@github.com/UnbrokenCocoon/OCR-evaluation.git "{base_dir}"
else:
    # Colab run: the project lives on the mounted Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')

    # Base directory on Google Drive (no extra folder will be added).
    base_dir = "/content/drive/MyDrive/Bertopic"
    token = os.getenv("GITHUB_TOKEN")

    # Optional: wipe the folder and clone a fresh copy of the repository.
    #if os.path.exists(base_dir):
    #    shutil.rmtree(base_dir)
    #!git clone https://{token}@github.com/UnbrokenCocoon/OCR-evaluation.git "{base_dir}"

# 'Data' holds the inputs shipped with the repository; 'output' is created on demand.
data_dir, output_dir = (os.path.join(base_dir, sub) for sub in ("Data", "output"))
os.makedirs(output_dir, exist_ok=True)

print(f"Data folder is located at: {data_dir}")


In [None]:
import pickle
# Load the pre-computed Boot & Shoe embeddings and their matching sentences.
# NOTE: pickle.load executes arbitrary code; only load files you created or trust.
with open(os.path.join(data_dir, 'bs_emb.pkl'), 'rb') as f:
    bs_emb = pickle.load(f)
with open(os.path.join(data_dir, 'bs_sen.pkl'), 'rb') as f:
    bs_sen = pickle.load(f)

In [None]:
# Load the pre-computed Unite Community sentences and their matching embeddings.
# NOTE: pickle.load executes arbitrary code; only load files you created or trust.
with open(os.path.join(data_dir, 'uc_sentences.pkl'), 'rb') as f:
    uc_sen = pickle.load(f)
with open(os.path.join(data_dir, 'uc_embedding.pkl'), 'rb') as f:
    uc_emb = pickle.load(f)

In [None]:
# The sentences are wrapped as docs for reliable processing
# A more simplistic approach could be used, but this was found to cause errors sometimes
import faiss
import os
import numpy as np
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

# Directory where the LangChain FAISS stores are persisted.
faiss_dir = os.path.join(data_dir, "faiss")
os.makedirs(faiss_dir, exist_ok=True)

# Ensure float32 for FAISS
embeddings = np.array(bs_emb).astype("float32")

# Fail fast if vectors and sentences are misaligned: FAISS row i is mapped to
# sentence i below, so a length mismatch would otherwise only surface at query time.
if embeddings.ndim != 2:
    raise ValueError(f"bs_emb must be a 2-D array of vectors, got shape {embeddings.shape}")
if embeddings.shape[0] != len(bs_sen):
    raise ValueError(
        f"bs_emb has {embeddings.shape[0]} vectors but bs_sen has {len(bs_sen)} sentences"
    )

# Step 1: Build an exact (brute-force) L2 index over the pre-computed vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Step 2: Wrap sentences as Documents; docstore id i corresponds to FAISS row i
docs = [Document(page_content=txt) for txt in bs_sen]
docstore = InMemoryDocstore(dict(enumerate(docs)))
index_to_docstore_id = {i: i for i in range(len(docs))}

# Step 3: Embedding model used to embed queries at retrieval time
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Step 4: Create LangChain FAISS object
faiss_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Step 5: Save
faiss_store.save_local(os.path.join(faiss_dir, "index_a"))
print("✅ Successfully built and saved FAISS index_a.")

In [None]:
import faiss
import numpy as np
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

# Re-derive the output directory so this cell does not depend on the index_a cell having run.
faiss_dir = os.path.join(data_dir, "faiss")
os.makedirs(faiss_dir, exist_ok=True)

# Ensure float32 for FAISS
embeddings = np.array(uc_emb).astype("float32")

# Fail fast if vectors and sentences are misaligned: FAISS row i is mapped to
# sentence i below, so a length mismatch would otherwise only surface at query time.
if embeddings.ndim != 2:
    raise ValueError(f"uc_emb must be a 2-D array of vectors, got shape {embeddings.shape}")
if embeddings.shape[0] != len(uc_sen):
    raise ValueError(
        f"uc_emb has {embeddings.shape[0]} vectors but uc_sen has {len(uc_sen)} sentences"
    )

# Step 1: Build an exact (brute-force) L2 index over the pre-computed vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Step 2: Wrap sentences as Documents; docstore id i corresponds to FAISS row i
docs = [Document(page_content=txt) for txt in uc_sen]
docstore = InMemoryDocstore(dict(enumerate(docs)))
index_to_docstore_id = {i: i for i in range(len(docs))}

# Step 3: Embedding model used to embed queries at retrieval time
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Step 4: Create LangChain FAISS object
faiss_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Step 5: Save
faiss_store.save_local(os.path.join(faiss_dir, "index_b"))
print("✅ Successfully built and saved FAISS index_b.")

In [None]:
import os
from langchain.vectorstores import FAISS
# Use the same (non-deprecated) embeddings class that was used when the indexes were built.
from langchain_huggingface import HuggingFaceEmbeddings

faiss_dir = os.path.join(data_dir, "faiss")

# Load embedding model (embeds queries at search time; must match the model used for the stored vectors)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Load both FAISS indexes.
# NOTE: allow_dangerous_deserialization=True unpickles the stored docstore; this is only
# safe because these files were written by this notebook. Never enable it for untrusted files.
faiss_a = FAISS.load_local(os.path.join(faiss_dir, "index_a"), embeddings=embedding_model, allow_dangerous_deserialization=True)
faiss_b = FAISS.load_local(os.path.join(faiss_dir, "index_b"), embeddings=embedding_model, allow_dangerous_deserialization=True)

# Default retrievers; later cells rebuild them with a different k.
retriever_a = faiss_a.as_retriever(search_kwargs={"k": 20})
retriever_b = faiss_b.as_retriever(search_kwargs={"k": 20})

# Sample query



In [None]:
# Test the retrieval
query = "the film the big"
top_k = 50  # number of neighbours to fetch; the printed headings report this value
retriever_a = faiss_a.as_retriever(search_kwargs={"k": top_k})
retriever_b = faiss_b.as_retriever(search_kwargs={"k": top_k})

# Retrieve and print results (invoke() replaces the deprecated get_relevant_documents())
print(f"\n🔴 Top {top_k} from index_a (Boot & Shoe):")
for i, doc in enumerate(retriever_a.invoke(query), start=1):
    print(f"{i}. {doc.page_content}")

print(f"\n🔵 Top {top_k} from index_b (Unite Community):")
for i, doc in enumerate(retriever_b.invoke(query), start=1):
    print(f"{i}. {doc.page_content}")

In [None]:
# Never hard-code the key in the notebook: take it from the GOOGLE_API_KEY environment
# variable when it is set, otherwise prompt for it without echoing the input.
import os
from getpass import getpass

api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")

# Test it works: Gemini is reached through its OpenAI-compatible endpoint
from openai import OpenAI

client = OpenAI(
    api_key=api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Explain to me how AI works"
        }
    ]
)

print(response.choices[0].message)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
import pandas as pd
import os

# Expose the Gemini key to langchain_google_genai (`api_key` must already be defined).
os.environ["GOOGLE_API_KEY"] = api_key

# Accumulates one record per generated turn.
dialogue_history = []

# Embedding model instance (same sentence-transformers checkpoint as the stored vectors).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# One retriever per corpus, each returning the 10 nearest sentences.
retriever_a = faiss_a.as_retriever(search_kwargs={"k": 10})
retriever_b = faiss_b.as_retriever(search_kwargs={"k": 10})

# Both personas share the same behavioural rules; only the opening line differs.
_shared_instructions = (
    "Use the retrieved sentences as your knowledge base.\n"
    "Speak persuasively as if you are arguing with a fellow trade unionist.\n"
    "Do not use your own knowledge.\n"
    "Vary your sentence structures, do not repeat phrases.\n"
    "Respond with 2 sentences maximum."
)

system_prompt_a = (
    "You are a 1920s trade union representative from the National Boot and Shoe Union.\n"
    + _shared_instructions
)

system_prompt_b = (
    "You are a 2020s trade union representative from Unite Community.\n"
    + _shared_instructions
)

# 📃 Store dialogue

# 🔄 Dialogue generator
def generate_turn(query, retriever, system_prompt, speaker):
    """Generate one dialogue turn grounded in retrieved sentences.

    Retrieves the sentences closest to ``query``, asks the module-level ``llm``
    to answer in the persona described by ``system_prompt``, prints the reply,
    and appends a record to the module-level ``dialogue_history`` list.

    Args:
        query: The message being replied to (the previous speaker's text).
        retriever: Retriever whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        system_prompt: Persona/behaviour instructions sent as the system message.
        speaker: Label printed with the reply and stored in the history record.

    Returns:
        The reply text, stripped and flattened onto a single line.
    """
    # invoke() replaces the deprecated get_relevant_documents().
    docs = retriever.invoke(query)
    content = "\n".join(doc.page_content for doc in docs)

    # Build the messages directly instead of going through ChatPromptTemplate:
    # the retrieved text and the query are arbitrary strings, and any "{" or "}"
    # in them would be parsed as a template variable and make formatting fail.
    human_text = (
        f"You just heard the following message:\n\n\"{query}\"\n\n"
        f"Here are {len(docs)} excerpts from your documents that may help you reply:\n{content}\n\n"
        "Respond to the message above based ONLY on this information, "
        "and speak as if you were in a real conversation."
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=human_text),
    ]

    # `llm` is a module-level name assigned by the dialogue loops before each call.
    response = llm.invoke(messages)
    reply_text = response.content.strip().replace("\n", " ")
    print(f"\n{speaker}:\n{reply_text}\n{'-'*50}")
    dialogue_history.append({
        "speaker": speaker,
        "query": query,
        "response": reply_text,
        "context": content
    })
    return reply_text


In [None]:
# Optional: resume from a previous session. This cell sits after the cell that
# initialises `dialogue_history` so that the saved turns replace the empty list.
# On a first run the file does not exist yet, so the empty list is kept.
# NOTE: pickle.load executes arbitrary code; only load files you created or trust.
history_path = os.path.join(data_dir, 'dialogue_history.pkl')
if os.path.exists(history_path):
    with open(history_path, 'rb') as f:
        dialogue_history = pickle.load(f)
else:
    print(f"No saved dialogue history at {history_path}; starting from an empty history.")

In [None]:
# Every conversation adds 5 turns, so the count should be a multiple of 5.
# If an API call failed part-way through, trim the history back to a multiple of 5.
turn_count = len(dialogue_history)
print(turn_count)


In [None]:
models = ["gemini-2.0-flash-lite", "gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.0-flash", "gemini-2.0-flash-lite"]
# 🗣️ Start the dialogue with high call freq models
import random
import time

history_path = os.path.join(data_dir, 'dialogue_history.pkl')

# Index into bs_sen of the sentence that seeds the next conversation.
# It advances by one per generated turn; raise the start value to resume a previous run.
n = 0

try:
    for j in range(40):
        # Stop cleanly instead of raising IndexError once the seed sentences run out.
        if n >= len(bs_sen):
            print(f"Ran out of seed sentences after {j} conversations.")
            break
        time.sleep(20)  # stay under the API rate limit
        query = bs_sen[n]
        for i in range(5):
            # generate_turn() reads this module-level `llm`.
            llm = ChatGoogleGenerativeAI(model=models[i], temperature=0.7)
            n += 1
            # The two unions alternate, with Boot & Shoe speaking first.
            if i % 2 == 0:
                speaker = "🔴 Boot & Shoe Union (1920s)"
                query = generate_turn(query, retriever_a, system_prompt_a, speaker)
            else:
                speaker = "🔵 Unite Community (2020s)"
                query = generate_turn(query, retriever_b, system_prompt_b, speaker)
finally:
    # Persist whatever was generated even if an API call fails part-way through.
    with open(history_path, 'wb') as f:
        pickle.dump(dialogue_history, f)

In [None]:
# 🗣️ Start the dialogue with low call models
import random
import time
list_of_models = ["gemini-1.5-flash", "gemini-1.5-flash-8b", "gemini-2.0-flash-lite", "gemini-2.0-flash", "gemini-1.5-flash-8b"]

history_path = os.path.join(data_dir, 'dialogue_history.pkl')

try:
    for j in range(40):
        time.sleep(20)  # stay under the API rate limit
        # Seed each conversation with a randomly chosen Boot & Shoe sentence.
        query = random.choice(bs_sen)
        for i in range(5):
            # generate_turn() reads this module-level `llm`.
            llm = ChatGoogleGenerativeAI(model=list_of_models[i], temperature=0.7)
            # The two unions alternate, with Boot & Shoe speaking first.
            if i % 2 == 0:
                speaker = "🔴 Boot & Shoe Union (1920s)"
                query = generate_turn(query, retriever_a, system_prompt_a, speaker)
            else:
                speaker = "🔵 Unite Community (2020s)"
                query = generate_turn(query, retriever_b, system_prompt_b, speaker)
finally:
    # Persist whatever was generated even if an API call fails part-way through.
    with open(history_path, 'wb') as f:
        pickle.dump(dialogue_history, f)

In [None]:
# Build the table from the accumulated turns (one row per turn) before previewing it;
# to_markdown() requires the `tabulate` package.
dialogue_df = pd.DataFrame(dialogue_history)
print(dialogue_df.head(5).to_markdown())