In [1]:
! pip install pandas transformers faiss-cpu
! pip install -U langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0
Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import GooglePalm  # Or any other LLM you prefer
from langchain.docstore.document import Document

# Load CSV file
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cleanerData.csv')

# Show the column names exactly as they appear in the file (before renaming)
print(df.columns)

# Normalise the column names used by the rest of the notebook. One
# non-inplace rename keeps this cell idempotent when it is re-run.
df = df.rename(columns={'Section': 'Title', 'Paragraph': 'Content'})

# Actually verify the required columns exist, with a readable message
# instead of a bare KeyError on the first column access below.
missing = {'Title', 'Content'} - set(df.columns)
if missing:
    raise KeyError(f"CSV is missing required column(s): {sorted(missing)}")

# A missing cell would turn `Title + " " + Content` into NaN, and a NaN entry
# in `texts` cannot be tokenized. Replace missing values with empty strings
# and coerce to str. Rows are kept (not dropped) so row positions are unchanged.
df['Title'] = df['Title'].fillna('').astype(str)
df['Content'] = df['Content'].fillna('').astype(str)

# Combine title and content for better context when retrieving
df['combined'] = df['Title'] + " " + df['Content']
texts = df['combined'].tolist()  # List of all combined texts


Index(['Section', 'Paragraph'], dtype='object')


In [4]:
# Wrap each row in a LangChain Document: the combined text becomes the page
# content and the row's title travels along as metadata.
documents = [
    Document(page_content=combined_text, metadata={'title': row_title})
    for combined_text, row_title in df[['combined', 'Title']].itertuples(index=False, name=None)
]

In [5]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and encoder from one checkpoint name,
# so the two cannot drift apart.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_embedding(text):
    """Embed text by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        numpy.ndarray: a 1-D vector for a single text, or an array of shape
        ``(n, hidden_dim)`` for a batch of ``n > 1`` texts.
    """
    # Inputs longer than 512 tokens are truncated by the tokenizer.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only: skip gradient bookkeeping
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight every token by its attention mask so that padding added to the
    # shorter texts of a batch does not dilute the mean. For a single text the
    # mask is all ones, so this equals a plain mean over tokens.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

# Embed every combined text one at a time; the result keeps the order of `texts`.
embeddings = list(map(get_embedding, texts))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
import faiss
import numpy as np

# Stack the per-text vectors into one (n_texts, dim) matrix. FAISS works on
# C-contiguous float32 data, so make that explicit rather than relying on
# whatever dtype the embedding step happened to produce.
if len(embeddings) == 0:
    raise ValueError("No embeddings to index: `embeddings` is empty.")
embeddings_array = np.ascontiguousarray(embeddings, dtype=np.float32)
if embeddings_array.ndim != 2:
    raise ValueError(
        f"Expected a 2-D (n_texts, dim) embedding matrix, got shape {embeddings_array.shape}"
    )

# Create a FAISS index (Flat L2 distance for simplicity)
embedding_dim = embeddings_array.shape[1]  # Size of each embedding vector
index = faiss.IndexFlatL2(embedding_dim)

# Add the embeddings to the FAISS index
index.add(embeddings_array)

In [7]:
def search(query, top_n=5):
    """Return the rows of ``df`` whose embeddings are nearest to ``query``.

    Uses the module-level ``get_embedding``, ``index`` and ``df``.

    Args:
        query: Text to search for.
        top_n: Maximum number of neighbours to return.

    Returns:
        list[dict]: one dict per hit with keys ``'title'``, ``'content'`` and
        ``'distance'``, in the order returned by the index. May contain fewer
        than ``top_n`` items when the index holds fewer vectors.
    """
    # Convert query to a (1, dim) row vector, the shape index.search expects
    query_embedding = get_embedding(query).reshape(1, -1)

    # Search for the nearest neighbors in the FAISS index
    distances, indices = index.search(query_embedding, top_n)

    results = []
    for distance, row_pos in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with -1 when it has fewer than
        # top_n vectors; those are not real rows.
        if row_pos < 0:
            continue
        row_pos = int(row_pos)
        # The index returns insertion positions, not DataFrame labels, so look
        # rows up positionally; this also works when df has a non-default index.
        results.append({
            'title': df['Title'].iloc[row_pos],
            'content': df['Content'].iloc[row_pos],
            'distance': distance
        })
    return results

# Try the retriever on one sample question
query = "What are the conditions for asylum based on political persecution?"
results = search(query)

# Show every hit: title, full content, and its distance from the query
for result in results:
    print(
        f"Title: {result['title']}\n"
        f"Content: {result['content']}\n"
        f"Distance: {result['distance']}\n"
    )

Title: schedule 11 paragraph 40
Content: Prospective 40U.K.In Schedule 3 (withholding and withdrawal of support), in paragraph 17(1), for the definition of “asylum-seeker” substitute— ““asylum-seeker” has the meaning given by section 18,”. Prospective 40U.K.In Schedule 3 (withholding and withdrawal of support), in paragraph 17(1), for the definition of “asylum-seeker” substitute— ““asylum-seeker” has the meaning given by section 18,”.
Distance: 50.42012023925781

Title: schedule 11 paragraph 8
Content: Prospective 8U.K.In section 95 (persons for whom support may be provided), the heading becomes“Support for asylum-seekers,etc”. Prospective 8U.K.In section 95 (persons for whom support may be provided), the heading becomes“Support for asylum-seekers,etc”.
Distance: 50.75392532348633

Title: schedule 11 paragraph 6
Content: Prospective 6U.K.The heading of the Part becomes“Support for asylum-seekers,etc”. Prospective 6U.K.The heading of the Part becomes“Support for asylum-seekers,etc”.
Dis

In [9]:
def chatbot():
    """Run an interactive question loop on top of ``search``.

    Reads questions from standard input and prints the title and the first
    500 characters of the content of each hit. The loop ends when the user
    types ``exit`` or ``quit`` (case-insensitive, surrounding whitespace
    ignored), when input is closed, or when the cell is interrupted.
    Blank input is ignored and the prompt is shown again.
    """
    print("Ask me anything about immigration law or case law!")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the session cleanly
            # instead of surfacing a traceback.
            print()
            break
        if user_input.lower() in ['exit', 'quit']:
            break
        if not user_input:
            # Nothing to search for; ask again.
            continue
        results = search(user_input)
        if results:
            print("Top results:")
            for result in results:
                # str() guards against non-string content (e.g. a missing value)
                content = str(result['content'])
                print(f"\nTitle: {result['title']}\nContent: {content[:500]}...")  # Limit content length
        else:
            print("Sorry, I couldn't find relevant information.")
        print()

# Start the interactive session; type 'exit' or 'quit' at the prompt to stop.
chatbot()


Ask me anything about immigration law or case law!
You: exit


In [10]:
# Save the index to "faiss_index.index" in the current working directory.
# NOTE(review): the file holds the index only, not the source text; `search`
# maps returned indices back to rows of `df`, so the same data in the same
# order has to be loaded alongside the index when it is reused.
faiss.write_index(index, "faiss_index.index")