In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir /content/raw_data
!cp -R /content/drive/MyDrive/Master/Fall24/ANLP/assn2/NLP-2/raw_data /content

In [None]:
import zipfile
import os

zip_path = "/content/drive/MyDrive/NLP-2-main.zip"

extract_path = "/content/NLP-2-extracted"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Files extracted to {extract_path}")

Files extracted to /content/NLP-2-extracted


In [None]:

print("Contents of the extracted folder:")
for item in os.listdir(extract_path):
    print(item)

Contents of the extracted folder:
NLP-2-main


In [None]:
!pip install -q langchain
!pip install -q langchain-community
!pip install -q sentence-transformers
!pip install -q chromadb
!pip install -q huggingface_hub
!pip install -q unstructured
!pip install -q -U langchain-huggingface

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.7/407.7 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.9/296.9 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [25]:
import os
import nltk
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings  # Updated import
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from huggingface_hub import login
from google.colab import userdata
import torch

# my_secret
# login(my_secret)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'

# Ensure NLTK data is downloaded
nltk.download('punkt', quiet=True)

# Custom loader class to handle errors
class SafeTextLoader(TextLoader):
    def load(self):
        print(f"Attempting to load: {self.file_path}")
        try:
            return super().load()
        except Exception as e:
            print(f"Error loading file {self.file_path}: {e}")
            return []

# 1. Load Documents
loader = DirectoryLoader('/content/raw_data/raw_data', glob="**/*.txt", loader_cls=SafeTextLoader, recursive=True)
documents = loader.load()

print(f"Loaded {len(documents)} documents")

# 2. Split Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

print(f"Split into {len(texts)} chunks")

# 3. Create Embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 4. Create Vector Store
vectorstore = Chroma.from_documents(texts, embeddings)

# 5. Create Retriever
retriever = vectorstore.as_retriever()

# 6. Create Language Model
device = "cuda" if torch.cuda.is_available() else "cpu"
llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0.5, "max_length":512})

# 7. Create QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)



Loaded 34 documents
Split into 10685 chunks




# Evaluation Metrics


In [15]:
import string
import re
from collections import Counter

# Function to normalize text (removes punctuation, articles, etc.)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punctuation(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punctuation(lower(s))))

# Function to calculate F1 score
def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    f1 = 2 * precision * recall / (precision + recall)

    return f1

# Function to calculate exact match
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

# Function to calculate recall
def recall_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if len(ground_truth_tokens) == 0:
        return 0

    recall = num_same / len(ground_truth_tokens)
    return recall

# Updated evaluate function to include recall
def evaluate_with_recall(predictions, references):
    total = len(references)
    f1 = exact_match = recall = 0

    for pred, ref in zip(predictions, references):
        exact_match += exact_match_score(pred, ref)
        f1 += f1_score(pred, ref)
        recall += recall_score(pred, ref)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    recall = 100.0 * recall / total

    return {'exact_match': exact_match, 'f1': f1, 'recall': recall}

# Example usage
predictions = ["Paris"]
references = ["Paris is the capital of France"]

results = evaluate_with_recall(predictions, references)
print(results)




{'exact_match': 0.0, 'f1': 33.333333333333336, 'recall': 20.0}


# Testing using our QAs

In [21]:
!cp -R /content/drive/MyDrive/Master/Fall24/ANLP/assn2/NLP-2/test_questions/reference_answers.txt /content
!cp -R /content/drive/MyDrive/Master/Fall24/ANLP/assn2/NLP-2/test_questions/questions.txt /content

In [26]:
# prompt: take in all the question as input to the qa_chain model and generate result to a output file

with open('questions.txt', 'r') as f:
    questions = f.readlines()

from tqdm import tqdm

# Assuming 'questions' is your list of questions
with open('output_file1.txt', 'w') as output_file1, open('output_file2.txt', 'w') as output_file2:
    for question in tqdm(questions, desc="Processing questions"):
        question = question.strip()
        if question:
            result = qa_chain({'query': question})
            answer = result['result']

            output_file2.write(f"Question: {question}\n")
            output_file2.write(f"Answer: {answer}\n\n")
            output_file1.write(f"{answer}\n")

Processing questions:  13%|█▎        | 190/1490 [13:45<1:34:11,  4.35s/it]


KeyboardInterrupt: 

In [None]:

from tqdm import tqdm

# Assuming 'questions' is your list of questions
with open('output_file1.txt', 'w') as output_file1, open('output_file2.txt', 'w') as output_file2:
    for question in tqdm(questions, desc="Processing questions"):
        question = question.strip()
        if question:
            result = qa_chain({'query': question})
            answer = result['result']

            output_file2.write(f"Question: {question}\n")
            output_file2.write(f"Answer: {answer}\n\n")
            output_file1.write(f"{answer}\n")

In [8]:
query = "Where does the Penguins, Pittsburgh's professional ice hockey team, play?"
result = qa_chain({"query": query})
print(result['result'])

  result = qa_chain({"query": query})


PPG Paints Arena


Nov 20, 2024
