In [1]:
pip install -U langchain-community



In [2]:
pip install pinecone-client langchain openai transformers jsonlines



In [3]:
!pip install PyPDF2



In [4]:
pip install openai==0.28



In [5]:
import openai
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone as LangPinecone
from langchain.llms import OpenAI
import pinecone
import json
from PyPDF2 import PdfReader


In [6]:
from pinecone import Pinecone

# Initialize the Pinecone client
pc = Pinecone(api_key="d8f32a14-b0b1-40bf-bbc1-b93f9f8b6c8d")

# Connect to the "taxease" index
index = pc.Index("taxease")

In [7]:
import jsonlines

# Inspect the data structure
import jsonlines

file_path = "/content/updated_training.jsonl"
with jsonlines.open(file_path) as reader:
    for i, entry in enumerate(reader):
        print(entry)
        if i >= 5:  # Print the first 5 entries
            break

{'prompt': 'What are the different types of tax deductions? ->', 'completion': " Tax deductions are expenses that can be subtracted from your taxable income, potentially reducing the amount of taxes you owe. Common types of tax deductions include:\n1. Standard Deduction: A fixed deduction amount set by the tax authorities that you can claim without itemizing your expenses.\n2. Itemized Deductions: These include expenses such as medical expenses, state and local taxes, mortgage interest, charitable contributions, and certain business expenses.\n3. Above-the-Line Deductions: Certain deductions that can be claimed regardless of whether you itemize or take the standard deduction, such as student loan interest, educator expenses, and contributions to retirement accounts.\n4. Business Deductions: Expenses related to operating a business, such as office rent, employee salaries, and business-related travel.\nIt's important to consult with a tax professional or refer to the tax laws in your jur

In [8]:
import jsonlines

def load_data(file_path):
    """Load data from a .jsonl file."""
    with jsonlines.open(file_path) as reader:
        return [entry for entry in reader]

In [9]:
# Preprocess JSONL Data
def preprocess_jsonl(file_path):
    with open(file_path, "r") as f:
        data = []
        for line in f:
            line = line.strip()  # Remove leading/trailing whitespace
            if not line:  # Skip empty lines
                continue
            try:
                json_line = json.loads(line)
                data.append({
                    "id": f"jsonl-{len(data)}",  # Unique ID for JSONL data
                    "text": f"{json_line['prompt']} {json_line['completion']}",
                    "metadata": json_line  # Original record as metadata
                })
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {line}")
                continue
    return data


# Extract and Preprocess PDF Data
def preprocess_pdf(file_path):
    reader = PdfReader(file_path)
    pdf_text = ""
    for page in reader.pages:
        pdf_text += page.extract_text()

    # Split into manageable chunks (e.g., paragraphs)
    chunks = pdf_text.split("\n\n")  # Adjust splitting logic as needed
    return [
        {
            "id": f"pdf-{i}",  # Unique ID with prefix
            "text": chunk.strip(),
            "metadata": {"source": "Form 1040", "page": i}
        }
        for i, chunk in enumerate(chunks) if chunk.strip()
    ]


In [15]:
from transformers import AutoTokenizer, AutoModel
import torch
import openai

# Load a transformer model for embeddings (if needed for local use)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Set your OpenAI API key
openai.api_key = "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"

# Function to split text into chunks
def split_into_chunks(text, max_tokens=8192):
    """Split text into smaller chunks that fit within the token limit."""
    # Tokenize the text into tokens
    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]  # Create a chunk with a maximum of `max_tokens`
        chunks.append(tokenizer.decode(chunk, skip_special_tokens=True))  # Decode tokens back into text
    return chunks

def generate_embeddings_with_chunking(text):
    """Generate embeddings for long text by splitting into chunks."""
    # Split text into chunks
    chunks = split_into_chunks(text, max_tokens=8192)
    # Generate embeddings for each chunk
    chunk_embeddings = generate_embeddings(chunks)  # List of 1536-dimensional embeddings
    # Average embeddings across chunks to represent the full text
    return np.mean(chunk_embeddings, axis=0)  # Single 1536-dimensional vector


# Preprocess data from JSONL and PDF
jsonl_data = preprocess_jsonl("/content/updated_training.jsonl")



# Generate embeddings and upsert into Pinecone
for item in jsonl_data:
    # Generate embeddings with chunking for long texts
    embedding = generate_embeddings_with_chunking(item["text"])  # Generate 1536-dimensional embeddings
    index.upsert(
        vectors=[
            {
                "id": item["id"],
                "values": embedding.tolist(),  # Convert NumPy array to list
                "metadata": item["metadata"]
            }
        ]
    )

print("Data successfully upserted into Pinecone!")


Data successfully upserted into Pinecone!


In [16]:
def query_pinecone(query):
    """Query Pinecone index with an embedding of the query."""
    # Step 1: Generate embedding for the query
    query_embedding = generate_embeddings([query])[0]  # Use the same embedding model as used for indexing

    # Step 2: Query Pinecone
    results = index.query(
        namespace="ns1",  # Use the namespace if applicable, otherwise omit
        vector=query_embedding,  # Provide the query embedding
        top_k=5,  # Number of top results to retrieve
        include_metadata=True  # Include metadata in the results
    )

    # Return the retrieved results
    return results["matches"]


In [17]:
def generate_response_with_rag(query, retrieved_results):

    # Introductory context

    introductory_prompt = (
    "   Hi Taxease, you are an assistant who helps users navigate the tax filing process. "
        "The chatbot should be able to answer questions, provide guidance on filling out the tax form, "
        "and offer suggestions for deductions or credits the user may be eligible for. "
        "The chatbot should use natural language processing to understand user queries and respond in a conversational way. "

    )

    # Retrieved context
    context = "\n".join([match["metadata"]["completion"] for match in retrieved_results])

    # Combine the introductory prompt, retrieved context, and user query
    messages = [
        {"role": "system", "content": introductory_prompt},
        {"role": "user", "content": f"Context:\n{context}\n\nQuery: {query}\n\nAnswer:"}
    ]


    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Or "gpt-4" for higher quality
        messages=messages,
        max_tokens=300,
        temperature=0.7
    )

    return response["choices"][0]["message"]["content"].strip()


In [18]:
# User query
query = "How should I fill out Form 1040?"

# Step 1: Retrieve relevant documents from Pinecone
retrieved_results = query_pinecone(query)

# Step 2: Generate a response using LLM
response = generate_response_with_rag(query, retrieved_results)

# Display the response
print("Query:", query)
print("Response:", response)

Query: How should I fill out Form 1040?
Response: Filling out Form 1040 can seem overwhelming, but don't worry, I'm here to help! To get started, make sure you have all your necessary documents handy, such as your W-2s, 1099s, and any other income or deduction-related forms.

Next, begin by filling out your personal information at the top of the form, including your name, address, and Social Security number. Then, work your way through the various sections, such as income, deductions, and credits. Be sure to double-check your entries to avoid any errors.

If you have specific questions about certain sections or need help understanding a particular line on the form, feel free to ask! I'm here to guide you through the process step by step. Let's start by reviewing your income sources and deductions to ensure you're maximizing your tax return.


In [10]:
import json

# Path to your dataset file
dataset_path = "/content/training_tax_data.jsonl"

# Load line-delimited JSON
with open(dataset_path, "r") as f:
    json_data = [json.loads(line) for line in f]

# Check the first few entries
print(json_data[:5])



[{'prompt': 'What is the Premium Tax Credit (PTC), and how do I reconcile it? ->', 'completion': 'The Premium Tax Credit (PTC) helps eligible individuals and families with moderate incomes afford health insurance purchased through the Health Insurance Marketplace. If you received advance payments of the PTC, you must file Form 8962 to reconcile the payments with the actual credit amount based on your final income. Use Form 1095-A provided by the Marketplace to complete this reconciliation.'}, {'prompt': 'What expenses can I deduct as self-employed? ->', 'completion': 'Self-employed individuals can deduct various expenses directly related to their business operations. Common deductions include:\n1. Home Office: If exclusively and regularly used for business purposes.\n2. Travel Expenses: Business-related travel, including airfare and lodging.\n3. Equipment: Computers, software, and tools used for the business.\n4. Marketing: Advertising and promotional costs.\nEnsure you maintain detail

In [17]:
from torch.utils.data import Dataset

class FineTuneDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data[idx])
        return {
            "input_ids": input_ids,
            "labels": input_ids.clone(),  # Labels are identical to input_ids for causal language modeling
        }


In [20]:
import torch


def fine_tune_model(json_data, eval_split=0.1):
    """Fine-tune a base LLM for domain-specific use."""
    from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
    from sklearn.model_selection import train_test_split
    import torch

    # Base model for fine-tuning
    model_name = "gpt2"  # Replace with your preferred base model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Add a padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

    # Preprocess data
    combined_data = [
        f"{entry['prompt']} {entry['completion']}" for entry in json_data
    ]

    tokenized_data = tokenizer(
        combined_data,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Split into train and eval datasets
    train_data, eval_data = train_test_split(
        tokenized_data.input_ids.tolist(),
        test_size=eval_split
    )

    # Convert to Dataset object
    train_dataset = FineTuneDataset(train_data)
    eval_dataset = FineTuneDataset(eval_data)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",  # Perform evaluation at the end of each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        report_to="wandb",  # Log metrics to Weights & Biases
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,  # Provide evaluation dataset
    )

    # Train the model
    trainer.train()
    return model


In [21]:
# Load JSONL data
with open("training_tax_data.jsonl", "r") as f:
    json_data = [json.loads(line) for line in f]

# Fine-tune the model with a 10% evaluation split
fine_tuned_model = fine_tune_model(json_data, eval_split=0.1)

# Save the fine-tuned model
fine_tuned_model.save_pretrained("./fine_tuned_model")



Epoch,Training Loss,Validation Loss
1,No log,4.398544
2,No log,0.374241
3,3.478100,0.091651
