In [3]:
import numpy as np
import pandas as pd
from faker import Faker
import random

fake = Faker()
# Seed every random source the generator draws from so a fresh run reproduces
# the same table. Faker draws from its own generator, which np.random.seed and
# random.seed do not affect, so it has to be seeded separately.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Size of the small demo table displayed below.
num_users = 10

def generate_financial_data(num_users):
    """Create a DataFrame of synthetic per-user financial records.

    Monetary columns are drawn from uniform ranges and rounded to two
    decimals. Every value in the six optional deduction columns is then either
    kept or replaced by 0, decided by an independent ``random.random()`` draw
    (kept when the draw is greater than 0.5).

    Args:
        num_users: Number of rows to generate; ``User_ID`` runs from 1 to
            ``num_users``.

    Returns:
        pandas.DataFrame with one row per user.
    """
    optional_columns = ('HealthInsurance', 'HomeLoan', 'ELSS', 'NPS', 'PPF', 'HouseRent')

    def draw_amounts(low, high):
        # One uniform draw per user, rounded to cents.
        return np.random.uniform(low, high, num_users).round(2)

    # Keys are inserted in the final column order. The statement order also
    # fixes the order in which the random generators are consumed.
    records = {}
    records['User_ID'] = list(range(1, num_users + 1))
    records['Income'] = draw_amounts(30000, 150000)
    records['Expenses'] = draw_amounts(5000, 50000)
    records['HealthInsurance'] = draw_amounts(0, 5000)
    records['HomeLoan'] = draw_amounts(0, 10000)
    records['ELSS'] = draw_amounts(0, 5000)
    records['NPS'] = draw_amounts(0, 5000)
    records['PPF'] = draw_amounts(0, 5000)
    records['HouseRent'] = draw_amounts(0, 12000)
    records['Previous_Tax_Amount'] = draw_amounts(2000, 20000)
    records['State'] = [fake.state_abbr() for _ in range(num_users)]
    records['Filing_Status'] = [
        random.choice(['Single', 'Married', 'Head of Household'])
        for _ in range(num_users)
    ]
    records['Tax_Credits'] = draw_amounts(0, 5000)

    # Zero out a random subset of each optional deduction, column by column.
    for column_name in optional_columns:
        masked = []
        for amount in records[column_name]:
            keep = random.random() > 0.5
            masked.append(amount if keep else 0)
        records[column_name] = masked

    return pd.DataFrame(records)

# Build the small demo table (num_users rows) and show it as the cell output.
financial_data = generate_financial_data(num_users)
financial_data

Unnamed: 0,User_ID,Income,Expenses,HealthInsurance,HomeLoan,ELSS,NPS,PPF,HouseRent,Previous_Tax_Amount,State,Filing_Status,Tax_Credits
0,1,74944.81,5926.3,3059.26,6075.45,0.0,4847.92,0.0,9266.94,17535.86,GA,Head of Household,597.97
1,2,144085.72,48645.94,697.47,0.0,0.0,3875.66,0.0,0.0,13219.37,MN,Single,3566.22
2,3,117839.27,42459.92,0.0,650.52,0.0,4697.49,0.0,66.27,7956.16,IN,Single,3803.93
3,4,101839.02,14555.26,0.0,9488.86,4546.6,4474.14,1783.77,0.0,3144.05,NV,Head of Household,2806.39
4,5,48722.24,13182.12,0.0,0.0,1293.9,2989.5,0.0,0.0,7597.68,MT,Married,3854.84
5,6,48719.34,13253.2,0.0,8083.97,3312.61,4609.37,0.0,8748.09,7853.3,DC,Single,2468.98
6,7,36970.03,18690.9,998.37,3046.14,1558.56,0.0,0.0,9255.24,15132.91,OH,Single,2613.66
7,8,133941.14,28614.04,0.0,0.0,2600.34,0.0,0.0,888.54,13476.03,MP,Single,2137.71
8,9,102133.8,24437.53,0.0,0.0,2733.55,0.0,372.75,4301.59,17969.83,VT,Head of Household,127.1
9,10,114968.71,18105.31,232.25,4401.52,0.0,0.0,4934.43,1390.43,10499.87,HI,Single,539.46


In [4]:
import pandas as pd

def generate_tax_regulations():
    """Build the tax-regulation lookup table.

    Returns:
        pandas.DataFrame with one row per bracket and the columns
        ``Tax_Bracket`` (label text), ``Standard_Deductions`` (12000 on every
        row) and ``Tax_Credits``.
    """
    # (bracket label, tax credit) pairs, lowest bracket first.
    bracket_rows = [
        ('10% - $0 to $10,000', 500),
        ('12% - $10,001 to $40,000', 1000),
        ('22% - $40,001 to $85,000', 1500),
        ('24% - $85,001 to $160,000', 2500),
        ('32% - $160,001 to $200,000', 3000),
        ('35% - $200,001 and above', 4500),
    ]
    labels = [label for label, _ in bracket_rows]
    credits = [credit for _, credit in bracket_rows]

    return pd.DataFrame({
        'Tax_Bracket': labels,
        'Standard_Deductions': [12000] * len(labels),
        'Tax_Credits': credits,
    })

# Build the bracket lookup table and show it as the cell output.
tax_regulations = generate_tax_regulations()
tax_regulations

Unnamed: 0,Tax_Bracket,Standard_Deductions,Tax_Credits
0,"10% - $0 to $10,000",12000,500
1,"12% - $10,001 to $40,000",12000,1000
2,"22% - $40,001 to $85,000",12000,1500
3,"24% - $85,001 to $160,000",12000,2500
4,"32% - $160,001 to $200,000",12000,3000
5,"35% - $200,001 and above",12000,4500


In [5]:
# Apply tax regulations to the financial data
def apply_tax_regulations(financial_df, regulations_df):
    """Add an ``Estimated_Tax`` column to ``financial_df`` using a simplified model.

    Model (deliberately simplified):
      * the rate is picked from the user's gross ``Income`` (before deductions);
      * taxable income is ``Income`` minus the sum of the six deduction columns
        minus the standard deduction, floored at 0;
      * the single rate is applied to the whole taxable income (no marginal
        brackets), and filing status and tax credits are not considered.

    The calculation is vectorised over the whole frame, so it does not rely on
    row-wise ``DataFrame.apply`` and also works for a frame with zero rows.

    Args:
        financial_df: DataFrame with an ``Income`` column and the columns
            ``HealthInsurance``, ``HomeLoan``, ``ELSS``, ``NPS``, ``PPF``,
            ``HouseRent``.
        regulations_df: DataFrame with a ``Standard_Deductions`` column; only
            its first row is used.

    Returns:
        The same ``financial_df`` object, with ``Estimated_Tax`` written onto
        it in place.
    """
    deduction_columns = ['HealthInsurance', 'HomeLoan', 'ELSS', 'NPS', 'PPF', 'HouseRent']
    # Inclusive upper income bound of each bracket and its rate, lowest first;
    # incomes above the last bound fall through to top_rate.
    bracket_limits = [10000, 40000, 85000, 160000, 200000]
    bracket_rates = [0.10, 0.12, 0.22, 0.24, 0.32]
    top_rate = 0.35

    # The first row of regulations_df supplies the standard deduction for everyone.
    standard_deduction = regulations_df['Standard_Deductions'].iloc[0]

    income = financial_df['Income']
    income_values = income.to_numpy()
    # np.select takes the first matching condition, which mirrors an if/elif
    # ladder evaluated from the lowest bracket upwards.
    tax_rate = np.select(
        [income_values <= limit for limit in bracket_limits],
        bracket_rates,
        default=top_rate,
    )

    total_deductions = financial_df[deduction_columns].sum(axis=1)
    taxable_income = (income - total_deductions - standard_deduction).clip(lower=0)

    financial_df['Estimated_Tax'] = taxable_income * tax_rate
    return financial_df

# Regenerate the synthetic data at full size. This rebinds `num_users` and
# `financial_data`, replacing the smaller demo table created earlier.
num_users = 1000
financial_data = generate_financial_data(num_users)

# Add the Estimated_Tax column. apply_tax_regulations writes the column onto
# the frame it receives and returns that same frame, so `financial_data` and
# `financial_data_with_taxes` refer to one DataFrame.
financial_data_with_taxes = apply_tax_regulations(financial_data, tax_regulations)
financial_data_with_taxes.head()

Unnamed: 0,User_ID,Income,Expenses,HealthInsurance,HomeLoan,ELSS,NPS,PPF,HouseRent,Previous_Tax_Amount,State,Filing_Status,Tax_Credits,Estimated_Tax
0,1,33771.5,7602.92,4575.45,926.25,3156.88,2245.39,0.0,0.0,16692.1,TX,Single,2882.03,1304.1036
1,2,106369.25,48609.62,0.0,0.0,0.0,0.0,3613.42,2416.79,19788.72,RI,Single,3960.02,21201.3696
2,3,67722.72,44770.36,789.77,9145.49,2740.73,2854.54,1596.13,0.0,8999.33,TN,Single,1621.57,8491.1332
3,4,91028.48,46748.85,3479.5,0.0,0.0,1662.13,4529.56,0.0,2853.48,IL,Married,3580.82,16645.7496
4,5,138907.98,49770.85,0.0,2587.12,0.0,0.0,0.0,0.0,16421.09,LA,Married,677.09,29837.0064


In [11]:
import pandas as pd
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Prepare documents for LangChain: one Document per user row.
def _format_record(row):
    """Render one row of the tax table as a single 'Field: value, ...' line."""
    return (f"User_ID: {row['User_ID']}, Income: {row['Income']}, Expenses: {row['Expenses']}, "
            f"HealthInsurance: {row['HealthInsurance']}, HomeLoan: {row['HomeLoan']}, "
            f"ELSS: {row['ELSS']}, NPS: {row['NPS']}, PPF: {row['PPF']}, HouseRent: {row['HouseRent']}, "
            f"Previous_Tax_Amount: {row['Previous_Tax_Amount']}, State: {row['State']}, "
            f"Filing_Status: {row['Filing_Status']}, Tax_Credits: {row['Tax_Credits']}, "
            f"Estimated_Tax: {row['Estimated_Tax']}")


documents = [
    Document(page_content=_format_record(record))
    for _, record in financial_data_with_taxes.iterrows()
]

In [12]:
# Embedding model for the vector store. No model name is passed, so whatever
# default HuggingFaceEmbeddings ships with is used.
hg_embeddings = HuggingFaceEmbeddings()
# NOTE(review): '/content/' is a hard-coded absolute path (looks like a Colab
# working directory) — confirm it is valid on the machine running this cell.
persist_directory = '/content/'

# Embed every row document and store it in the "financial_data" collection.
# This call needs the `chromadb` package to be installed.
langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="financial_data",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)

  hg_embeddings = HuggingFaceEmbeddings()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [None]:
pip install chromadb

In [None]:
from torch import cuda, bfloat16, float16
import transformers
from transformers import AutoTokenizer
from langchain.llms import HuggingFacePipeline
from time import time

model_id = 'HuggingFaceH4/zephyr-7b-beta'
# NOTE(review): `device` is not referenced again in the cells visible here;
# the loaders below are given device_map='auto' instead — confirm it is needed.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Quantization settings passed to from_pretrained: 4-bit loading, 'nf4' quant
# type, double quantization enabled, bfloat16 as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Load the causal language model with the quantization config above.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally — confirm this model actually requires it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
# Tokenizer comes from the same model id so it matches the loaded model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Initialize the query pipeline: a text-generation pipeline around the
# already-loaded model and tokenizer, capped at 500 newly generated tokens.
# NOTE(review): the model was loaded with bfloat16 as its compute dtype while
# torch_dtype=float16 is requested here — confirm the mismatch is intended.
query_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=float16,
    max_new_tokens=500,
    device_map="auto",
)

# Wrap the pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=query_pipeline)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt used by the QA chain. The template has exactly two placeholders,
# {question} (the user's financial record) and {context} (the retrieved
# documents), so those are the two input variables that must be declared.
template = """
Based on the following financial data and tax regulations, analyze and provide personalized tax-saving recommendations:
Financial Data: {question}
Context: {context}
Answer:
"""
PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# Set up retriever: each lookup against the Chroma store returns 5 documents (k=5).
retriever = langchain_chroma.as_retriever(search_kwargs={"k": 5})

# Function to remove duplicates from retrieved documents
def remove_duplicates(documents):
    """Return the documents with repeated ``page_content`` removed.

    The first document seen for each distinct ``page_content`` is kept, and
    the survivors stay in their original relative order.

    Args:
        documents: Iterable of objects exposing a hashable ``page_content``.

    Returns:
        A new list holding one document per distinct ``page_content``.
    """
    # dicts preserve insertion order, and setdefault only stores the first
    # document encountered for a given text.
    first_by_content = {}
    for document in documents:
        first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())

# Set up the QA chain: RetrievalQA ties the LLM to the retriever and formats
# its prompt with PROMPT.
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever, chain_type_kwargs={"prompt": PROMPT}
)

def get_tax_optimization_recommendations(query):
    """Generate tax-saving recommendations for one user's financial record.

    Retrieves the stored documents matching ``query``, drops exact duplicates,
    and has the LLM answer from exactly those de-duplicated documents.

    Args:
        query: Free-text description of the user's financial data.

    Returns:
        dict with the keys ``"context"`` (the de-duplicated document texts
        joined by a space), ``"query"`` (the input) and ``"result"`` (the
        generated answer).
    """
    # Retrieve once, then de-duplicate by page content.
    raw_docs = retriever.get_relevant_documents(query)
    unique_docs = remove_duplicates(raw_docs)

    # Flattened view of the context, returned for inspection by the caller.
    context = " ".join(doc.page_content for doc in unique_docs)

    # Run the document-combining step directly on the de-duplicated documents.
    # Calling qa_chain itself only consumes the "query" key: it would run the
    # retriever a second time and build the prompt from those raw results, so
    # the de-duplication above would have no effect on the answer.
    answer = qa_chain.combine_documents_chain.run(
        input_documents=unique_docs, question=query
    )
    return {"context": context, "query": query, "result": answer}

# Example query: one user's record, written with the same field names and
# order as the documents stored in the vector store.
query = "Analyze - User_ID: 317, Income: 65185.29, Expenses: 6770.46, HealthInsurance: 1921.03, HomeLoan: 0.0, ELSS: 0.0, NPS: 1767.37, PPF: 1927.76, HouseRent: 3657.13, Previous_Tax_Amount: 15957.37, State: VI, Filing_Status: Head of Household, Tax_Credits: 2990.91, Estimated_Tax: 9660.64"
response = get_tax_optimization_recommendations(query)

In [None]:
# Show only the generated answer text from the returned dict.
print(response['result'])
