In [None]:
%pip install langchain langchain-community langchain-core transformers
%pip install sentence-transformers
%pip install chromadb
%pip install bitsandbytes accelerate
%pip uninstall keras
%pip install keras==2.11.0
%pip install tf-keras
%pip install --upgrade transformers
%pip install faker

In [None]:
import pandas as pd
import random
from faker import Faker
import warnings
warnings.filterwarnings('ignore')

# Initialize Faker instance
# Shared module-level generator: the generate_* functions below use it for UUIDs,
# names, job titles and dates.
# NOTE(review): neither `random` nor Faker is seeded in this cell, so each run
# produces different data.
fake = Faker()

# Generate Customer Demographics Data
def generate_customer_demographics(num_customers=1000):
    """Build a table of synthetic customer profiles.

    Args:
        num_customers: Number of customer rows to create.

    Returns:
        pandas.DataFrame with one row per customer and the columns
        customer_id, name, age, gender, marital_status, education,
        occupation and salary (yearly, drawn from 20000-150000).
    """
    genders = ['Male', 'Female']
    marital_statuses = ['Single', 'Married', 'Divorced']
    education_levels = ['High School', 'Bachelor', 'Master', 'PhD']

    def build_profile():
        # Values are drawn top to bottom; the key order fixes the column order.
        return {
            'customer_id': fake.uuid4(),
            'name': fake.name(),
            'age': random.randint(18, 70),
            'gender': random.choice(genders),
            'marital_status': random.choice(marital_statuses),
            'education': random.choice(education_levels),
            'occupation': fake.job(),
            'salary': random.randint(20000, 150000),
        }

    profiles = [build_profile() for _ in range(num_customers)]
    return pd.DataFrame(profiles)

# Generate Customer Financial Behavior Data
def generate_financial_behavior(customer_ids, num_records=2000):
    """Generate synthetic loan / credit-card records for existing customers.

    Args:
        customer_ids: Customer identifiers to sample from. Any iterable works
            (list, pandas Series with any index, ...).
        num_records: Number of product records to create.

    Returns:
        pandas.DataFrame with one row per record and the columns customer_id,
        product_type, loan_amount, credit_limit, credit_utilization, emi_paid,
        tenure_months, max_dpd and default_status. credit_limit and
        credit_utilization are only populated for 'Credit Card' rows.

    Raises:
        IndexError: If customer_ids is empty and num_records is positive.
    """
    # random.choice picks by position. Indexing a pandas Series with an integer
    # is a label lookup, which fails for anything but a default 0..n-1 index,
    # so sample from a plain list instead.
    id_pool = list(customer_ids)

    financial_data = []
    for _ in range(num_records):
        product_type = random.choice(['Personal Loan', 'Home Loan', 'Credit Card'])
        is_card = product_type == 'Credit Card'

        # Cards get a smaller amount range than loans.
        loan_amount = random.randint(5000, 150000) if is_card else random.randint(5000, 500000)
        credit_limit = random.randint(1000, 150000) if is_card else None
        utilization = random.uniform(0.1, 1.0) if is_card else None
        max_dpd = random.choice([0, 15, 30, 60, 90, 120])
        default_status = random.choice([True, False])

        financial_data.append({
            'customer_id': random.choice(id_pool),
            'product_type': product_type,
            'loan_amount': loan_amount,
            'credit_limit': credit_limit,
            'credit_utilization': utilization,
            'emi_paid': random.randint(1, 24),
            'tenure_months': random.randint(12, 60),
            'max_dpd': max_dpd,
            'default_status': default_status
        })
    return pd.DataFrame(financial_data)

# Generate Customer Enquiries Data (Last 3 months)
def generate_customer_enquiries(customer_ids, num_records=500):
    """Generate synthetic product enquiries dated within the last 90 days.

    Args:
        customer_ids: Customer identifiers to sample from. Any iterable works
            (list, pandas Series with any index, ...).
        num_records: Number of enquiry rows to create.

    Returns:
        pandas.DataFrame with the columns customer_id, enquiry_date,
        product_type, enquiry_amount and status ('Approved' or 'Rejected').

    Raises:
        IndexError: If customer_ids is empty and num_records is positive.
    """
    # Sample from a list: integer indexing on a pandas Series is label-based
    # and breaks random.choice for any non-default index.
    id_pool = list(customer_ids)

    enquiries_data = []
    for _ in range(num_records):
        product_type = random.choice(['Personal Loan', 'Home Loan', 'Credit Card'])
        # Credit-card enquiries are capped lower than loan enquiries.
        enquiry_amount = random.randint(5000, 500000) if product_type != 'Credit Card' else random.randint(5000, 100000)
        enquiries_data.append({
            'customer_id': random.choice(id_pool),
            'enquiry_date': fake.date_between(start_date='-90d', end_date='today'),
            'product_type': product_type,
            'enquiry_amount': enquiry_amount,
            'status': random.choice(['Approved', 'Rejected'])
        })
    return pd.DataFrame(enquiries_data)

# Generate Customer Transaction Data (Past 6 months)
def generate_customer_transactions(customer_ids, num_records=5000):
    """Generate synthetic account transactions dated within the last 180 days.

    Each transaction gets a random description. The description drives two
    derived fields: is_salary (the text mentions "SAL" / "SALARY" as a word)
    and hobby_detected (a coarse spending category, or None).

    Args:
        customer_ids: Customer identifiers to sample from. Any iterable works
            (list, pandas Series with any index, ...).
        num_records: Number of transaction rows to create.

    Returns:
        pandas.DataFrame with the columns customer_id, transaction_date,
        transaction_amount, transaction_description, account_balance,
        is_salary and hobby_detected.

    Raises:
        IndexError: If customer_ids is empty and num_records is positive.
    """
    import re  # local import: only this function needs it

    # Sample from a list: integer indexing on a pandas Series is label-based
    # and breaks random.choice for any non-default index.
    id_pool = list(customer_ids)

    # Transaction descriptions with salary-related and hobby keywords
    descriptions = [
        'Salary from XYZ Corp', 'Amazon Purchase', 'Grocery Store', 'Gym Membership',
        'Netflix Subscription', 'Restaurant', 'Fuel Station', 'Travel Booking',
        'SALARY - ABC Corp', 'SAL credited from DEF Ltd', 'Monthly Salary GHI Pvt Ltd',
        'Rent Payment', 'Car Insurance', 'Mobile Phone Bill', 'Electricity Bill', 'Spotify Subscription',
        'Uber Ride', 'Etsy Shopping', 'Concert Ticket', 'Books Purchase'
    ]

    # Whole-word, case-insensitive match. The previous check upper-cased the
    # text but kept mixed-case keywords (which could then never match) and
    # relied on a bare "SAL" substring, which would also flag words like "SALE".
    salary_pattern = re.compile(r'\bSAL(?:ARY)?\b', re.IGNORECASE)

    # Checked in order; the first rule with a keyword in the description wins.
    hobby_rules = (
        (('Amazon', 'Etsy'), 'Shopping'),
        (('Netflix', 'Spotify'), 'Entertainment'),
        (('Gym',), 'Fitness'),
        (('Concert',), 'Music'),
        (('Books',), 'Reading'),
        (('Travel', 'Uber Ride'), 'Travel'),
    )

    transactions_data = []
    for _ in range(num_records):
        transaction_date = fake.date_between(start_date='-180d', end_date='today')
        transaction_amount = random.uniform(50, 10000)
        transaction_description = random.choice(descriptions)

        is_salary = salary_pattern.search(transaction_description) is not None
        hobbies = next(
            (hobby for keywords, hobby in hobby_rules
             if any(keyword in transaction_description for keyword in keywords)),
            None,
        )

        transactions_data.append({
            'customer_id': random.choice(id_pool),
            'transaction_date': transaction_date,
            'transaction_amount': transaction_amount,
            'transaction_description': transaction_description,
            'account_balance': random.uniform(500, 20000),
            'is_salary': is_salary,
            'hobby_detected': hobbies
        })

    return pd.DataFrame(transactions_data)

def generate_customer_sentiments(customer_ids, num_records=5000):
    """Generate synthetic social-media sentiment rows dated within the last 180 days.

    Every row picks a source, a sentiment label, a score drawn from the range
    tied to that label, and a product. Positive and Neutral rows use a
    "Product Interest" phrase ("<phrase> about <product>"); Negative rows use
    a negative "Service Satisfaction" phrase ("<phrase> with <product>").

    Args:
        customer_ids: Customer identifiers to sample from. Any iterable works
            (list, pandas Series with any index, ...).
        num_records: Number of sentiment rows to create.

    Returns:
        pandas.DataFrame with the columns customer_id, sentiment_date,
        sentiment_source, sentiment_text, sentiment_label, sentiment_score
        (rounded to 2 decimals), intent and product_mentioned.

    Raises:
        IndexError: If customer_ids is empty and num_records is positive.
    """
    # Sample from a list: integer indexing on a pandas Series is label-based
    # and breaks random.choice for any non-default index.
    id_pool = list(customer_ids)

    sentiment_sources = ['Twitter', 'Facebook', 'Instagram', 'LinkedIn', 'Reddit', 'TrustPilot', 'Google Reviews', 'YouTube Comments', 'Quora', 'Forums']
    sentiment_labels = ['Positive', 'Neutral', 'Negative']

    # Score interval per label; only the interval of the chosen label is sampled.
    score_ranges = {'Positive': (0.6, 1.0), 'Neutral': (0.4, 0.6), 'Negative': (0.0, 0.4)}

    product_keywords = [
        'Credit Card', 'Loan', 'Mutual Fund', 'Stock', 'Insurance', 'Netflix', 'Spotify', 'Gym Membership', 'Mortgage', 'Savings Account', 'Investment Plan',
        'Health Insurance', 'Car Loan', 'Home Loan', 'Travel Insurance', 'Mobile Phone Plan', 'Laptop', 'Smartwatch', 'Streaming Service', 'Online Course', 'Luxury Watch',
        'Gaming Console', 'Electric Vehicle', 'Home Security System', 'Smart Home Device', 'E-book Subscription', 'Meal Delivery Service', 'Fitness Tracker', 'Digital Wallet'
    ]

    # Split so that rows labelled Negative never carry praise such as
    # "Excellent service"; the full list is still used for intent detection.
    positive_service_phrases = ['Excellent service', 'Great support', 'Fantastic experience', 'Highly recommend', 'Loved my experience']
    negative_service_phrases = ['Poor service', 'Frustrated', 'Regret', 'Worst experience']

    intent_categories = {
        'Product Interest': ['Looking for suggestions', 'What should I buy?', 'Any recommendations?', 'Best choice for me?', 'Which one is better?', 'Need a new option'],
        'Service Satisfaction': positive_service_phrases + negative_service_phrases,
        'Technical Support': ['Not working', 'Facing issues', 'Bug found', 'App crashes', 'Error message', 'Glitchy experience', 'Feature broken'],
        'Financial Concern': ['Unexpected charges', 'Hidden fees', 'Interest rates too high', 'Account frozen', 'Fraudulent transaction', 'Unauthorized deduction', 'Late fee issue', 'Credit score impact'],
        'Investment Interest': ['Best savings account', 'High-interest deposit', 'Mutual fund recommendations', 'Stock investment tips', 'Retirement planning', 'Cryptocurrency advice', 'Is this a good investment?'],
        'Loan & Credit Inquiry': ['Loan eligibility', 'Credit card approval', 'Best mortgage rates', 'Personal loan options', 'Debt consolidation', 'EMI calculation', 'Low-interest credit card'],
        'Subscription Inquiry': ['Netflix subscription', 'Spotify plan', 'Gym membership', 'Service renewal', 'Want to upgrade', 'Cancel subscription'],
        'Customer Support': ['Need assistance', 'Support not responding', 'How do I contact?', 'Live chat not available', 'Waiting for response'],
        'Comparison': ['Better than', 'Worse than', 'Compared to', 'Alternative to', 'How does this compare?', 'Which is best?'],
        'Refund Request': ['Want my money back', 'Need a refund', 'Did not like it', 'Return process', 'Refund issue', 'Money not credited']
    }

    sentiments_data = []
    for _ in range(num_records):
        sentiment_source = random.choice(sentiment_sources)
        sentiment_label = random.choice(sentiment_labels)
        sentiment_score = random.uniform(*score_ranges[sentiment_label])

        # Assign product dynamically
        product_mentioned = random.choice(product_keywords)

        # Generate sentiment text including product
        if sentiment_label == 'Negative':
            sentiment_text = f"{random.choice(negative_service_phrases)} with {product_mentioned}"
        else:
            sentiment_text = f"{random.choice(intent_categories['Product Interest'])} about {product_mentioned}"

        # Intent is the first category (in dict order) with a phrase in the text.
        intent = next(
            (category for category, phrases in intent_categories.items()
             if any(phrase in sentiment_text for phrase in phrases)),
            'Product Interest',
        )

        sentiments_data.append({
            'customer_id': random.choice(id_pool),
            'sentiment_date': fake.date_between(start_date='-180d', end_date='today'),
            'sentiment_source': sentiment_source,
            'sentiment_text': sentiment_text,
            'sentiment_label': sentiment_label,
            'sentiment_score': round(sentiment_score, 2),
            'intent': intent,
            'product_mentioned': product_mentioned
        })

    return pd.DataFrame(sentiments_data)


In [None]:
import pandas as pd

# Product types that become boolean "customer holds this product" columns.
_PRODUCT_TYPES = ('Credit Card', 'Home Loan', 'Personal Loan')

# Columns of the returned frame, in output order ('content' is appended last).
_PROFILE_COLUMNS = [
    'customer_id', 'name', 'age', 'gender', 'marital_status', 'education', 'occupation', 'salary',
    'loan_amount', 'credit_limit', 'credit_utilization', 'emi_paid', 'tenure_months',
    'max_dpd', 'default_status',
    'enquiry_amount', 'unique_products_enquired', 'total_enquiries',
    'transaction_amount', 'account_balance', 'is_salary',
    'Credit Card', 'Home Loan', 'Personal Loan',
]

# Columns reported as 0 instead of NaN when a customer has no matching records.
_ZERO_WHEN_MISSING = [
    'salary', 'loan_amount', 'credit_limit', 'credit_utilization', 'emi_paid', 'tenure_months',
    'enquiry_amount', 'unique_products_enquired', 'total_enquiries',
    'transaction_amount', 'account_balance',
]


def _summarize_financials(financial_behavior):
    """Return one row per customer with averaged amounts, total EMIs, worst DPD and default rate."""
    return financial_behavior.groupby('customer_id').agg({
        'loan_amount': 'mean',
        'credit_limit': 'mean',
        'credit_utilization': 'mean',
        'emi_paid': 'sum',
        'tenure_months': 'mean',
        'max_dpd': 'max',
        'default_status': 'mean',
    }).reset_index()


def _summarize_enquiries(enquiries):
    """Return one row per customer with mean enquiry amount, distinct products and enquiry count."""
    return enquiries.groupby('customer_id').agg(
        enquiry_amount=('enquiry_amount', 'mean'),
        unique_products_enquired=('product_type', 'nunique'),
        total_enquiries=('product_type', 'count'),
    ).reset_index()


def _summarize_transactions(transactions):
    """Return one row per customer with mean amount, mean balance and number of salary credits."""
    return transactions.groupby('customer_id').agg({
        'transaction_amount': 'mean',
        'account_balance': 'mean',
        'is_salary': 'sum',
    }).reset_index()


def generate_customer_data(num_customers=5000, num_financial_records=15000, num_enquiries=4000, num_transactions=20000, num_sentiments=100):
    """Generate synthetic customers and aggregate them into one profile row each.

    Per-customer summaries of the financial, enquiry and transaction tables
    are left-joined onto the demographics, one boolean column per product type
    marks which products the customer holds, and a 'content' column carries
    the profile rendered as an LLM prompt.

    Summary values are taken as they are. An earlier version exploded the
    product list and summed the duplicated rows, which multiplied salary,
    loan_amount and the other summed columns by the number of product types.

    Args:
        num_customers: Number of customers to generate.
        num_financial_records: Number of loan / card records to generate.
        num_enquiries: Number of enquiry records to generate.
        num_transactions: Number of transaction records to generate.
        num_sentiments: Accepted for backward compatibility; sentiment data
            does not contribute to the returned frame.

    Returns:
        pandas.DataFrame with one row per customer: the demographic columns,
        the financial / enquiry / transaction summaries (0 where a customer
        has no records, except max_dpd, default_status and is_salary, which
        stay NaN), the 'Credit Card' / 'Home Loan' / 'Personal Loan' flags
        and 'content'.
    """
    # Generate Data
    customers = generate_customer_demographics(num_customers)
    customer_ids = customers['customer_id'].tolist()
    financial_behavior = generate_financial_behavior(customer_ids, num_records=num_financial_records)
    enquiries = generate_customer_enquiries(customer_ids, num_records=num_enquiries)
    transactions = generate_customer_transactions(customer_ids, num_records=num_transactions)
    # NOTE(review): the sentiment summary and a per-customer total_salary_received
    # used to be computed here but were never part of the returned columns (the
    # sentiment merge was commented out). They are omitted until they are
    # actually added to the output.

    # Merge All Data (left joins keep customers that have no records)
    profiles = customers
    for summary in (
        _summarize_financials(financial_behavior),
        _summarize_enquiries(enquiries),
        _summarize_transactions(transactions),
    ):
        profiles = profiles.merge(summary, on='customer_id', how='left')

    # Product flags are derived straight from the raw records, so no row is
    # ever duplicated and a product type that never occurs still gets a column.
    held_by_customer = {}
    for customer_id, product in zip(financial_behavior['customer_id'], financial_behavior['product_type']):
        held_by_customer.setdefault(customer_id, set()).add(product)
    for product in _PRODUCT_TYPES:
        profiles[product] = [product in held_by_customer.get(customer_id, ()) for customer_id in profiles['customer_id']]

    df_final = profiles[_PROFILE_COLUMNS].copy()
    df_final[_ZERO_WHEN_MISSING] = df_final[_ZERO_WHEN_MISSING].fillna(0)
    # Kept as float: age was previously produced by a per-row mean, and later
    # cells read this column.
    df_final['age'] = df_final['age'].astype(float)

    df_final['content'] = df_final.apply(lambda row: f"Based on the following customer data: {row.to_dict()}, suggest suitable products.", axis=1)

    return df_final

# Usage Example:
# Builds the dataset with the default sizes when this cell runs; later cells read
# the resulting `df_final` to build documents and to pick a query customer.
df_final = generate_customer_data()
print(df_final.head())


In [None]:
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Prepare one LangChain Document per customer: the prompt text is the page
# content and the customer's age is stored under the "class" metadata key.
documents = [
    Document(page_content=row['content'], metadata={"class": row["age"]})
    for _, row in df_final.iterrows()
]

hg_embeddings = HuggingFaceEmbeddings()

# Embed the documents and persist them; `langchain_chroma` is used later to
# build the retriever.
persist_directory = '/content/sample_data/chroma_data'
langchain_chroma = Chroma.from_documents(
    documents=documents,
    # Spelling fixed (was "recomendation_engine") so it matches the collection
    # name used by create_chroma_vector_store.
    collection_name="recommendation_engine",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)

In [None]:
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

def create_chroma_vector_store(df, persist_directory='/content/sample_data/chroma_data', collection_name="recommendation_engine"):
    """Embed the customer prompts in `df` and store them in a Chroma collection.

    Args:
        df: DataFrame with a 'content' column (document text) and an 'age'
            column (stored under the "class" metadata key).
        persist_directory: Directory handed to Chroma for persistence.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        The vector store returned by Chroma.from_documents.

    Raises:
        KeyError: If `df` has no 'content' or 'age' column.
    """
    # Prepare Documents for LangChain; reading the two columns directly avoids
    # building a Series per row.
    documents = [
        Document(page_content=content, metadata={"class": age})
        for content, age in zip(df['content'], df['age'])
    ]

    # Initialize Embeddings (library default model)
    hg_embeddings = HuggingFaceEmbeddings()

    # Create Chroma Vector Store
    return Chroma.from_documents(
        documents=documents,
        collection_name=collection_name,
        embedding=hg_embeddings,
        persist_directory=persist_directory
    )

# Usage Example:
# langchain_chroma = create_chroma_vector_store(df_final)


In [None]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import os

# Hugging Face Hub id of the causal language model loaded below.
model_id = 'HuggingFaceH4/zephyr-7b-beta'
# NOTE(review): presumably set for synchronous CUDA error reporting while
# debugging — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# `device` is only printed at the end of this cell; the model load below passes
# device_map='auto' rather than this value.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit quantization settings, handed to from_pretrained as quantization_config.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type ='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# NOTE(review): max_new_tokens is passed to the model config here, while the
# pipeline in a later cell sets its own max_new_tokens — confirm which is intended.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    trust_remote_code = True,
    max_new_tokens =1024
    )

# Load the model with the config and quantization settings defined above.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map ='auto',
)

# Tokenizer matching the model id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(device)

In [None]:
# Text-generation pipeline around the model and tokenizer loaded in the previous cell.
# NOTE(review): both max_length and max_new_tokens are set; confirm which limit is
# intended (transformers is expected to prefer max_new_tokens — verify for the
# installed version).
query_pipeline = transformers.pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    torch_dtype = torch.bfloat16,
    max_length = 6000,
    max_new_tokens = 500,
    device_map = "auto",
)

In [None]:
from IPython.display import display, Markdown

def colorize_text(text):
  """Wrap known section labels in bold, colored markup for Markdown display.

  Every occurrence of "Reasoning:", "Question:", "Answer:" and "Total time:"
  is put on its own paragraph and wrapped in a colored <font> tag.

  Args:
    text: Text to decorate.

  Returns:
    The decorated text; text without any of the labels is returned unchanged.
  """
  # "magenta" was previously misspelled "magneta", which is not a color name.
  label_colors = {
    "Reasoning": "blue",
    "Question": "red",
    "Answer": "green",
    "Total time": "magenta",
  }
  for word, color in label_colors.items():
    text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
  return text

# Wrap the transformers pipeline so LangChain can use it as an LLM; `llm` is
# reused by the RetrievalQA chain in a later cell.
llm = HuggingFacePipeline(pipeline=query_pipeline)

# Smoke test: ask the bare model (no retrieval) one question and render the
# reply as colored Markdown.
question = "what is Recommendation Engine and How it used in Finance Domain?"
# NOTE(review): calling the LLM object directly is presumably the legacy LangChain
# call style; newer releases favor llm.invoke(...) — confirm for the installed version.
respone = llm(prompt=question)

full_response = f"Question: {question}\n\nAnswer: {respone}"
display(Markdown(colorize_text(full_response)))



In [None]:
from langchain.chains import RetrievalQA # Corrected import statement
from langchain.prompts import PromptTemplate
from langchain_community.llms  import HuggingFaceHub # Corrected the typo in the package name
from IPython.display import display, Markdown
import os
import warnings
warnings.filterwarnings('ignore')

# Keep a token that is already exported in the environment instead of overwriting
# it with an empty value. Do not paste a real token into the notebook.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")

# Prompt for the "stuff" chain: {question} receives the customer record and
# {context} the retrieved document text. A stray quote and "one suitable
# products" were fixed in the instruction line.
template ="""
Based on the following customer data that I provide, suggest one suitable product.
Customer Information: {question}
Context: {context}
Answer:
"""

PROMPT = PromptTemplate(template=template, input_variables=["context","question"])

# Retrieve only the single closest customer document as context.
retriver =langchain_chroma.as_retriever(search_kwargs={"k":1})

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriver,
    return_source_documents=True,
    chain_type_kwargs={"prompt":PROMPT}
)

In [None]:
import json
# Use the customer at position 2 as the query, without the pre-built prompt text,
# serialized as indented JSON.
question = df_final.iloc[2]
question = question.drop('content')
question_dict = question.to_dict()
data_string = json.dumps(question_dict, indent=4)

# Run retrieval + generation and print the value stored under the "result" key.
# NOTE(review): any failure is reduced to its message here, so the traceback is lost.
try:
  result = qa_chain({"query": data_string})
  print(result["result"])
except Exception as e:
  print(e)
