# Project 3: Multimodal AI Chatbot for Wuthering Heights Literary Analysis

# 1. Data Preprocessing


In [1]:
import pandas as pd

# Load the Excel file
df = pd.read_excel("Wuthering_Heights_Chapter1_Complete_Analysis 1.xlsx")

# Optional: replace dashes with underscores in column names for easier programmatic access
df.columns = [col.replace(" ", "_").replace("-", "_") for col in df.columns]

# Check the first few rows of your data to confirm the structure
print(df.head())


   Chapter_ID  Paragraph_ID  \
0           1             1   
1           1             2   
2           1             3   
3           1             4   
4           1             5   

                                                Text  \
0  1801 I have just returned from a visit to my l...   
1                          “Mr. Heathcliff?” I said.   
2                              A nod was the answer.   
3  “Mr. Lockwood, your new tenant, sir. I do myse...   
4  “Thrushcross Grange is my own, sir,” he interr...   

                                    Literary_Devices  \
0  Imagery (desolation, solitary setting); Irony ...   
1                                None specific here.   
2                                None specific here.   
3  Irony (Lockwood’s formality vs. Heathcliff’s g...   
4  Irony (Heathcliff’s resistance to politeness);...   

                                              Themes  \
0  Isolation and Loneliness; Existentialism and F...   
1                          

In [2]:
import pandas as pd 
import json
import re

# Load your data
data = pd.read_excel("Wuthering_Heights_Chapter1_Complete_Analysis 1.xlsx")

# Function to clean text of special characters
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Replace curly quotes with straight quotes
    text = text.replace("‘", "'").replace("’", "'").replace("“", '"').replace("”", '"')
    # Replace em dash with a hyphen
    text = text.replace("—", "-")
    # Replace Unicode apostrophes and quotes with standard ones
    text = text.replace("\u2018", "'").replace("\u2019", "'")  # curly single quotes to straight
    text = text.replace("\u201c", '"').replace("\u201d", '"')  # curly double quotes to straight
    text = text.replace("\u2013", "-").replace("\u2014", "-")  # en dash and em dash to hyphen
    text = text.replace("\u2026", "...")  # ellipsis
    # Remove any other non-ASCII characters that may cause issues
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

# Apply the cleaning function to all relevant columns
text_columns = ["Text", "Literary Devices", "Themes", "Symbolism", 
                "Historical Cultural Context", "Mood", "Tone", "Psychological Analysis",
                "Narrative Techniques", "Intertextual References and Allusions", 
                "Social and Economic Context", "Language and Diction Analysis", 
                "Symbolic Geography", "Literary Theories Applied", 
                "Character Development and Relationships", "Narrative Structure and Pacing", 
                "Thematic Connections Across Chapters", "Philosophical and Ethical Dimensions",
                "Comparative Literature and Historical Reception"]

# Apply the cleaning function to each column in text_columns
for col in text_columns:
    data[col] = data[col].apply(clean_text)

# Convert the cleaned DataFrame to a list of dictionaries (for JSON)
data_records = data.to_dict(orient="records")

# Save to JSON for inspection (optional)
with open("cleaned_data2.json", "w") as f:
    json.dump(data_records, f, indent=4)

print("Data successfully converted to JSON.")


Data successfully converted to JSON.


In [3]:
import json 
# check correct file uploaded for embedding
# Load the JSON data
json_file_path = "cleaned_data2.json"
with open(json_file_path, "r") as f:
    data = json.load(f)

# Print a sample of the loaded data for verification
print(f"Embedding data loaded from: {json_file_path}")
print("Sample record from data:")
print(data[0])  # Print the first record

Embedding data loaded from: cleaned_data2.json
Sample record from data:
{'Chapter ID': 1, 'Paragraph ID': 1, 'Text': "1801 I have just returned from a visit to my landlord the solitary neighbour that I shall be troubled with. This is certainly a beautiful country! In all England, I do not believe that I could have fixed on a situation so completely removed from the stir of society. A perfect misanthropist's Heaven and Mr. Heathcliff and I are such a suitable pair to divide the desolation between us. A capital fellow! He little imagined how my heart warmed towards him when I beheld his black eyes withdraw so suspiciously under their brows, as I rode up, and when his fingers sheltered themselves, with a jealous resolution, still further in his waistcoat, as I announced my name.", 'Literary Devices': "Imagery (desolation, solitary setting); Irony (admiration for Heathcliff despite unease); Metaphor ('misanthropist's Heaven')", 'Themes': 'Isolation and Loneliness; Existentialism and Fatali

# 2. Split Text Column for Embedding and Store Metadata for Context
We need to embed the Text column only. For meaningful responses, the metadata columns will be used to provide additional context for the embeddings. Using LangChain's text splitter to break long paragraphs into smaller chunks.

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter for optimal token size (adjust if needed)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,   # Set based on model’s token limit
    chunk_overlap=100
)

# Split the text in each row if necessary
data_with_chunks = []
for index, row in df.iterrows():
    chunks = text_splitter.split_text(row["Text"])  # Split the "Text" column into manageable chunks
    for chunk in chunks:
        data_with_chunks.append({
            "Chapter_ID": row["Chapter_ID"],
            "Paragraph_ID": row["Paragraph_ID"],
            "Text": chunk,
            **{col: row[col] for col in df.columns if col != "Text"}  # Include other columns as metadata
        })

print(f"Total chunks created: {len(data_with_chunks)}")

# Now, data_with_chunks contains a list of dictionaries, where each dictionary includes a chunk of text along with all the associated metadata

Total chunks created: 31


# 3. Initialise and Upload Embeddings to Pinecone
Now that we have the text chunks and their metadata, we'll use OpenAI's text-embedding-ada-002 model to embed each chunk. The embeddings, along with the metadata, will be stored in a Pinecone index for fast retrieval.



In [None]:
import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

# Set up API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")

# Initialize OpenAI Embeddings
embed = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY
)

# Embed each chunk
chunk_embeddings = embed.embed_documents(chunks)
print("Embeddings generated for each chunk.")


Embeddings generated for each chunk.


In [10]:
# Print a sample of data_with_chunks to verify content before upserting
for i, record in enumerate(data_with_chunks[:5]):  # Check the first 5 entries
    print(f"Sample {i+1}:")
    print(f"Text: {record.get('Text')}")
    print(f"Metadata: {record}")


Sample 1:
Text: 1801 I have just returned from a visit to my landlord the solitary neighbour that I shall be troubled with. This is certainly a beautiful country! In all England, I do not believe that I could have fixed on a situation so completely removed from the stir of society. A perfect misanthropist’s Heaven and Mr. Heathcliff and I are such a suitable pair to divide the desolation between us. A capital fellow! He little imagined how my heart warmed towards him when I beheld his black eyes withdraw so suspiciously under their brows, as I rode up, and when his fingers sheltered themselves, with a jealous resolution, still further in his waistcoat, as I announced my name.
Metadata: {'Chapter_ID': 1, 'Paragraph_ID': 1, 'Text': '1801 I have just returned from a visit to my landlord the solitary neighbour that I shall be troubled with. This is certainly a beautiful country! In all England, I do not believe that I could have fixed on a situation so completely removed from the stir of s

# 4. Initialise and Upload Embeddings to Pinecone
Store these embeddings in a Pinecone vector database for efficient retrieval.

In [9]:
# Initialize and Connect to Pinecone
from pinecone import Pinecone, ServerlessSpec
import time

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

# Connect to Pinecone
pc = Pinecone(
    api_key = os.getenv("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: "),
    environment="us-east-1"  
)


index_name = "wuthering-heights-analysis"
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

# Embed and upsert data to Pinecone with "Text" included in metadata
for i, record in enumerate(data_with_chunks):
    text_embedding = embed.embed_documents([record["Text"]])[0]  # Embed the chunk
    metadata = {key: str(value) for key, value in record.items()}  # Include all fields, including "Text"
    index.upsert([(f"chunk-{i}", text_embedding, metadata)])


print("Uploaded embeddings to Pinecone.")




Uploaded embeddings to Pinecone.


In [12]:
# Connect to the index
index = pc.Index(index_name)
print(f"Successfully connected to Pinecone index: {index_name}")

# Optional: Check index stats
stats = index.describe_index_stats()
print(f"Index stats: {stats}")

Successfully connected to Pinecone index: wuthering-heights-analysis
Index stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 31}},
 'total_vector_count': 31}


In [11]:
# Test to Confirm number of items in the index
stats = index.describe_index_stats()
print(f"Total number of embeddings in the index: {stats['namespaces']['']['vector_count']}")

Total number of embeddings in the index: 31


# 5.  Implement Retrieval-Augmented Generation (RAG)
Now that we have the data in Pinecone, implement a retrieval function to fetch the most relevant paragraphs (or chunks) along with their metadata. This will allow the model to use the metadata as context when generating responses.///////Explanation
filter_metadata Function:

This function receives each match and the question, returning only the relevant metadata fields based on keywords in the question.
Essential fields like Chapter_ID and Paragraph_ID are always included.
Selective Metadata in retrieve_docs:

For each match, we call filter_metadata to get relevant metadata fields dynamically.
The Text column (paragraph text) is always included as part of the context, while the metadata is filtered.
Testing Output:

When testing, you’ll see only the Text, Chapter_ID, Paragraph_ID, and any additional fields (e.g., Symbolic_Geography) included as needed based on the question content.
This approach will dynamically adjust the metadata included in each retrieval result, keeping the output focused and relevant to the question.

In [10]:
# Retrieve relevant document chunks based on a question
def retrieve_docs(question):
    # Embed the question
    question_embedding = embed.embed_query(question)
    
    # Query Pinecone
    results = index.query(
        vector=question_embedding,
        top_k=5,  # Adjust as needed
        include_metadata=True
    )
    
    # Extract text and selectively filtered metadata from results
    context = []
    for match in results["matches"]:
        # Debugging: print metadata keys to inspect available fields
        print(f"Available metadata keys for this match: {match['metadata'].keys()}")
        
        # Always include the text of the match with "From 'Wuthering Heights':" prefix
        text_data = {
            "Text": f"From 'Wuthering Heights': {match['metadata'].get('Text')}",  # Prepend book title here
            "Chapter_ID": match["metadata"].get("Chapter_ID"),
            "Paragraph_ID": match["metadata"].get("Paragraph_ID"),
            # Include other metadata fields as needed
        }
        
        # Optionally filter other metadata based on the question
        #filtered_metadata = filter_metadata(match, question)  # Assuming filter_metadata is defined to be context-sensitive
        #text_data.update(filtered_metadata)
        
        # Append the combined text and metadata to the context list
        context.append(text_data)
    
    return context


# Step 6: Define RAG and Initialize LangChain Agent with Metadata
To ensure that the retrieved metadata (such as Chapter_ID and Paragraph_ID) is included in the response, use a structured prompt with LangChain to integrate the metadata contextually.

In [11]:
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent, Tool

# Initialize memory for chat history
memory = ConversationBufferMemory(memory_key="chat_history", input_key="input")

# Define metadata columns for the context to help the model understand available fields
context_columns = ", ".join([
    "Themes", "Symbolism", "Historical Cultural Context", "Mood", "Tone", 
    "Psychological Analysis", "Narrative Techniques", "Intertextual References and Allusions", 
    "Social and Economic Context", "Language and Diction Analysis", "Symbolic Geography", 
    "Literary Theories Applied", "Character Development and Relationships", 
    "Narrative Structure and Pacing", "Thematic Connections Across Chapters", 
    "Philosophical and Ethical Dimensions", "Comparative Literature and Historical Reception"
])



# Define a prompt template that incorporates metadata for contextual answers
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "You are a literary expert and scholar in Gothic literature specializing in the book 'Wuthering Heights' by Emily Brontë. "
        "All questions are about 'Wuthering Heights', and you should provide detailed answers based on the text of this book. "
        "Use specific text excerpts, literary devices, and analytical insights from the novel, "
        "and include any relevant context such as chapter and paragraph details, symbolic geography, or cultural references. "
        "Do not mention repeatedly the columns you are using to retrieve the data."
        "Do not mention the columns you are using to retrieve the data. Do not mention the subheadings in the columns, instead use the information to summarize the subheadings."
        "Focus on synthesizing the metadata for a rich analysis rather than merely listing it. "
        "Refer to these metadata fields when relevant: {context_columns}."
    ),
    HumanMessagePromptTemplate.from_template(
        "{context}\n\n"
        "User's Question: {question}\n"
        "Your Answer:"
    )
])

# Initialize the OpenAI chat model
llm = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)

# Define a document retrieval tool for the agent
tools = [Tool(name="retrieve_docs", func=retrieve_docs, description="Retrieves relevant text chunks and metadata.")]

# Initialize the LangChain agent with the updated prompt template and memory
agent = initialize_agent(
    tools=tools, 
    llm=llm, 
    agent="zero-shot-react-description", 
    memory=memory, 
    verbose=True,
    prompt=prompt_template
)

print("Agent initialized with enhanced prompt and retrieval.")


  memory = ConversationBufferMemory(memory_key="chat_history", input_key="input")
  llm = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)


Agent initialized with enhanced prompt and retrieval.


  agent = initialize_agent(


# Test the Agent with Full Pipeline
Finally, test the entire pipeline with an example question to verify that the agent retrieves the relevant text and metadata and incorporates this information in its response

In [20]:
# Define a sample question
sample_question = (
    "In Chapter 1 of Wuthering Heights, analyze the role of women "
    "Please integrate information from the main text, along with any relevant insights from the 'Mood', 'Tone', 'Themes', "
    "and 'Historical Context' columns to provide a comprehensive analysis."
)
# Retrieve context and generate a response from the agent
context_chunks = retrieve_docs(sample_question)
formatted_context = "\n\n".join([
    f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}" for chunk in context_chunks
])

response = agent({"input": sample_question, "context": formatted_context})

# Print the response to verify if it includes metadata
print("Generated Response:", response["output"])


Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophi

  response = agent({"input": sample_question, "context": formatted_context})


[32;1m[1;3mTo answer this question, I need to retrieve and analyze information from various sources: the main text of Wuthering Heights Chapter 1, and the 'Mood', 'Tone', 'Themes', and 'Historical Context' columns. 
Action: retrieve_docs
Action Input: ["Wuthering Heights Chapter 1", "Wuthering Heights Mood", "Wuthering Heights Tone", "Wuthering Heights Themes", "Wuthering Heights Historical Context"][0mAvailable metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'

In [21]:
sample_question = (
    "In Chapter 1 of Wuthering Heights, how is Heathcliff portrayed? "
    "Please integrate information from the main text, along with any relevant insights from the 'Mood', 'Tone', 'Themes', "
    "and 'Historical Context' columns to provide a comprehensive analysis."
    "Do not mention repeatedly the columns you are using to retrieve the data."
    "Do not mention the columns you are using to retrieve the data. Do not mention the subheadings in the columns, instead use the information to summarize the subheadings."
)

# Retrieve context chunks and format for input
context_chunks = retrieve_docs(sample_question)
formatted_context = "\n\n".join([
    f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}"
    for chunk in context_chunks
])

# Ask the question and print the response
response = agent({"input": sample_question, "context": formatted_context})
print("Generated Response:", response["output"])


Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophi

In [119]:
# Define a sample question
sample_question = (
    "In Chapter 1 of Wuthering Heights, what do the dogs symbolise? "
    "Please integrate information from the main text, along with any relevant insights from the metadate columns to provide a comprehensive analysis."
    "Please do not mention which columns you are using to retrieve the data, or the subheadings in the columns. Only use the information to summarize the subheadings."
)
# Retrieve context and generate a response from the agent
context_chunks = retrieve_docs(sample_question)
formatted_context = "\n\n".join([
    f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}" for chunk in context_chunks
])

response = agent({"input": sample_question, "context": formatted_context})

# Print the response to verify if it includes metadata
print("Generated Response:", response["output"])

Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophi

In [116]:
# Define a sample question
sample_question = (
    "In Chapter 1 of Wuthering Heights, what is the importance of nature and the landscape and why? "
    "Check other columns for metadata before answering "
)
# Retrieve context and generate a response from the agent
context_chunks = retrieve_docs(sample_question)
formatted_context = "\n\n".join([
    f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}" for chunk in context_chunks
])

response = agent({"input": sample_question, "context": formatted_context})

# Print the response to verify if it includes metadata
print("Generated Response:", response["output"])

Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophi

# 7. Set Up Gradio for the User Interface
This Gradio setup allows users to ask questions via text or voice, retrieves relevant context from Pinecone, and generates a response through the LangChain agent.

In [12]:
import gradio as gr
import whisper
import os
from pydub import AudioSegment
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, Tool

# Initialize memory for follow-up questions
memory = ConversationBufferMemory(memory_key="chat_history", input_key="input")

# Load the Whisper model for speech recognition
asr_model = whisper.load_model("base")

# Function to transcribe audio input
def transcribe_audio(audio):
    print("Audio path received:", audio)  # Debugging output
    
    # Check if the file exists
    if not os.path.exists(audio):
        print("Error: Audio file does not exist at the provided path.")
        return "Audio file not found."
    
    # Convert audio to WAV format with 16000 Hz
    audio_wav_path = "voice.wav"
    sound = AudioSegment.from_file(audio)
    sound = sound.set_frame_rate(16000).set_channels(1)  # Ensure 16000 Hz and mono
    sound.export(audio_wav_path, format="wav")
    
    # Transcribe the converted audio file
    try:
        transcription = asr_model.transcribe(audio_wav_path)["text"]
        print("Transcription result:", transcription)  # Debugging output
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return "Error in transcription."

# Function to generate haunting audio output using "mainvoice.mp3" as a base
def generate_haunting_audio(output_path="response_audio.wav"):
    # Load the base audio file "mainvoice.mp3"
    base_audio_path = "mainvoice.mp3"  # Use your existing mainvoice.mp3
    if not os.path.exists(base_audio_path):
        print("Error: Base audio file mainvoice.mp3 not found.")
        return None

    # Load the audio and apply modifications
    sound = AudioSegment.from_file(base_audio_path, format="mp3")
    
    # Apply pitch adjustment and speed up for haunting effect
    haunting_sound = sound._spawn(sound.raw_data, overrides={
        "frame_rate": int(sound.frame_rate * 0.8)  # Lower pitch
    }).set_frame_rate(int(sound.frame_rate * 1.1))  # Slight speed increase
    
    # Increase volume
    haunting_sound = haunting_sound + 8
    
    # Export the haunting sound to the specified output path
    haunting_sound.export(output_path, format="wav")
    return output_path

# Updated ask_agent function to handle both text and audio input, with explicit memory updating and debugging
def ask_agent(question=None, audio=None, memory=None):
    if audio:
        question = transcribe_audio(audio)
        if question.startswith("Error"):
            return question, None  # Return error message if transcription fails
    
    # Ensure there's a question to process
    if not question:
        return "No question provided. Please enter a question or provide a valid audio input.", None
    
    # Retrieve context chunks based on the question
    context_chunks = retrieve_docs(question)
    formatted_context = "\n\n".join([
        f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}"
        for chunk in context_chunks
    ])

    # Load chat history from memory and add it to the prompt
    chat_history = memory.load_memory_variables({}).get("chat_history", "")
    prompt = f"{chat_history}\n\nUser: {question}\nAI:"
    
    # Generate response from agent
    response = agent({"input": question, "context": formatted_context})
    answer = response["output"]
    follow_up_prompt = "\n\nIs there anything else you'd like to ask about *Wuthering Heights*?"

    # Update memory with the new question and answer
    if memory:
        memory.chat_memory.add_user_message(question)
        memory.chat_memory.add_ai_message(answer)
    
    # Debugging: Print the updated memory to verify
    print("Memory content after response:", memory.load_memory_variables({}))
    
    # Generate haunting audio response
    audio_path = generate_haunting_audio()
    return answer + follow_up_prompt, audio_path

# Function to save conversation history to a file
def save_conversation(memory):
    chat_history = memory.load_memory_variables({}).get("chat_history", "")
    with open("conversation_history.txt", "w") as file:
        if isinstance(chat_history, str):
            file.write(chat_history + "\n")
        elif isinstance(chat_history, list):
            for turn in chat_history:
                file.write(f"User: {turn['input']}\n")
                file.write(f"Bot: {turn['output']}\n\n")
    return "conversation_history.txt"

# Updated Gradio handler functions to ensure inputs are correctly passed and audio output is handled
def handle_text_input(question):
    print("Text input received in Gradio:", question)  # Debugging output
    return ask_agent(question=question, memory=memory)

def handle_audio_input(audio):
    print("Audio input received in Gradio:", audio)  # Debugging output
    return ask_agent(audio=audio, memory=memory)

# Gradio Interface with CSS Customization
with gr.Blocks(css=r"""
    /* Full-page background settings */
    .gradio-container {
        background-image: url('https://i.postimg.cc/qM88XSTr/Scary-Gothic-Landscape.jpg');
        background-size: cover;
        background-attachment: fixed;
        color: #F8F8FF;
        text-align: center;
        font-family: 'UnifrakturCook', 'Georgia', serif; /* Gothic font for the entire page */
    }

    /* Main heading styling */
    .main-heading {
        font-size: 3.5em; /* Larger font size */
        font-family: 'UnifrakturCook', 'Georgia', serif;
        color: #F8F8FF;
        text-shadow: 3px 3px 5px #000000; /* Strong shadow for a spooky effect */
        margin-top: 20px;
    }

    /* Subheading styling */
    .subheading {
        font-size: 3.2em; /* Larger subheading */
        font-family: 'UnifrakturMaguntia', 'Georgia', serif;
        color: #F8F8FF;
        text-shadow: 2px 2px 4px #000000;
        margin-bottom: 20px;
        font-style: italic;
    }

    /* Prompt text */
    .prompt-text {
        color: #F8F8FF;
        font-family: 'Georgia', serif;
        font-size: 1.2em;
        margin-top: 10px;
        margin-bottom: 20px;
    }

    /* Input and output text boxes */
    .gradio-input, .gradio-output {
        background-color: rgba(50, 50, 50, 0.5); /* Semi-transparent for background visibility */
        color: #F8F8FF; /* Black text for readability */
        border: 2px solid #8B0000;
        border-radius: 8px;
        padding: 10px;
        font-size: 1.1em;
        font-family: 'Georgia', serif;
    }

    /* Smaller button styling */
    .gradio-button {
        background-color: #4B0082;
        color: #F8F8FF;
        border-radius: 6px;
        border: 1px solid #8B0000;
        padding: 6px 12px;
        font-size: 0.9em;
        font-weight: bold;
    }

    /* Customizing the audio input section */
    .gradio-audio {
        max-width: 70%; /* Restrict width for audio input */
        margin: auto; /* Center align */
    }
""") as demo:
    # Main Heading
    gr.Markdown("### Wuthering Heights Book Literary Analysis", elem_classes="main-heading")
    
    # Subheading
    gr.Markdown("### Graveyard of Echoes", elem_classes="subheading")
    gr.Markdown("Engage in literary discourse that resonates beyond time. Debate interpretations, explore academic critiques, and unearth new understandings from the silent voices of those who haunt gothic literature. Links to the graveyard will guide you to archival discussions, past enquiries, and analyses left by others who sought meaning in the shadows.", elem_classes="prompt-text")
    
    gr.Markdown("Ask a question about the book, and get an answer based on public reviews.", elem_classes="prompt-text")
    
    # Define input and output components with custom classes
    text_input = gr.Textbox(label="Enter your question about Wuthering Heights", elem_classes="gradio-input")
    audio_input = gr.Audio(type="filepath", label="Or record your question")  # Set max-width in CSS for smaller input
    text_output = gr.Textbox(label="Answer", elem_classes="gradio-output")
    audio_output = gr.Audio(label="Haunting Voice Answer", type="filepath")

    # Link input components to the updated handler functions
    text_input.submit(handle_text_input, inputs=text_input, outputs=[text_output, audio_output])
    audio_input.change(handle_audio_input, inputs=audio_input, outputs=[text_output, audio_output])

    # Save conversation button and file download
    save_button = gr.Button("Save Conversation")
    download_link = gr.File(label="Download Conversation")

    # Link save button to saving conversation


    # Link save button to saving conversation
    def save_conversation_callback():
        print("Saving conversation...")  # Debugging output
        print("Memory content at save:", memory.load_memory_variables({}))  # Check memory content
        conversation_path = save_conversation(memory) 
        return conversation_path
    
    # Trigger conversation saving
    save_button.click(save_conversation_callback, outputs=download_link)

# Launch the Gradio app
demo.launch()


  checkpoint = torch.load(fp, map_location=device)


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Text input received in Gradio: in Wuthering Heights, Chapter 1, please explain the role of women
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applie

  response = agent({"input": question, "context": formatted_context})


[32;1m[1;3mI should find the text of Wuthering Heights Chapter 1 and then analyze the roles of women in that chapter.
Action: retrieve_docs
Action Input: Wuthering Heights Chapter 1[0mAvailable metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_Allusions', 'Language_and_Diction_Analysis', 'Literary_Devices', 'Literary_Theories_Applied', 'Mood', 'Narrative_Structure_and_Pacing', 'Narrative_Techniques', 'Paragraph_ID', 'Philosophical_and_Ethical_Dimensions', 'Psychological_Analysis', 'Social_and_Economic_Context', 'Symbolic_Geography', 'Symbolism', 'Text', 'Thematic_Connections_Across_Chapters', 'Themes', 'Tone'])
Available metadata keys for this match: dict_keys(['Chapter_ID', 'Character_Development_and_Relationships', 'Comparative_Literature_and_Historical_Reception', 'Historical_Cultural_Context', 'Intertextual_References_and_

Summary
Loaded Excel Data: Loaded the Excel file with 22 columns.
Converted to JSON: Saved the data as JSON for storage.
Split Text and Metadata: Split the Text column into chunks and attached metadata.
Embedded Data: Used OpenAI embeddings for the Text column and stored them in Pinecone.
Implemented Retrieval: Retrieved relevant paragraphs using metadata to provide context.
LangChain Agent Setup: Created a LangChain agent that retrieves text and metadata for answer generation.
Tested Pipeline: Ensured that responses included references to Chapter_ID and Paragraph_ID.
This setup provides a flexible foundation for querying and analyzing Wuthering Heights with all the required metadata.

# Linted Code

In [None]:
# Project 3: Multimodal AI Chatbot for Wuthering Heights Literary Analysis

# 1. Data Preprocessing

import pandas as pd
import json
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the Excel file
df = pd.read_excel("Wuthering_Heights_Chapter1_Complete_Analysis_1.xlsx")

# Replace dashes and spaces in column names with underscores
df.columns = [col.replace(" ", "_").replace("-", "_") for col in df.columns]

# Display the first few rows for verification
print(df.head())

# Define a function to clean text by replacing special characters
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.replace("‘", "'").replace("’", "'").replace("“", '"').replace("”", '"')
    text = text.replace("—", "-").replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("\u2013", "-").replace("\u2014", "-")
    text = text.replace("\u2026", "...")
    return re.sub(r'[^\x00-\x7F]+', ' ', text)

# Columns to clean
text_columns = [
    "Text", "Literary_Devices", "Themes", "Symbolism", 
    "Historical_Cultural_Context", "Mood", "Tone", "Psychological_Analysis",
    "Narrative_Techniques", "Intertextual_References_and_Allusions", 
    "Social_and_Economic_Context", "Language_and_Diction_Analysis", 
    "Symbolic_Geography", "Literary_Theories_Applied", 
    "Character_Development_and_Relationships", "Narrative_Structure_and_Pacing", 
    "Thematic_Connections_Across_Chapters", "Philosophical_and_Ethical_Dimensions",
    "Comparative_Literature_and_Historical_Reception"
]

# Clean the text in each specified column
for col in text_columns:
    df[col] = df[col].apply(clean_text)

# Convert the cleaned data to JSON format
data_records = df.to_dict(orient="records")

# Save to JSON
with open("cleaned_data.json", "w") as f:
    json.dump(data_records, f, indent=4)

print("Data successfully converted to JSON.")

# Load and verify the JSON data
json_file_path = "cleaned_data.json"
with open(json_file_path, "r") as f:
    data = json.load(f)

print(f"Embedding data loaded from: {json_file_path}")
print("Sample record from data:")
print(data[0])

# 2. Split Text Column for Embedding and Store Metadata for Context

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # Adjust based on model’s token limit
    chunk_overlap=100
)

# Split the text into chunks and store associated metadata
data_with_chunks = []
for _, row in df.iterrows():
    chunks = text_splitter.split_text(row["Text"])
    for chunk in chunks:
        data_with_chunks.append({
            "Chapter_ID": row["Chapter_ID"],
            "Paragraph_ID": row["Paragraph_ID"],
            "Text": chunk,
            **{col: row[col] for col in df.columns if col != "Text"}
        })

print(f"Total chunks created: {len(data_with_chunks)}")

# 3. Initialize and Upload Embeddings to Pinecone

import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

# Setup API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")

# Initialize OpenAI Embeddings
embed = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# Generate embeddings for each text chunk
chunk_embeddings = embed.embed_documents([record["Text"] for record in data_with_chunks])
print("Embeddings generated for each chunk.")

# Connect to Pinecone and upsert embeddings

from pinecone import Pinecone, ServerlessSpec
import time

# Configure Pinecone
spec = ServerlessSpec(cloud="aws", region="us-east-1")
pc = Pinecone(api_key=PINECONE_API_KEY, environment="us-east-1")

index_name = "wuthering-heights-analysis"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create index if it does not exist
if index_name not in existing_indexes:
    pc.create_index(index_name, dimension=1536, metric="cosine", spec=spec)
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

# Upsert embeddings with metadata
for i, record in enumerate(data_with_chunks):
    metadata = {key: str(value) for key, value in record.items()}
    index.upsert([(f"chunk-{i}", chunk_embeddings[i], metadata)])

print("Uploaded embeddings to Pinecone.")

# 4. Retrieval Test Using a Sample Query

def retrieve_docs(question):
    question_embedding = embed.embed_query(question)
    results = index.query(
        vector=question_embedding,
        top_k=5,
        include_metadata=True
    )

    context = []
    for match in results["matches"]:
        text_data = {
            "Text": f"From 'Wuthering Heights': {match['metadata'].get('Text')}",
            "Chapter_ID": match["metadata"].get("Chapter_ID"),
            "Paragraph_ID": match["metadata"].get("Paragraph_ID"),
        }
        context.append(text_data)
    
    return context

# 5. Implement Retrieval-Augmented Generation (RAG)

from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent, Tool

memory = ConversationBufferMemory(memory_key="chat_history", input_key="input")

# Define a prompt template
context_columns = ", ".join([
    "Themes", "Symbolism", "Historical Cultural Context", "Mood", "Tone", 
    "Psychological Analysis", "Narrative Techniques", "Intertextual References and Allusions", 
    "Social and Economic Context", "Language and Diction Analysis", "Symbolic Geography", 
    "Literary Theories Applied", "Character Development and Relationships", 
    "Narrative Structure and Pacing", "Thematic Connections Across Chapters", 
    "Philosophical and Ethical Dimensions", "Comparative Literature and Historical Reception"
])

prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "You are a literary expert in Gothic literature specializing in 'Wuthering Heights'. "
        "Provide detailed answers based on text, literary devices, and insights. "
        "Include metadata like chapter and paragraph details when relevant. "
        "Use these metadata fields: {context_columns}."
    ),
    HumanMessagePromptTemplate.from_template("{context}\n\nUser's Question: {question}\nYour Answer:")
])

# Initialize OpenAI chat model and LangChain agent
llm = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)
tools = [Tool(name="retrieve_docs", func=retrieve_docs, description="Retrieves relevant text chunks and metadata.")]

agent = initialize_agent(
    tools=tools, 
    llm=llm, 
    agent="zero-shot-react-description", 
    memory=memory, 
    verbose=True,
    prompt=prompt_template
)

print("Agent initialized.")

# Sample question and test
sample_question = "In Chapter 1 of Wuthering Heights, analyze the role of women and include 'Mood' and 'Historical Context'."
context_chunks = retrieve_docs(sample_question)
formatted_context = "\n\n".join([f"Chapter {chunk['Chapter_ID']}, Paragraph {chunk['Paragraph_ID']}: {chunk['Text']}" for chunk in context_chunks])

response = agent({"input": sample_question, "context": formatted_context})
print("Generated Response:", response["output"])

# 6. Set Up Gradio for the User Interface
import gradio as gr
import whisper
from pydub import AudioSegment

# Load Whisper model
asr_model = whisper.load_model("base")

# Function to transcribe audio
def transcribe_audio(audio):
    if not os.path.exists(audio):
        return "Audio file not found."
    sound = AudioSegment.from_file(audio)
    sound = sound.set_frame_rate(16000).set_channels(1)
    audio_wav_path = "voice.wav"
    sound.export(audio_wav_path, format="wav")
    return asr_model.transcribe(audio_wav_path)["text"]

# Gradio Interface Setup
with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Ask a question about Wuthering Heights.")
    audio_input = gr.Audio(type="filepath", label="Or record your question")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Haunting Voice Answer", type="filepath")

    text_input.submit(lambda q: agent({"input": q, "context": formatted_context}), text_input, [text_output, audio_output])

demo.launch()
