<a href="https://colab.research.google.com/github/4kananya/chat-pdf/blob/main/ChatPDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SENTIMENT ANALYSIS

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources this cell relies on: the punkt tokenizer models,
# the stopword corpus, and the VADER lexicon.
for _resource in ('punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(_resource)

# Sample customer reviews to classify
reviews = [
    "The product is amazing! I love it.",
    "I had a terrible experience with this product. It's not worth the money.",
    "This service exceeded my expectations. Highly recommended!",
]

# One shared VADER analyzer instance, reused for every review
sid = SentimentIntensityAnalyzer()

# Function to preprocess text
# English stopword set, built on first use and then reused by every call
_STOPWORD_SETS = {}


def _english_stopwords():
    """Return the English stopword set, reading the corpus only on first use."""
    if 'english' not in _STOPWORD_SETS:
        _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_SETS['english']


def preprocess_text(text):
    """Lowercase, tokenize and strip English stopwords from ``text``.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.

    NOTE(review): NLTK's English stopword list appears to include negations
    such as "not" -- confirm before feeding this output to a sentiment model.
    """
    # The stopword set is loop-invariant; reuse the cached copy instead of
    # re-reading the corpus on every call.
    stop_words = _english_stopwords()
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

# Perform sentiment analysis for each review
for review in reviews:
    # Score the raw review text. VADER's rules rely on negations ("not"),
    # intensifiers ("very"), punctuation and capitalization; lowercasing and
    # stopword removal strip those cues and can flip or weaken the score
    # (e.g. "not worth the money" would be scored on "worth money").
    sentiment_scores = sid.polarity_scores(review)
    compound = sentiment_scores['compound']
    # Conventional VADER cut-offs: scores within +/-0.05 count as neutral
    if compound >= 0.05:
        sentiment_label = 'Positive'
    elif compound <= -0.05:
        sentiment_label = 'Negative'
    else:
        sentiment_label = 'Neutral'
    # Print results
    print("Review:", review)
    print("Sentiment:", sentiment_label)
    print("Sentiment Scores:", sentiment_scores)
    print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Review: The product is amazing! I love it.
Sentiment: Positive
Sentiment Scores: {'neg': 0.0, 'neu': 0.108, 'pos': 0.892, 'compound': 0.8516}

Review: I had a terrible experience with this product. It's not worth the money.
Sentiment: Negative
Sentiment Scores: {'neg': 0.344, 'neu': 0.444, 'pos': 0.211, 'compound': -0.296}

Review: This service exceeded my expectations. Highly recommended!
Sentiment: Positive
Sentiment Scores: {'neg': 0.0, 'neu': 0.626, 'pos': 0.374, 'compound': 0.3367}



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# ANSWER FROM PDF

## Packages Installed
The code installs the following packages:

1. `langchain==0.0.184`: A framework for building applications on top of large language models (chains, memory, retrievers and vector-store integrations), version 0.0.184.
2. `PyPDF2==3.0.1`: A library for reading and manipulating PDF files, version 3.0.1.
3. `python-dotenv==1.0.0`: A library for managing environment variables through `.env` files, version 1.0.0.
4. `streamlit==1.18.1`: A popular library for building interactive web applications with Python, version 1.18.1.
5. `openai==0.27.6`: The OpenAI Python client library, version 0.27.6.
6. `faiss-cpu==1.7.4`: A library for efficient similarity search and clustering of dense vectors, CPU version 1.7.4.
7. `altair==4`: A declarative statistical visualization library for Python, version 4.
8. `tiktoken==0.4.0`: OpenAI's fast byte-pair-encoding (BPE) tokenizer, version 0.4.0.
9. `huggingface-hub==0.14.1`: A library for accessing models and datasets provided by Hugging Face, version 0.14.1.
10. `InstructorEmbedding==1.0.1`: The package that provides the INSTRUCTOR instruction-tuned text-embedding models (used below via `hkunlp/instructor-xl`), version 1.0.1.
11. `sentence-transformers==2.2.2`: A library for computing vector representations of text using transformer models, version 2.2.2.


In [None]:
# Install the pinned dependency set for the PDF question-answering app defined below.
# NOTE(review): `%pip install` is generally preferred over `!pip install` in notebooks
# because it targets the running kernel's environment -- confirm before switching.
!pip install langchain==0.0.184 \
             PyPDF2==3.0.1 \
             python-dotenv==1.0.0 \
             streamlit==1.18.1 \
             openai==0.27.6 \
             faiss-cpu==1.7.4 \
             altair==4 \
             tiktoken==0.4.0 \
             huggingface-hub==0.14.1 \
             InstructorEmbedding==1.0.1 \
             sentence-transformers==2.2.2



In [None]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code access tokens in a notebook: they end up in version control
# and in shared copies. Read the token from the environment, or prompt for it
# without echoing. The token previously committed in this cell should be
# revoked in the Hugging Face account settings.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader`` (one per PDF).

    Returns:
        A single string; pages are joined with no separator between them.
    """
    page_texts = (
        page.extract_text()
        for document in pdf_docs
        for page in PdfReader(document).pages
    )
    return "".join(page_texts)

In [None]:
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split ``text`` into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text: The full document text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Length shared between consecutive chunks.
        separator: String on which the text is split before chunks are built.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``
        (presumably a list of strings -- confirm against the installed version).
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return text_splitter.split_text(text)

In [None]:
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-xl"):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty sequence of strings to embed.
        model_name: Model identifier passed to ``HuggingFaceInstructEmbeddings``.

    Returns:
        The store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    if not text_chunks:
        # Fail early with a clear message, before the embedding model is
        # loaded; an index cannot be built from zero texts.
        raise ValueError("text_chunks is empty: no text available to index")
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

In [None]:
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over ``vectorstore``.

    The chain combines a ``HuggingFaceHub`` LLM (``google/flan-t5-xxl``), a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key, and
    the retriever obtained from ``vectorstore.as_retriever()``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The object returned by ``ConversationalRetrievalChain.from_llm``.
    """
    generation_settings = {"temperature": 0.5, "max_length": 512}
    language_model = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs=generation_settings,
    )
    history = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    doc_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=doc_retriever,
        memory=history,
    )

In [None]:
def main():
    """Render the Streamlit page: question box, PDF uploader and processing.

    Initializes ``conversation`` and ``chat_history`` in ``st.session_state``
    on first run, forwards any typed question to ``handle_userinput``, and,
    when "Process" is clicked, builds a conversation chain from the uploaded
    PDFs and stores it in ``st.session_state.conversation``.
    """
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Session state survives Streamlit reruns; create the keys only once.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: skip the pipeline instead of indexing empty text.
                st.warning("Please upload at least one PDF before clicking 'Process'.")
                return
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)
                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                if not text_chunks:
                    # e.g. PDFs with no extractable text; there is nothing to index.
                    st.warning("No extractable text was found in the uploaded PDFs.")
                    return
                # create vector store
                vectorstore = get_vectorstore(text_chunks)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(vectorstore)

In [None]:
def handle_userinput(user_question):
    """Send a question to the active conversation chain and render the chat.

    Args:
        user_question: The text the user typed into the question box.

    Side effects:
        Stores the chain's ``chat_history`` in ``st.session_state`` and writes
        one HTML bubble per message. Even-indexed messages use
        ``user_template``, odd-indexed ones ``bot_template``.
    """
    import html  # local import keeps this cell self-contained

    conversation = st.session_state.conversation
    if conversation is None:
        # No chain exists until documents have been processed; calling None
        # would raise a TypeError.
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # Messages come from user input and model output and are rendered with
        # unsafe_allow_html=True, so escape them to prevent HTML injection.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)

In [None]:
# NOTE(review): this points Streamlit at Colab's kernel launcher rather than at a script
# that defines and calls main(). Presumably the app code must first be written to a .py
# file (e.g. with %%writefile) and that file passed here instead -- confirm.
!streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.173.27.11:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
Exception ignored in atexit callback: <function shutdown at 0x79bbb5eef880>
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 2182, in shutdown
    h.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1081, in flush
    self.acquire()
  File "/usr/lib/python3.10/logging/__init__.py", line 912, in acquire
    def acquire(self):
  File "/usr/local/lib/python3.10/dist-packages/streamlit/web/bootstrap.py", line 55, in signal_handler
    server.stop()
  File "/usr/local/lib/python3.10/dist-packages/streamlit/web/server/server.py", line 405, in stop
    self._runtime.stop()
  File "/usr/local/lib/python3.10/dist-pa

In [None]:
# Stylesheet for the chat bubbles; rendered as raw HTML, so the <style>
# element must be explicitly closed.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''

bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''