In [None]:
# Use the %pip magic so the package is installed into the running kernel's environment;
# pinned to the version this notebook was last run with, so re-runs are reproducible.
%pip install streamlit==1.36.0

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.

In [None]:
# Use the %pip magic so the package is installed into the running kernel's environment.
# NOTE(review): no version was recorded for this install — pin one once it is known.
%pip install nltk



In [None]:
# Use the %pip magic so the package is installed into the running kernel's environment;
# pinned to the version this notebook was last run with, so re-runs are reproducible.
%pip install pdfplumber==0.11.1

Collecting pdfplumber
  Downloading pdfplumber-0.11.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.1 pypdfium2-4.30.0


In [None]:
%%writefile tfidf_app.py
import html

import streamlit as st
import pdfplumber
import nltk
# Streamlit re-executes this script on every user interaction, so only call the
# downloader when the Punkt sentence-tokenizer models are actually missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract text from the provided PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by Streamlit's file uploader.

    Returns:
        The per-page texts joined with newlines, so the last word of one page
        is not fused with the first word of the next. Pages that yield no text
        contribute an empty string.
    """
    # The context manager closes the document even if a page fails to parse.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None/"" for pages without a text layer
        # (e.g. scanned images); normalise both to "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)

# Split the raw text into sentences
def preprocess_text(text):
    """Tokenize ``text`` into sentences with ``nltk.sent_tokenize`` and return the result."""
    return nltk.sent_tokenize(text)

# Build the TF-IDF representation of the sentences
def get_tfidf_vectors(sentences):
    """Fit a new ``TfidfVectorizer`` on ``sentences``.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` pair: the fitted vectorizer and the
        matrix produced by ``fit_transform`` for the given sentences.
    """
    fitted_vectorizer = TfidfVectorizer()
    sentence_matrix = fitted_vectorizer.fit_transform(sentences)
    return fitted_vectorizer, sentence_matrix

# Retrieve the most relevant sentence based on the query
def retrieve_most_relevant_sentence(
    query,
    vectorizer,
    tfidf_matrix,
    sentences,
    min_score=0.0,
    fallback="Sorry, I could not find anything relevant to that in the document.",
):
    """Return the sentence whose TF-IDF vector is most similar to ``query``.

    Args:
        query: The user's question as plain text.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix with one row per entry of ``sentences``.
        sentences: The sentences, in the same order as the matrix rows.
        min_score: The best cosine similarity must be strictly greater than
            this value for a sentence to count as a match.
        fallback: Text returned when no sentence clears ``min_score``.

    Returns:
        The best-matching sentence, or ``fallback`` when nothing matches.
    """
    query_vector = vectorizer.transform([query])
    # One query row against every sentence row -> flatten to a 1-D score vector.
    scores = cosine_similarity(query_vector, tfidf_matrix).ravel()
    if scores.size == 0:
        return fallback

    best_index = int(scores.argmax())
    # When the query shares no vocabulary with the document every score is 0 and
    # argmax would pick index 0, presenting an unrelated sentence as the answer.
    if scores[best_index] <= min_score:
        return fallback
    return sentences[best_index]

# Messenger-like UI.
# Streamlit re-runs this script on every interaction. Each text box keeps its value
# under its own integer key, so replaying the loop below redraws the whole
# conversation and then adds one fresh, empty input at the bottom.
st.title("PDF Text Extraction and Query Answering")

# File upload
pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])

if pdf_file is not None:
    # Extract the text from the PDF and split it into sentences
    pdf_text = extract_text_from_pdf(pdf_file)
    sentences = preprocess_text(pdf_text)

    if not sentences:
        # E.g. a scanned PDF without a text layer: fitting TF-IDF on an empty
        # corpus would raise, so report the problem instead of crashing.
        st.error("No extractable text was found in this PDF.")
    else:
        # Get TF-IDF vectors for the sentences
        vectorizer, tfidf_matrix = get_tfidf_vectors(sentences)

        # Key of the text box currently being processed (0, 1, 2, ...)
        count = 0
        user_query = st.text_input("Enter your query:", key=count)

        # One pass per answered query; stops at the first (newest) empty text box.
        while user_query:
            answer = retrieve_most_relevant_sentence(user_query, vectorizer, tfidf_matrix, sentences)
            # Collapse the line breaks/extra spaces that PDF extraction leaves in a sentence
            retrieved_answer = " ".join(answer.split())

            # Both strings end up inside raw HTML (unsafe_allow_html=True) and come
            # from untrusted sources (the user and the uploaded PDF): escape them.
            safe_query = html.escape(user_query)
            safe_answer = html.escape(retrieved_answer)

            # User query bubble, pushed to the right-hand side (flex-end)
            st.write(
                f'<div style="display: flex; justify-content: flex-end; margin: 10px; text-align: right;">'
                f'<div style="background-color:#DCF8C6; padding:10px; border-radius:10px; max-width:70%;">'
                f'{safe_query}</div></div>',
                unsafe_allow_html=True,
            )

            # Bot response bubble, in the default (left-hand) position
            st.write(
                f'<div style="background-color:#E4E4E4; padding:10px; border-radius:10px; margin:10px; max-width:70%; text-align:right;">Bot: {safe_answer}</div>',
                unsafe_allow_html=True,
            )

            # Move on to the next text box; it is empty until the user types a follow-up
            count += 1
            user_query = st.text_input("Enter your query:", key=count)

Writing tfidf_app.py


In [None]:
# Print this machine's public IPv4 address (-q: quiet, -O -: write the response to stdout).
# NOTE(review): presumably this is the value localtunnel asks for as the tunnel password — confirm.
! wget -q -O - ipv4.icanhazip.com

34.106.176.17


In [None]:
# Start the Streamlit app in the background (`&`) and, in the foreground, open a
# localtunnel to port 8501 — the port the Streamlit server reports in its startup output.
! streamlit run tfidf_app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.176.17:8501[0m
[0m
[K[?25hnpx: installed 22 in 5.463s
your url is: https://good-insects-cross.loca.lt
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading pa

In [None]:
# Download and unpack the ngrok Linux (amd64) client archive.
# NOTE(review): no visible cell uses ngrok — the app above is exposed through localtunnel;
# confirm whether this cell is still needed.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip