<a href="https://colab.research.google.com/github/RouaBenYahia/RAG/blob/all_types_with_the_fine_tuned/Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pwd

In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [None]:
%cd /content/drive/MyDrive/FineTuning-Rag

In [None]:
!ls

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/FineTuning-Rag')


In [None]:
# Install the third-party packages imported by the Streamlit app that a later
# cell writes to Streamlit.py (streamlit, PyPDF2, LangChain and its
# community / Hugging Face integrations, FAISS, python-docx).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install -q streamlit langchain PyPDF2 faiss-cpu tiktoken huggingface-hub
!pip install -U langchain_community langchain-huggingface
!pip install python-docx

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face access token from the Colab secret named
# 'huggingface' (so it is never written into the notebook) and use it to
# authenticate this runtime against the Hub.
api_key = userdata.get('huggingface')
login(token=api_key)


In [None]:
!npm install -g localtunnel

In [None]:
%%writefile Streamlit.py
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
#from langchain_community.vectorstores import FAISS

from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from htmltemplate import css, bot_template, user_template
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
import pandas as pd
import docx
import io
from openpyxl import load_workbook


def extract_text_from_files(uploaded_files):
    """Concatenate the text content of every uploaded file into one string.

    The file type is chosen from the (case-insensitive) file-name extension:
    ``.pdf``, ``.txt``, ``.csv``, ``.docx`` or ``.xlsx``. A file that cannot
    be read, or whose extension is not supported, is reported with
    ``st.warning`` and skipped, so the remaining files are still processed.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
            (and ``.getvalue()`` for text files). ``None`` is treated the
            same as an empty list.

    Returns:
        str: All extracted fragments (PDF pages, text files, CSV tables,
        DOCX paragraphs, XLSX rows), each followed by a newline. Empty
        string when nothing was extracted.
    """
    # Fragments are collected in a list and joined once at the end instead
    # of growing one string with repeated ``+=``.
    fragments = []

    # NOTE: every handler catches ``Exception`` rather than using a bare
    # ``except:`` so that KeyboardInterrupt / SystemExit and other
    # BaseException-derived control-flow signals still propagate.
    for uploaded_file in uploaded_files or []:
        file_name = uploaded_file.name.lower()

        # PDF: one fragment per page that yields text.
        if file_name.endswith(".pdf"):
            try:
                for page in PdfReader(uploaded_file).pages:
                    text = page.extract_text()
                    if text:
                        fragments.append(text + "\n")
            except Exception:
                st.warning(f"❌ Failed to read PDF: {uploaded_file.name}")

        # TXT: decoded as UTF-8; a badly encoded file is skipped with a
        # warning (like the other formats) instead of aborting the batch.
        elif file_name.endswith(".txt"):
            try:
                fragments.append(uploaded_file.getvalue().decode("utf-8") + "\n")
            except Exception:
                st.warning(f"❌ Failed to read TXT: {uploaded_file.name}")

        # CSV: rendered as a plain-text table without the index column.
        elif file_name.endswith(".csv"):
            try:
                df = pd.read_csv(uploaded_file)
                fragments.append(df.to_string(index=False) + "\n")
            except Exception:
                st.warning(f"❌ Failed to read CSV: {uploaded_file.name}")

        # DOCX: one fragment per paragraph.
        elif file_name.endswith(".docx"):
            try:
                for para in docx.Document(uploaded_file).paragraphs:
                    fragments.append(para.text + "\n")
            except Exception:
                st.warning(f"❌ Failed to read DOCX: {uploaded_file.name}")

        # XLSX: every row of every sheet, cells joined with " | ";
        # empty cells become empty strings.
        elif file_name.endswith(".xlsx"):
            try:
                wb = load_workbook(uploaded_file, data_only=True)
                for sheet in wb.sheetnames:
                    for row in wb[sheet].iter_rows(values_only=True):
                        row_text = " | ".join(
                            str(cell) if cell is not None else "" for cell in row
                        )
                        fragments.append(row_text + "\n")
            except Exception:
                st.warning(f"❌ Failed to read XLSX: {uploaded_file.name}")

        else:
            st.warning(f"⚠️ Unsupported file type: {uploaded_file.name}")

    return "".join(fragments)






def get_text_chuncks(text, chunk_size=1000, chunk_overlap=300, separator="\n"):
    """Split ``text`` into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text: The full text to split.
        chunk_size: Target chunk size passed to the splitter; lengths are
            measured with ``len`` (i.e. in characters). Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 300.
        separator: String the splitter cuts the text on. Defaults to a
            newline.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``
        (the list of chunks).
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)



def get_vectorstore(text_chunks):
    """Embed the text chunks and index them in a FAISS vector store.

    The embeddings come from the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, loaded on the CPU.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The vector store built by ``FAISS.from_texts``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)


def get_conversation_chain(vectorstore, model_name="rouabenyahia/FineTuningModel"):
    """Build the conversational retrieval chain used to answer questions.

    Loads a causal language model and its tokenizer from the Hugging Face
    Hub, wraps them in a text-generation pipeline, and combines that LLM
    with a retriever over ``vectorstore`` and a conversation memory.

    Args:
        vectorstore: Vector store exposing ``as_retriever``; the 5 most
            similar chunks are retrieved for each question.
        model_name: Hub id of the model/tokenizer to load. Defaults to the
            fine-tuned model this app was written for.

    Returns:
        The ``ConversationalRetrievalChain`` created by ``from_llm``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Greedy decoding (do_sample=False) already gives deterministic answers.
    # No ``temperature`` is passed: it only applies when sampling, so
    # combining it with do_sample=False is ignored and makes transformers
    # emit a generation-config warning.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=False,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    # The history is stored under 'chat_history' as message objects.
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True
    )

    # The prompt ends with "Réponse:"; handle_userinput relies on that
    # marker to cut the echoed prompt out of the generated text.
    prompt_template = """
    Tu es un assistant expert. Utilise le contexte ci-dessous pour répondre à la question de l'utilisateur.
    Si la réponse ne se trouve pas dans les documents, dis simplement que tu ne sais pas.

    Contexte:
    {context}

    Question:
    {question}

    Réponse:
    """

    prompt = PromptTemplate.from_template(prompt_template)

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt},
    )
    return conversation_chain




def handle_userinput(user_question):
    """Answer ``user_question`` and render the whole conversation.

    Warns when the retriever finds no passage for the question, runs the
    conversation chain stored in ``st.session_state.conversation``, saves
    the returned history in ``st.session_state.chat_history`` and writes
    every message with the user/bot HTML templates.

    Args:
        user_question: The question typed by the user.
    """
    # Local import: only needed to escape chat text before it is embedded
    # in the HTML templates below.
    import html

    retriever = st.session_state.conversation.retriever
    docs = retriever.get_relevant_documents(user_question)

    if len(docs) == 0:
        st.warning("⚠️ Aucun passage du document n’a été trouvé pour cette question. Essayez de reformuler.")

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        content = message.content
        # Messages alternate user / bot, so odd positions are bot answers.
        if i % 2 == 1:
            # Keep only what follows the last "Réponse:" marker, i.e. drop
            # the prompt text echoed back in the generated output.
            if "Réponse:" in content:
                content = content.split("Réponse:")[-1].strip()
            template = bot_template
        else:
            template = user_template

        # Both the user's text and the model output are untrusted and are
        # rendered with unsafe_allow_html=True, so escape them to prevent
        # HTML/script injection into the page.
        st.write(
            template.replace("{{MSG}}", html.escape(content)),
            unsafe_allow_html=True,
        )







# ---- Page setup -----------------------------------------------------------
st.set_page_config(page_title="Chat with your data", page_icon=":books:")

st.write(css, unsafe_allow_html=True)

# Create the per-session slots on the first run so later reads never fail.
for state_key in ("conversation", "chat_history"):
    if state_key not in st.session_state:
        st.session_state[state_key] = None

st.header("Chat with your data :books:")

# ---- Question box ---------------------------------------------------------
user_question = st.text_input("Ask a question about your documents")

if user_question:
    if st.session_state.conversation is None:
        # No chain yet: the documents have not been processed.
        st.warning("📄 Veuillez d'abord uploader vos PDF et cliquer sur 'Process'.")
    else:
        handle_userinput(user_question)

# ---- Sidebar: upload and processing ---------------------------------------
with st.sidebar:
    st.subheader("Your documents")
    uploaded_files = st.file_uploader(
        "Upload your files (.pdf, .txt, .csv, .docx, .xlsx) and click on 'Process'",
        type=["pdf", "txt", "csv", "docx", "xlsx"],
        accept_multiple_files=True,
    )

    if st.button("Process"):
        with st.spinner("Processing"):
            # 1. Extract the raw text of every uploaded file.
            raw_text = extract_text_from_files(uploaded_files)
            if not raw_text.strip():
                st.error("⚠️ Aucun texte n’a été extrait. Les fichiers sont peut-être vides, scannés ou illisibles.")
                st.stop()  # halts this script run, nothing below executes
            st.write(f"📄 Nombre de caractères extraits: {len(raw_text)}")

            # 2. Split the text into chunks.
            text_chunks = get_text_chuncks(raw_text)

            # 3. Embed the chunks into a vector store.
            vectorstore = get_vectorstore(text_chunks)

            # 4. Build the conversation chain and keep it for this session.
            st.session_state.conversation = get_conversation_chain(vectorstore)
        st.success("✅ The processing of the files is over, you can ask your questions now!")


In [None]:
!ls

In [None]:

# Print this runtime's public IPv4 address as reported by icanhazip.com.
# NOTE(review): presumably needed as the password on the localtunnel landing
# page opened by the next cell — confirm.
! wget -q -O - ipv4.icanhazip.com



In [None]:


# Start the Streamlit app in the background (`&`) and expose port 8501
# through localtunnel.
! streamlit run Streamlit.py & npx localtunnel --port 8501

[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.70.40.197:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0Kyour url is: https://petite-ducks-sing.loca.lt
2025-07-18 12:13:49.670538: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-18 12:13:49.689085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752840829.713234   14063 cu