In [30]:
!pip install google-generativeai pypdf requests transformers torch accelerate
import google.generativeai as genai
from google.colab import userdata, files
from pypdf import PdfReader
import requests
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [31]:
# Hugging Face Hub id of the checkpoint used to judge the RFP responses.
# NOTE(review): this is a code-oriented checkpoint, while the prompt built later asks for
# French-language grading of RFP answers — confirm it is the intended judge model.
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"  # Smaller model that runs on Colab
# Tokenizer belonging to the same checkpoint as the model below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",  # let the library choose where the weights are placed
    trust_remote_code=True  # permits code shipped in the model repo to run locally; use only with trusted repos
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [32]:
import zipfile
import os

def unzip_files(dataset_folder):
    """Extract every ZIP archive located directly inside ``dataset_folder``.

    Each archive is extracted in place, i.e. into ``dataset_folder`` itself.
    Archives are processed in sorted name order so that repeated runs behave
    the same way, and the ``.zip`` extension is matched case-insensitively so
    that files such as ``DATA.ZIP`` are not silently skipped.

    Failures are reported on stdout and do not stop the remaining archives
    from being processed (best-effort behaviour).

    Args:
        dataset_folder: Path of the directory to scan. Sub-directories are
            not searched.

    Returns:
        None.

    Raises:
        OSError: If ``dataset_folder`` cannot be listed (for example when it
            does not exist).
    """
    for filename in sorted(os.listdir(dataset_folder)):
        # Case-insensitive match: uploads frequently carry an upper-case extension.
        if not filename.lower().endswith(".zip"):
            continue

        filepath = os.path.join(dataset_folder, filename)
        try:
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(dataset_folder)
        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file")
        except Exception as e:
            # Deliberately broad: one unreadable archive must not abort the others.
            print(f"An error occurred while unzipping {filename}: {e}")
        else:
            print(f"Successfully unzipped {filename}")

# Folder that holds the uploaded .zip archives; they are extracted in place into this
# same folder. Change the path if your dataset lives elsewhere — note that the cell
# building the PDF path lists also hard-codes '/content/dataset'.
dataset_folder_path = '/content/dataset'
unzip_files(dataset_folder_path)

In [33]:
# The dataset contains 14 numbered cases. Case N lives in the folder
# "N (DCE and Answer)" and provides the RFP ("N CCTP.pdf") together with its
# answer ("N Response .pdf" — the space before ".pdf" is part of the file name).
_case_numbers = range(1, 15)

RFP_PDF_files = [
    f'/content/dataset/{case_no} (DCE and Answer)/{case_no} CCTP.pdf'
    for case_no in _case_numbers
]
RFP_response_PDF_files = [
    f'/content/dataset/{case_no} (DCE and Answer)/{case_no} Response .pdf'
    for case_no in _case_numbers
]

In [34]:
# Extracted plain text of every RFP and of every response; meant to hold one string
# per PDF, in the same order as the PDF path lists built in the previous cell.
RFP_text_files = []
RFP_response_text_files = []

In [35]:
def read_RFP_file(PDF_file):
  """Return the text of a PDF as one string.

  The file is opened with ``PdfReader`` and the text extracted from each page
  is concatenated in page order. Nothing is inserted between pages.

  Args:
    PDF_file: Whatever ``PdfReader`` accepts as its source (the notebook
      passes file paths).

  Returns:
    The concatenated page texts; an empty string when there are no pages.
  """
  pdf_pages = PdfReader(PDF_file).pages
  return "".join(pdf_page.extract_text() for pdf_page in pdf_pages)

# Rebuild both lists from scratch instead of appending to lists created in another
# cell: re-running this cell then no longer duplicates every document (which would
# also inflate the number of prompts generated later).
RFP_text_files = [read_RFP_file(PDF_file) for PDF_file in RFP_PDF_files]
RFP_response_text_files = [read_RFP_file(PDF_file) for PDF_file in RFP_response_PDF_files]

In [36]:
def get_prompt_template(RFP_file_text, RFP_response_text) :
  """Build the judging prompt for one RFP / response pair.

  Both texts are inserted verbatim into the second line of a fixed set of
  instructions that ask for a reasoned evaluation, a grade from 0 to 10 as the
  last element of the answer, and an answer written in French.

  Args:
    RFP_file_text: Text of the RFP.
    RFP_response_text: Text of the response to evaluate.

  Returns:
    The prompt string. It starts with a newline, every line is indented by two
    spaces, and it ends with a newline followed by two spaces.
  """
  # A newline plus the two-space indent goes before every line and after the last one.
  line_break = "\n  "
  instructions = (
      "You will be judging RFP and RFP response quality.",
      f"For each RFP file {RFP_file_text} and its response {RFP_response_text},",
      "you will be seeing whether the response is coherent with the RFP demand.",
      "You will be basing ur selection on these criteria :",
      "1 - Whether the response is coherent in itself",
      "2 - Whether it contains the structure of the project (and whether it is realistic or not)",
      "3 - Whether it actually responds to the demand of the RFP",
      "Give ur reasoning as u go.",
      "Give a grade from 0 to 10, 0 being the worst and 10 being the best.",
      "The grade should be the last thing in ur response.",
      "Ur answers should be in French, as our clients are all French.",
  )
  return line_break + line_break.join(instructions) + line_break

In [39]:
# Disabled export step: the commands are wrapped in a string literal, so running this
# cell only echoes the string. Remove the surrounding triple quotes to zip the dataset
# folder and download the archive.
"""!zip -r /content/dataset.zip /content/dataset
files.download("/content/dataset.zip")"""


'!zip -r /content/dataset.zip /content/dataset\nfiles.download("/content/dataset.zip")'

In [None]:
import torch
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # For progress tracking

# Module-level accumulator for the generated evaluations, in prompt order.
# It is reset to an empty list each time this cell is executed.
LLM_verdict = []

def generate_response(prompt, max_new_tokens=2000, temperature=0.7):
    """Generate the model's continuation of ``prompt`` and return it as text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Full prompt text sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature (sampling is always enabled).

    Returns:
        The generated continuation only — the prompt is not included — with
        surrounding whitespace stripped.
    """
    # Tokenize, then move the tensors to the device the model lives on; the
    # tokenizer always produces CPU tensors.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,  # forwards the attention mask together with the input ids
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Drop them by position
    # rather than by text replacement: the decoded prompt may not be identical
    # to the original string, in which case a text replace would remove nothing.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

def generate_ideas(matched_only=False):
    """Generate one evaluation per (RFP, response) pair and store it in ``LLM_verdict``.

    Prompts are processed one after another in the calling thread. Running
    several ``generate`` calls concurrently on the single shared model is
    avoided here: it increases memory use, and the calls compete for the same
    model rather than running independently.

    Each result is appended to ``LLM_verdict`` as soon as it is available, so
    results obtained before a failure or an interruption are kept.

    Args:
        matched_only: When False (default), every RFP is combined with every
            response (full cross product, RFP-major order). When True, RFP
            number i is only combined with response number i.
            NOTE(review): the prompt wording ("its response") suggests the
            matched pairing may be what is wanted — confirm before a long run.

    Returns:
        None. Results are appended to the module-level ``LLM_verdict`` list.
    """
    if matched_only:
        document_pairs = list(zip(RFP_text_files, RFP_response_text_files))
    else:
        document_pairs = [
            (RFP_file_text, RFP_response_text)
            for RFP_file_text in RFP_text_files
            for RFP_response_text in RFP_response_text_files
        ]

    prompts = [
        get_prompt_template(RFP_file_text, RFP_response_text)
        for RFP_file_text, RFP_response_text in document_pairs
    ]

    for prompt in tqdm(prompts, total=len(prompts)):
        LLM_verdict.append(generate_response(prompt))

def main():
    """Run the evaluation and print the first verdict.

    Calls ``generate_ideas()`` and then prints the first entry of
    ``LLM_verdict``, or "No responses generated" when the list is empty.
    """
    generate_ideas()
    first_verdict = LLM_verdict[0] if LLM_verdict else "No responses generated"
    print(first_verdict)

# Entry point: run the whole evaluation when this code is executed directly,
# but not when it is imported as a module.
if __name__ == "__main__":
    main()

  0%|          | 0/196 [04:54<?, ?it/s]
