In [1]:
!pip install -qU \
transformers==4.31.0 \
sentence-transformers==2.2.2 \
pinecone-client==2.2.2 \
datasets==2.14.0 \
accelerate==0.21.0 \
einops==0.6.1 \
langchain==0.0.240 \
xformers==0.0.20 \
bitsandbytes==0.41.0

In [2]:
import json
from deep_translator import GoogleTranslator

In [3]:
# Prompt language switch read by generate_prompt: True builds the English prompt, False the Spanish one.
english = True

# Initializing Hugging Face Embedding

In [4]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed the documents
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, otherwise the CPU
if cuda.is_available():
  device = f'cuda:{cuda.current_device()}'
else:
  device = 'cpu'

# The same device string is passed both for loading the model and for encoding
embed_model = HuggingFaceEmbeddings(
  model_name=embed_model_id,
  model_kwargs=dict(device=device),
  encode_kwargs=dict(device=device, batch_size=32),
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Preparing Docs

Each doc in this project is a string containing the sociodemographic features of an anonymous person, a question, and the answer given by that person. Each text looks like the following example:

The CEP National Public Opinion Survey is an academic analysis of the political, economic and social attitudes and perceptions of the Chilean population. Below is a profile with the data of one of the people surveyed last year 2023, the question asked and the answer given. Person's features: Person 24.0 years old, female. She lives in the Metropolitan region in Chile, in an urban area. Her socioeconomic level is C3 and her educational level is Complete Technical Institute. Regarding religion, she considers herself agnostic. Regarding politics, on the left-right scale she feels closer to the left, the party with which she most sympathizes is the Socialist Party (PS). Question: Next, I am going to read you the name of an institution. According to the card alternatives, how much trust do you have in the institution that I name you below? Policies. Alternatives: A lot of confidence, A lot of confidence, Little confidence, No confidence. Answer: Little trust.

In [5]:
# Load a JSON file holding the survey records.
# Expected keys per record: "prompt", "question", "alternatives", "answers"
# (and "year", which the visible cells do not read).
#
# The docs built from these records will have the following structure:
#   docs = [
#       "this is one document",
#       "and another document"
#   ]

json_name = "translated_data_89_pres.json"
with open(json_name, 'r', encoding='utf-8') as f:
  df_json = json.load(f)


First, the dataset is cleaned of undesirable data. In particular, the alternatives appear both inside the question text and under the `alternatives` key.

The alternatives from the questions will be removed

In [7]:
# Strip the alternatives text out of every question, updating each record in place
for record in df_json:
  embedded_alternatives = record["alternatives"]
  record["question"] = record["question"].replace(embedded_alternatives, '')

In [13]:
def generate_prompt(data_point, year, in_english=None):
  """Build the text document for one survey record.

  The document is a fixed introduction to the CEP survey followed by the
  respondent's profile, the question, the alternatives and the answer.

  Args:
    data_point: Mapping with the keys "prompt", "question", "alternatives"
      and "answers"; the values are interpolated into the text as-is.
    year: Survey year interpolated into the introduction.
    in_english: True for the English template, False for the Spanish one.
      When None (the default) the module-level `english` flag is used.

  Returns:
    The document as one string, with the template parts separated by
    single spaces.
  """
  use_english = english if in_english is None else in_english

  profile = data_point["prompt"]
  question = data_point["question"]
  alternatives = data_point["alternatives"]
  answer = data_point["answers"]

  # Adjacent string literals are used instead of backslash continuations inside
  # a triple-quoted string: those continuations embed the leading indentation
  # of every continued line in the generated text.
  if use_english:
    intro = (
      "The CEP National Public Opinion Survey is an academic analysis "
      "of the political, economic and social attitudes and perceptions "
      "of the Chilean population. "
      f"Below is a profile with the data of one of the people surveyed the year {year}, "
      "the question asked and the answer given."
    )
    # The stray, never-closed "[ " that preceded the alternatives is removed.
    return (
      f"{intro} Person's features: {profile}. "
      f"Question: {question}. "
      f"Alternatives: {alternatives}. "
      f"Answers: {answer}"
    )

  intro = (
    "La Encuesta Nacional de Opinión Pública CEP es un análisis académico de las actitudes y "
    "percepciones políticas, económicas y sociales de la población chilena. "
    "A continuación se presenta un perfil con los datos de una de las personas "
    f"encuestadas el año {year}, indicando datos sociodemográficos, "
    "pregunta que se le hizo a dicha persona y su respuesta."
  )
  return (
    f"{intro} Características de la persona: {profile}. "
    f"Pregunta: {question}. "
    f"Alternativas: {alternatives} "
    f"Respuesta: {answer}"
  )

Creating docs

In [15]:

print("generating docs...")
# One text document per survey record: the CEP introduction followed by the
# respondent's profile, the question, the alternatives and the answer.
docs = [generate_prompt(entrada, 2023) for entrada in df_json]

# NOTE(review): this cell used to end with a disabled snippet, kept as a bare
# string literal, that dumped `datos_modificados` to
# 'train_<n_samples>_ONLY_PRES_ENG.json'. Neither name is defined in the visible
# cells, and as the cell's last expression the literal was echoed as output, so
# it has been reduced to this note.

generating docs...


' with open(\'train_\' + str(n_samples) + \'_ONLY_PRES_ENG.json\', \'w\', encoding=\'utf-8\') as f:\n  print("Saving json")\n  json.dump(datos_modificados, f, ensure_ascii=False, indent=4) '

In [16]:

# Embed every document with the embedding model
embeddings = embed_model.embed_documents(docs)

# Report how many vectors came back and the length of the first one
n_embeddings = len(embeddings)
embedding_dim = len(embeddings[0])
print(f"We have {n_embeddings} doc embeddings, each with a dimensionality of {embedding_dim}.")

We have 1181 doc embeddings, each with a dimensionality of 384.


In [None]:
import os
import pinecone

# Read the Pinecone credentials from the environment and prompt for any that
# are missing, so that no key is stored in the notebook itself.
# The API key comes from app.pinecone.io and the environment from its console.
from getpass import getpass

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: '),
    environment=os.environ.get('PINECONE_ENVIRONMENT') or input('Pinecone environment: ')
)
     

In [None]:
import time

index_name = 'llama-2-rag'

# Create the index only when no index with this name is listed yet
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # Size the index to the length of the document embeddings
    embedding_dim = len(embeddings[0])
    pinecone.create_index(index_name, dimension=embedding_dim, metric='cosine')
    # Poll once per second until the new index reports that it is ready
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

In [None]:
# Open a handle to the index and display its current statistics.
# NOTE(review): no cell visible in this notebook upserts `docs`/`embeddings` into
# the index; confirm it is populated elsewhere before relying on retrieval.
index = pinecone.Index(index_name)
index.describe_index_stats()

# Hugging Face Pipeline

In [None]:
from torch import cuda, bfloat16
import transformers

import os
from getpass import getpass

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these.
# The token is read from the environment, or prompted for when absent, so that
# it is never stored in the notebook itself.
hf_auth = (
    os.environ.get('HUGGING_FACE_HUB_TOKEN')
    or os.environ.get('HF_TOKEN')
    or getpass('Hugging Face access token: ')
)
model_config = transformers.AutoConfig.from_pretrained(
  model_id,
  use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
# switch the model to evaluation mode before generating
model.eval()
print(f"Model loaded on {device}")

In [None]:
# The tokenizer is fetched from the same repo as the model, which needs the auth
# token, so the token is passed here as well.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_auth_token=hf_auth
)

In [None]:
# Text-generation pipeline built from the loaded model and tokenizer
generate_text = transformers.pipeline(
  task='text-generation',
  model=model,
  tokenizer=tokenizer,
  # return the prompt together with the completion (langchain expects the full text)
  return_full_text=True,
  # generation parameters are passed to the pipeline as well
  temperature=0.0,  # sampling temperature ('randomness' of the outputs)
  max_new_tokens=512,  # max number of tokens to generate in the output
  repetition_penalty=1.1  # without this the output begins repeating
)

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline in LangChain's HuggingFacePipeline LLM class
llm = HuggingFacePipeline(pipeline=generate_text)

# RAG

In [None]:
from langchain.vectorstores import Pinecone


# Metadata field of each Pinecone record that holds the document text; the
# vector store requires it as its third argument.
# NOTE(review): 'text' is the conventional field name. No upsert is visible in
# this notebook, so confirm it matches the field used when the index was filled.
text_field = 'text'

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)


from langchain.chains import RetrievalQA

# Retrieval QA chain: the vector store supplies the retriever and `llm` produces the answer
retriever = vectorstore.as_retriever()
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [None]:
# Ask the LLM directly, without the retrieval chain
llm('what is so special about llama 2?')

In [None]:
# Ask the same question through the retrieval QA chain
rag_pipeline('what is so special about llama 2?')