In [5]:
!pip install transformers==4.30.1

Collecting transformers==4.30.1
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.30.1)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers==4.30.1)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [

In [6]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.1


In [7]:
!pip install accelerate==0.21.0

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [8]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [9]:
device

'cuda:0'

In [10]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)


Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [11]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [12]:
import torch
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [13]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [14]:
!pip install xformers

Collecting xformers
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl (167.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xformers
Successfully installed xformers-0.0.21


In [15]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [16]:
# res = generate_text("Explain me the difference between Data Lakehouse and Data Warehouse.")
# print(res[0]["generated_text"])

In [17]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install langchain

Collecting langchain
  Downloading langchain-0.0.285-py3-none-any.whl (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.7 MB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.7 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langsmith<0.1.0,>=0.0.21 (from langchain)
  Downloading langsmith-0.0.35-py3-none-any.whl (37 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
# llm(prompt="Explain me the difference between Data Lakehouse and Data Warehouse.")


In [19]:
# from langchain.document_loaders import WebBaseLoader

# web_links = ["https://www.databricks.com/","https://help.databricks.com","https://databricks.com/try-databricks","https://help.databricks.com/s/","https://docs.databricks.com","https://kb.databricks.com/","http://docs.databricks.com/getting-started/index.html","http://docs.databricks.com/introduction/index.html","http://docs.databricks.com/getting-started/tutorials/index.html","http://docs.databricks.com/release-notes/index.html","http://docs.databricks.com/ingestion/index.html","http://docs.databricks.com/exploratory-data-analysis/index.html","http://docs.databricks.com/data-preparation/index.html","http://docs.databricks.com/data-sharing/index.html","http://docs.databricks.com/marketplace/index.html","http://docs.databricks.com/workspace-index.html","http://docs.databricks.com/machine-learning/index.html","http://docs.databricks.com/sql/index.html","http://docs.databricks.com/delta/index.html","http://docs.databricks.com/dev-tools/index.html","http://docs.databricks.com/integrations/index.html","http://docs.databricks.com/administration-guide/index.html","http://docs.databricks.com/security/index.html","http://docs.databricks.com/data-governance/index.html","http://docs.databricks.com/lakehouse-architecture/index.html","http://docs.databricks.com/reference/api.html","http://docs.databricks.com/resources/index.html","http://docs.databricks.com/whats-coming.html","http://docs.databricks.com/archive/index.html","http://docs.databricks.com/lakehouse/index.html","http://docs.databricks.com/getting-started/quick-start.html","http://docs.databricks.com/getting-started/etl-quick-start.html","http://docs.databricks.com/getting-started/lakehouse-e2e.html","http://docs.databricks.com/getting-started/free-training.html","http://docs.databricks.com/sql/language-manual/index.html","http://docs.databricks.com/error-messages/index.html","http://www.apache.org/","https://databricks.com/privacy-policy","https://databricks.com/terms-of-use"]

# loader = WebBaseLoader(web_links)
# documents = loader.load()

In [20]:
# documents

In [21]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
# all_splits = text_splitter.split_documents(documents)


In [22]:
# all_splits

In [23]:
# !pip install sentence_transformers

In [24]:
# !pip install faiss-gpu

In [25]:
from langchain.document_loaders import DirectoryLoader

In [26]:
def load_docs(directory):
	loader = DirectoryLoader(directory)
	documents = loader.load()
	return documents

In [27]:
def cls_pooling(model_output):
	return model_output.last_hidden_state[:,0]

In [28]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	docs = text_splitter.split_documents(documents)
	return docs

In [29]:
def encode(texts):
	encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
	with torch.no_grad():
		model_output = model(**encoded_input, return_dict=True)
	embeddings = cls_pooling(model_output)
	return embeddings

In [30]:
from transformers import AutoTokenizer, AutoModel,AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [32]:
path = "/content/Data"

In [34]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.10.13-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: filetype, python-magic, emoji, unstructured
Successfully installed emoji-2.8.0 filetype-1.2.0 python-magic-0.4.27 unstructured-0.10.13


In [35]:
!pip install unstructured[pdf]

Collecting pdf2image (from unstructured[pdf])
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Collecting pdfminer.six (from unstructured[pdf])
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured-inference (from unstructured[pdf])
  Downloading unstructured_inference-0.5.25-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting layoutparser[layoutmodels,tesseract] (from unstructured-inference->unstructured[pdf])
  Downloading layoutparser-0.3.4-py3-none-any.whl (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-multipart (from unstructured-inference->unstructured[pdf])
  Downloading python_multipart-0.0.6-py3-none-any.whl

In [36]:
documents = load_docs(path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [37]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
all_splits = split_docs(documents)

In [40]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/86.0 kB[0m [31m732.6 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_tran

In [42]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [43]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

In [44]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [45]:
chat_history = []

query = "What is stress procedures"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Stress procedures are methods used to induce stress in mice for research purposes. They include restraint stress, chronic unpredictable stress, and other stressors such as cage tilt, isolation, damp bedding, rapid light and dark changes, overnight illumination, and changing the cage. These procedures can be used to study the effects of stress on various physiological systems, including the immune system, nervous system, and metabolism.


In [46]:
print(result['source_documents'])

[Document(page_content='e n\n\ni l\n\na S\n\nStress\n\nNA\n\nX T R\n\nUnpigmented hair', metadata={'source': '/content/Data/Hyperactivation of sympathetic nerves (1).pdf'}), Document(page_content='Stress procedures Restraint and chronic unpredictable stress procedures were performed as previously described11–14. In brief, for restraint stress, C57BL/6J mice were kept in a restrainer (Thermo Fisher Scientific 12972590) for four hours per day for five days starting from mid-anagen (P28– P30). Hairs were depilated to induce hair regeneration when their hair cycle reached telogen. Mice were stressed and depilated four rounds in total to monitor long-term changes. For chronic unpredictable stress, C57BL/6J mice were exposed to a combination of stressors. Two of the stressors were applied each day. The stressors included cage tilt, isolation, damp bedding, rapid light and dark changes, overnight illu- mination, restraint, empty cage and changing the cage three times. All stressors were rando

In [47]:
chat_history = []

query = "What is Cavinkare"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 I can't answer this question because there isn't enough information provided in the given text to determine what "Cavinkare" is. The text only mentions "AAE" and "Placebo," which are likely the names of products or ingredients used in a study about hair growth. Without more context or information about these products, it's impossible to determine what "Cavinkare" might be.


In [48]:
chat_history = []

query = "Who is Sivakarthikeyan"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Sivakarthikeyan is not mentioned in the provided text as one of the authors or researchers involved in the study of human hair follicle pigmentation.


In [49]:
chat_history = []

query = "What is Chatgpt"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Thank you for providing the text! Unfortunately, I cannot answer your question as there is no clear mention of a specific gene or protein mentioned in the provided text. Could you please provide more context or clarify which gene or protein you would like me to help you find information about?


In [50]:
chat_history = []

query = "what is Origin of hair follicle melanocytes"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 The origin of hair follicle melanocytes is not explicitly stated in the provided text, but it can be inferred based on the information provided. According to the text, the study of pigmentation in the 1950s led to a better understanding of the cellular and molecular basis of hair follicle pigmentation. This suggests that the study of hair follicle melanocytes began in the mid-20th century, and has continued to advance our understanding of their development, function, and regulation.


In [51]:
print(result['source_documents'])

[Document(page_content='It was not until the 1950s that the study of pig- follicle mentation of epidermis and hair received rigorous attention with respect to its cellular and molecular bases. Around this time, pivotal descriptions of the ‘epidermal melanin unit’ and the dissection of elements of the melanin (melanogenesis) were biosynthetic\n\nstudies\n\npathway\n\nthe\n\nto\n\nthat much of\n\nOrigin of hair follicle melanocytes', metadata={'source': '/content/Data/Human hair pigmentation   biological aspects (1).pdf'}), Document(page_content='Conclusion\n\nNow that we have overcome signiﬁcant technical hurdles in studying hair follicle melanocytes in culture, the future looks bright for quick progress in the study of the regulation of hair follicle pig- mentation. Prominent amongst these will be the study of the ageing hair follicle and a full explora- tion of the maximal potentiality of the various hair follicle melanocyte subpopulations.\n\nsuggests\n\nthe\n\nsequential\n\nthat\n\n

In [52]:
chat_history = []

query = "What are Drug treatments?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Based on the given text, "Drug treatments" refers to various medications used to treat medical conditions. The text mentions several drugs, including RTX, amitriptyline, doxepin, maprotiline, metoclopromide, carbamazeoine, clorprothixene, diclofenac, and indomethacin. These drugs are not necessarily related to hair pigmentation, but rather their presence in hair can provide information about the individual's medical history or current treatment.


In [53]:
print(result['source_documents'])

[Document(page_content='Drug treatments For RTX treatment (see also Supplementary Discussion), mice received injections of RTX (30–100 μg kg−1) in the flank for 1–3 days, as described previously15,16,51–56. RTX was prepared in 2% DMSO with 0.15% Tween 80 in PBS. Control mice were treated with the vehicle only. RTX injection was done either in full anagen (P31–P36) or in first telogen (P21). For corti- costerone feeding, 35 μg ml−1 corticosterone (Millipore Sigma, C2505) was dissolved in 0.45% hydroxypropyl-β-cyclodextrin and provided in the drinking water. Mice were treated for three days (P28–P30). Control mice received the vehicle water (0.45% β-cyclodextrin). For analgesia, mice were injected with buprenorphine (0.1 mg kg−1) 4 h before RTX injection and every 6 h after RTX injection for 2 days. For tamoxifen treatment, tamoxifen was diluted in corn oil to a final concentration', metadata={'source': '/content/Data/Hyperactivation of sympathetic nerves (1).pdf'}), Document(page_conten

In [54]:
type(result)

dict

In [55]:
result.keys()

dict_keys(['question', 'chat_history', 'answer', 'source_documents'])

In [56]:
result['chat_history']

[]

In [57]:
query = "What are Drug treatments?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Based on the given text, "Drug treatments" refers to various medications used to treat medical conditions. The text mentions several drugs, including RTX, amitriptyline, doxepin, maprotiline, metoclopromide, carbamazeoine, clorprothixene, diclofenac, and indomethacin. These drugs are not necessarily related to hair pigmentation, but rather their presence in hair can provide information about the individual's medical history or current treatment.


In [58]:
result['chat_history']

[]

In [24]:
chat_history = []

query = "What is Data lakehouse architecture in Databricks?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 In Databricks, the medallion lakehouse architecture refers to the organization of data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views.


In [25]:
print(result['source_documents'])

[Document(page_content='What is data modeling on Databricks? \nThe Databricks Lakehouse Platform organizes data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views. Databricks recommends a multi-layer approach to validating, cleansing, and transforming data for analytics. For more information, see the medallion architecture.\n\n\nWhat is Databricks SQL? \nDatabricks SQL provides general compute resources for SQL queries, visualizations, and dashboards that are executed against the tables in the lakehouse. Within Databricks SQL, these queries, visualizations, and dashboards are developed and executed using SQL editor.', metadata={'source': 'http://docs.databricks.com/sql/index.html', 'title': 'What is data warehousing on Databricks? | Databricks on AWS', 'description': 'Learn about building a data warehousing solution in Databricks using Databricks SQL.', 'language': 'en-US'}), Document(page_content='Data governance\nLakehouse 

In [26]:
chat_history = [(query, result["answer"])]

query = "What are Data Governance and Interoperability in it?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 In the context of Data Lakehouse architecture in Databricks, Data Governance refers to the policies and practices implemented to securely manage the data assets within an organization. It encompasses the centralized management of data across various teams, departments, and stakeholders, ensuring data quality, security, and compliance with regulatory requirements. Data Governance in a Data Lakehouse architecture helps organizations to streamline their data management processes, reduce data silos, and improve data accessibility and usability.


In [27]:
print(result['source_documents'])

[Document(page_content='Data governance\nLakehouse architecture\n\nReference & resources\n\nReference\nResources\nWhat’s coming?\nDocumentation archive\n\n\n\n\n    Updated Sep 08, 2023\n  \n\n\nSend us feedback\n\n\n\n\n\n\n\n\n\n\nDocumentation \nSecurity and compliance guide\n\n\n\n\n\n\n\nSecurity and compliance guide \nThis guide provides an overview of security features and capabilities that an enterprise data team can use to harden their Databricks environment according to their risk profile and governance policy.\nThis guide does not cover information about securing your data. For that information, see Data governance best practices.\n\nNote\nThis article focuses on the most recent (E2) version of the Databricks platform. Some of the features described here may not be supported on legacy deployments that have not migrated to the E2 platform.', metadata={'source': 'http://docs.databricks.com/security/index.html', 'title': 'Security and compliance guide | Databricks on AWS', 'des

In [None]:
#original Code

In [1]:
path = "/content/Data"

In [2]:
from langchain.document_loaders import DirectoryLoader

In [3]:
def load_docs(directory):
	loader = DirectoryLoader(directory)
	documents = loader.load()
	return documents

In [55]:
# !pip install unstructured[pdf]
# !pip install unstructured

In [10]:
# !sudo apt-get install poppler-utils

In [11]:
# !pip install pdf2image



In [12]:
def cls_pooling(model_output):
	return model_output.last_hidden_state[:,0]

In [13]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	docs = text_splitter.split_documents(documents)
	return docs


In [14]:
def encode(texts):
	encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
	with torch.no_grad():
		model_output = model(**encoded_input, return_dict=True)
	embeddings = cls_pooling(model_output)
	return embeddings

In [16]:
from transformers import AutoTokenizer, AutoModel,AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [17]:
documents = load_docs(path)

In [57]:
# documents

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
docs = split_docs(documents)

In [56]:
docs

[Document(page_content='14682494, 2008, 4, Downloaded from https://onlinelibrary.wiley.com/doi/10.1111/j.1468-2494.2008.00456.x by National Medical Library The Director, Wiley Online Library on [03/11/2022]. See the Terms and Conditions (https://onlinelibrary.wiley.com/terms-and-conditions) on Wiley Online Library for rules of use; OA articles are governed by the applicable Creative Commons License\n\nInternational Journal of Cosmetic Science, 2008, 30, 233–257\n\nReview Article\n\nHuman hair pigmentation – biological aspects\n\nD. J. Tobin Centre for Skin Sciences, School of Life Sciences, University of Bradford, Richmond Road, Bradford, West Yorkshire, UK\n\nReceived 30 March 2008, Accepted 23 April 2008\n\nKeywords: melanin, melanocyte, melanosome\n\nCorrespondence: Prof. Desmond Tobin, Centre for Skin Sciences, School of Life Sciences, University of Bradford, Richmond Road, Bradford, West Yorkshire BD7 1DP, UK. Tel.: +44 (0) 1274 233585; fax: +44 (0) 1274 309742; e-mail: d.tobin@br

In [27]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(docs, embeddings)

In [35]:
import transformers
import torch

In [36]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [43]:
# !pip install xformers

In [47]:
from torch import cuda, bfloat16
model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [48]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [49]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [50]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [52]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[0, 2533, 1028, 2], [0, 1040, 1040, 1040, 2]]

In [53]:
import torch
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([   0, 2533, 1028,    2], device='cuda:0'),
 tensor([   0, 1040, 1040, 1040,    2], device='cuda:0')]

In [54]:
chat_history = []

query = "What is Data lakehouse architecture in Databricks?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 [unused305] 1953. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1


In [58]:
chat_history = []

query = "What is stress procedures"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

Token indices sequence length is longer than the specified maximum sequence length for this model (838 > 512). Running this sequence through the model will result in indexing errors


 1 32ছ л [unused796] 1 32ছ л [unused796] 1 32ছ [unused820] л [unused796] 1 32ছ [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [unused796] 1 32 [un

In [59]:
print(result['source_documents'])

[Document(page_content='e n\n\ni l\n\na S\n\nStress\n\nNA\n\nX T R\n\nUnpigmented hair', metadata={'source': '/content/Data/Hyperactivation of sympathetic nerves (1).pdf'}), Document(page_content='Stress procedures Restraint and chronic unpredictable stress procedures were performed as previously described11–14. In brief, for restraint stress, C57BL/6J mice were kept in a restrainer (Thermo Fisher Scientific 12972590) for four hours per day for five days starting from mid-anagen (P28– P30). Hairs were depilated to induce hair regeneration when their hair cycle reached telogen. Mice were stressed and depilated four rounds in total to monitor long-term changes. For chronic unpredictable stress, C57BL/6J mice were exposed to a combination of stressors. Two of the stressors were applied each day. The stressors included cage tilt, isolation, damp bedding, rapid light and dark changes, overnight illu- mination, restraint, empty cage and changing the cage three times. All stressors were rando