<a href="https://www.kaggle.com/code/pratul007/porsche-911-data-analysis-query-with-llama-2-7b?scriptVersionId=143623015" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Install necessary packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
!pip install langchain einops accelerate transformers bitsandbytes scipy
!pip install xformers sentencepiece
!pip install llama-index llama_hub --upgrade
!pip install sentence-transformers
!pip install pypdf2
!pip install git+https://github.com/huggingface/transformers.git@main --quiet
!pip install git+https://github.com/huggingface/accelerate@main --quiet
!pip install tensor_parallel

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context, Document
from llama_index.llms import HuggingFaceLLM
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Read the Porsche 911 CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv("/kaggle/input/every-porsche-911/porsche_911.csv")

# Build one Document per DataFrame row: the text is the row rendered as
# space-separated "column: value" pairs (values cast to str), and the row's
# index label is kept in the metadata under "row_num".
documents = [
    Document(
        text=" ".join(
            f"{column}: {cell}"
            for column, cell in zip(df.columns, record.astype(str))
        ),
        metadata={"row_num": row_label},
    )
    for row_label, record in df.iterrows()
]

In [None]:
# Llama setup: the tokenizer and the causal-LM weights are both loaded from
# the same local model directory.
model_name = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map='auto',          # device placement is left to the library
)

In [None]:
# Llama 2 accepts at most 4096 tokens of context; the previous value of 4098
# exceeded that limit.
LLAMA2_CONTEXT_WINDOW = 4096

# System part of the Llama 2 chat prompt. It opens an [INST] block and wraps
# the system message in <<SYS>> ... <</SYS>>; the [INST] block is closed by
# query_wrapper_prompt below, after the user's question.
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Your goal is to provide answers relating to the car Porsche from the csv.<</SYS>>"""

# The question is followed by [/INST] so the instruction block opened in
# system_prompt is terminated (previously it was left unclosed).
query_wrapper_prompt = "{query_str} [/INST]"

# Wrap the already-loaded model and tokenizer so llama_index can use them.
llm = HuggingFaceLLM(
    context_window=LLAMA2_CONTEXT_WINDOW,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,
    tokenizer=tokenizer
)

# Embedding model used to vectorise the documents and the queries.
embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

# Register the LLM and embedding model globally; chunk_size is kept no larger
# than the LLM context window.
service_context = ServiceContext.from_defaults(
    chunk_size=LLAMA2_CONTEXT_WINDOW,
    llm=llm,
    embed_model=embeddings,
)
set_global_service_context(service_context)

# Create an index using the DataFrame's content
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

def generate_response(query_text, use_index=True):
    """Answer a question, by default using the index built from the CSV rows.

    Previously this function sent the bare question straight to
    ``model.generate``, so the ``query_engine`` created from ``documents`` was
    never consulted and answers could not draw on the dataset.

    Args:
        query_text: The question to answer.
        use_index: When True (default) the question is sent through
            ``query_engine``. When False the original behaviour is used: the
            question is tokenized and passed directly to ``model.generate``
            without any retrieved context.

    Returns:
        The answer as a string.
    """
    if use_index:
        # str() converts the query engine's response object to its text.
        return str(query_engine.query(query_text))

    # Raw generation path (original behaviour): no retrieval, no system prompt.
    input_tokens = tokenizer(query_text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    output = model.generate(**input_tokens)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [None]:
# Sample Queries
queries = [
    "Describe the fuel efficiency trends in Porsche 911 models from 2010 to 2020.",
    "Which models of the Porsche 911 have a rear-wheel-drive powertrain architecture?",
    "What is the most relevant year and why for the Porsche 911 dataset?",
]

# Ask each question in turn. The question is printed before the (slow) call to
# generate_response, and a 50-dash rule separates consecutive answers.
for query in queries:
    print("Question: " + query)
    print("Response: " + format(generate_response(query)))
    print("-" * 50)