In [1]:
import pandas as pd
import faiss
import numpy as np
from openai import OpenAI
from tqdm import tqdm
import json 
import re

# Load OpenAI key
import os
os.environ["OPENAI_API_KEY"] = "put your API key"  # Replace with your key
client = OpenAI()
data=pd.read_csv('cars_dataset.csv')
data 

Unnamed: 0,Car Name,Manufacturer,Launch Year,Description,Engine Specifications,Other Specifications,User Ratings,NCAP Global Rating
0,OffDecision12,Ross PLC,2017,Experience the fusion of style and performance...,"I6, 253 HP, 1520cc","SUV, 10 km/l, 203 km/h top speed",2.5,4
1,ExistGround23,Ross PLC,2017,The ExistGround23 by Ross PLC is a luxurious a...,"Electric, 281 HP, 4515cc","SUV, 15 km/l, 238 km/h top speed",2.1,1
2,SometimesHerself24,Ross PLC,2021,The SometimesHerself24 by Ross PLC is a effici...,"V8, 367 HP, 2351cc","Coupe, 15 km/l, 219 km/h top speed",5.0,1
3,OffAround14,Ross PLC,2006,The OffAround14 by Ross PLC is a elegant and v...,"Electric, 585 HP, 1464cc","Coupe, 12 km/l, 221 km/h top speed",1.7,4
4,PriceIdea77,Ross PLC,2002,"Ross PLC presents the PriceIdea77, a efficient...","V6, 422 HP, 1762cc","Hatchback, 6 km/l, 238 km/h top speed",2.8,2
...,...,...,...,...,...,...,...,...
1995,ApproachWife68,"Adams, Nelson and Taylor",2023,"Adams, Nelson and Taylor presents the Approach...","V6, 324 HP, 2388cc","Coupe, 11 km/l, 113 km/h top speed",2.8,2
1996,"Adams, Nelson and Taylor",ThoughNumber19,2021,"ThoughNumber19 presents the Adams, Nelson and ...","I4, 307 HP, 3055cc","Convertible, 5 km/l, 119 km/h top speed",1.7,5
1997,"Adams, Nelson and Taylor",ProduceThis27,1999,"The Adams, Nelson and Taylor by ProduceThis27 ...","V6, 211 HP, 4787cc","Convertible, 10 km/l, 176 km/h top speed",2.9,1
1998,ConsiderSuffer61,"Adams, Nelson and Taylor",2012,Experience the fusion of style and performance...,"V8, 273 HP, 4048cc","SUV, 11 km/l, 245 km/h top speed",1.0,2


In [2]:
data.columns

Index(['Car Name', 'Manufacturer', 'Launch Year', 'Description',
       'Engine Specifications', 'Other Specifications', 'User Ratings',
       'NCAP Global Rating'],
      dtype='object')

In [3]:
# === Step 1: Load the dataset ===
data = pd.read_csv('cars_dataset.csv')

# === Step 2: Clean column names ===
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')

# === Step 3: Strip whitespaces from string columns ===
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].str.strip()

# === Step 4: Drop empty rows and duplicates ===
data.dropna(how='all', inplace=True)
data.drop_duplicates(inplace=True)

# === Step 5: Chunk for RAG embedding ===
car_chunks = []
for _, row in data.iterrows():
    chunk = {
        "id": row["car_name"],
        "text": (
            f"{row['car_name']} by {row['manufacturer']} (Launched in {row['launch_year']}).\n"
            f"Description: {row['description']}\n"
            f"Engine: {row['engine_specifications']}\n"
            f"Specs: {row['other_specifications']}\n"
            f"User Rating: {row['user_ratings']}/5 | NCAP Rating: {row['ncap_global_rating']}/5"
        )
    }
    car_chunks.append(chunk)

# === Step 6: Optional – Save chunks to JSONL for embedding later ===
import json

with open("car_chunks.jsonl", "w") as f:
    for chunk in car_chunks:
        f.write(json.dumps(chunk) + "\n")

print("✅ Car data cleaned and saved as RAG-ready chunks in 'car_chunks.jsonl'")


✅ Car data cleaned and saved as RAG-ready chunks in 'car_chunks.jsonl'


In [5]:

# === Step 1: Load chunks from JSONL ===
chunks = []
with open("car_chunks.jsonl", "r") as f:
    for line in f:
        chunks.append(json.loads(line))

# === Step 2: Generate embeddings using OpenAI ===
def embed_text(text):
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding

# Embed all chunks
embeddings = []
metadata = []

for chunk in tqdm(chunks, desc="Embedding car chunks"):
    emb = embed_text(chunk["text"])
    embeddings.append(emb)
    metadata.append({"id": chunk["id"], "text": chunk["text"]})

# Convert to numpy array
embedding_matrix = np.array(embeddings).astype("float32")

# === Step 3: Create FAISS index ===
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Save index and metadata
faiss.write_index(index, "car_faiss.index")

with open("car_metadata.json", "w") as f:
    json.dump(metadata, f)

print("✅ Embeddings stored in FAISS and metadata saved.")


Embedding car chunks:   0%|          | 0/2000 [00:00<?, ?it/s]

Embedding car chunks: 100%|██████████| 2000/2000 [19:22<00:00,  1.72it/s]   


✅ Embeddings stored in FAISS and metadata saved.


In [7]:
# === Load FAISS index and metadata ===
index = faiss.read_index("car_faiss.index")

with open("car_metadata.json", "r") as f:
    metadata = json.load(f)

# === Embed a query ===
def embed_query(query):
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    )
    return np.array(response.data[0].embedding).astype("float32").reshape(1, -1)

# === Search function ===
def search_cars(query, k=3):
    query_embedding = embed_query(query)
    distances, indices = index.search(query_embedding, k)

    results = []
    for idx, score in zip(indices[0], distances[0]):
        if idx < len(metadata):
            results.append({
                "score": float(score),
                "car_id": metadata[idx]["id"],
                "text": metadata[idx]["text"]
            })
    return results

# === Example usage ===
query = "Which car has high horsepower and is a coupe?"
results = search_cars(query)

for res in results:
    print(f"\n🔍 Result (score: {res['score']:.2f}) — {res['car_id']}")
    print(res['text'])



🔍 Result (score: 0.33) — SevenAttorney93
SevenAttorney93 by Cohen-Hayes (Launched in 2005).
Description: Cohen-Hayes presents the SevenAttorney93, a vibrant masterpiece that redefines modern mobility. With features such as advanced safety features and high-performance engine, this roadster combines efficiency with bold performance. Since its inception in 2005, it has captivated enthusiasts and experts alike.
Engine: I6, 452 HP, 3959cc
Specs: Coupe, 7 km/l, 138 km/h top speed
User Rating: 3.6/5 | NCAP Rating: 5/5

🔍 Result (score: 0.34) — HelpFirm87
HelpFirm87 by Cohen-Hayes (Launched in 2008).
Description: The HelpFirm87 by Cohen-Hayes is a futuristic and agile coupe. It features unmatched reliability and integrated smart connectivity that deliver an exhilarating driving experience. Released in 2008, it embodies the future of automotive design.
Engine: V6, 368 HP, 2405cc
Specs: Convertible, 5 km/l, 171 km/h top speed
User Rating: 4.8/5 | NCAP Rating: 2/5

🔍 Result (score: 0.34) — Cohe

In [2]:
# === Step 1: Load Markdown File ===
with open("country_data.md", "r", encoding="utf-8") as f:
    raw_text = f.read()

# === Step 2: Split by country section ===
sections = raw_text.split("# Country:")
country_chunks = []

for section in sections[1:]:  # skip intro
    lines = section.strip().splitlines()
    country_name = lines[0].strip()
    body_text = "\n".join(lines[1:]).strip()

    # Optional cleaning: remove excessive whitespace or newlines
    cleaned_text = re.sub(r"\n{2,}", "\n", body_text).strip()

    chunk = {
        "id": country_name,
        "text": f"{country_name}\n{cleaned_text}"
    }
    country_chunks.append(chunk)

# === Step 3: Save as JSONL for embedding ===
with open("country_chunks.jsonl", "w", encoding="utf-8") as f:
    for chunk in country_chunks:
        f.write(json.dumps(chunk) + "\n")

print(f"✅ Processed {len(country_chunks)} country chunks. Saved to 'country_chunks.jsonl'")


✅ Processed 20 country chunks. Saved to 'country_chunks.jsonl'


In [3]:

# === Step 1: Load chunks from country_chunks.jsonl ===
chunks = []
with open("country_chunks.jsonl", "r") as f:
    for line in f:
        chunks.append(json.loads(line))

# === Step 2: Create embeddings ===
def embed_text(text):
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding

embeddings = []
metadata = []

for chunk in tqdm(chunks, desc="Embedding country chunks"):
    emb = embed_text(chunk["text"])
    embeddings.append(emb)
    metadata.append({"id": chunk["id"], "text": chunk["text"]})

embedding_matrix = np.array(embeddings).astype("float32")

# === Step 3: Build FAISS index ===
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

faiss.write_index(index, "country_faiss.index")

with open("country_metadata.json", "w") as f:
    json.dump(metadata, f)

print("✅ Embedded and saved country data to FAISS and metadata.")


Embedding country chunks: 100%|██████████| 20/20 [00:10<00:00,  1.94it/s]

✅ Embedded and saved country data to FAISS and metadata.



