In [1]:
import re
import csv

def convert_transcript_to_csv_full(input_file, output_file, last_entry_duration=10):
    """
    Convert a timestamped transcript into a CSV of (start_time, end_time, text).

    The input is read as UTF-8. Every segment is introduced by a parenthesised
    timestamp such as ``(0:07)`` or ``(12:30)``; the segment's text is
    everything up to the next timestamp, stripped of surrounding whitespace.
    ``<---VideoN--->`` separators are removed before parsing, so text sitting
    between a separator and the following timestamp stays attached to the
    preceding segment.

    Segments whose text is empty are skipped, and text that appears before the
    very first timestamp is not written (there is no timestamp to attach it to).

    A row ends where the next row starts. The last row, and any row whose
    successor has an earlier timestamp (the timeline restarted, e.g. at the
    beginning of the next video), ends ``last_entry_duration`` seconds after
    it starts, so ``end_time`` is never smaller than ``start_time``.

    Args:
        input_file: Path of the transcript text file.
        output_file: Path of the CSV to write, with the columns
            ``start_time``, ``end_time`` and ``text`` (times are float seconds).
        last_entry_duration: Seconds assigned to a row that has no usable
            successor. Defaults to 10.

    Returns:
        None. The CSV is written to ``output_file`` and a summary plus up to
        three sample rows are printed.
    """

    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Remove video separator lines
    content = re.sub(r'<-+Video\d+-+>', '', content)

    # With two capturing groups, re.split returns
    # [preamble, minutes, seconds, text, minutes, seconds, text, ...].
    pieces = re.split(r'\((\d+):(\d+)\)', content)

    # (start_seconds, text) for every timestamp that carries non-empty text.
    entries = []
    for minutes, seconds, raw_text in zip(pieces[1::3], pieces[2::3], pieces[3::3]):
        text = raw_text.strip()
        if text:
            entries.append((int(minutes) * 60 + int(seconds), text))

    final_rows = []
    for i, (start_seconds, text) in enumerate(entries):
        # Default: no usable successor, so give the row a fixed duration.
        end_seconds = start_seconds + last_entry_duration

        if i + 1 < len(entries):
            next_start = entries[i + 1][0]
            # Only trust the next timestamp when the timeline moves forward;
            # a smaller value means the timestamps restarted.
            if next_start >= start_seconds:
                end_seconds = next_start

        final_rows.append({
            'start_time': float(start_seconds),
            'end_time': float(end_seconds),
            'text': text
        })

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['start_time', 'end_time', 'text'])
        writer.writeheader()
        writer.writerows(final_rows)

    print(f"‚úÖ Converted {len(final_rows)} entries")
    print(f"üìÅ Saved to: {output_file}")

    # Show sample
    print("\nüìã Sample output:")
    for row in final_rows[:3]:
        print(f"{row['start_time']},{row['end_time']},\"{row['text'][:60]}...\"")


# ============= ALTERNATIVE: Keep exact timestamps from text =============

def convert_with_exact_timestamps(input_file, output_file, last_entry_duration=10):
    """
    Alternative converter for transcripts with one ``M:SS text`` entry per line.

    Each line is stripped and must start with an un-parenthesised timestamp
    (``M:SS`` / ``MM:SS``) followed by the text for that entry on the same
    line. Lines are skipped when they:

    * contain ``<---`` or ``--->`` (video separators),
    * do not start with a timestamp, or
    * consist of a timestamp with no text after it.

    A row ends where the next row starts. The last row, and any row whose
    successor has an earlier timestamp (the timeline restarted), ends
    ``last_entry_duration`` seconds after it starts.

    Args:
        input_file: Path of the UTF-8 transcript text file.
        output_file: Path of the CSV to write, with the columns
            ``start_time``, ``end_time`` and ``text`` (times are float seconds).
        last_entry_duration: Seconds assigned to a row that has no usable
            successor. Defaults to 10.

    Returns:
        None. The CSV is written to ``output_file`` and a summary is printed.
    """

    # The (?!\d) lookahead stops the seconds group from backtracking and
    # donating its last digit to the text group: without it a bare "00:07"
    # line parses as 0:00 with the text "7".
    line_pattern = re.compile(r'(\d+):(\d+)(?!\d)\s*(.+)')

    rows = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip video separator lines
            if '<---' in line or '--->' in line:
                continue

            # Look for timestamp at start of line
            match = line_pattern.match(line.strip())
            if not match:
                continue

            minutes = int(match.group(1))
            seconds = int(match.group(2))
            rows.append({
                'start_time': minutes * 60 + seconds,
                'text': match.group(3)
            })

    # Add end times
    final_rows = []
    for i, row in enumerate(rows):
        # Default: no usable successor, so give the row a fixed duration.
        end_time = row['start_time'] + last_entry_duration

        if i + 1 < len(rows):
            next_start = rows[i + 1]['start_time']
            # A smaller next timestamp means the timeline restarted; keep the
            # default instead of producing a negative duration.
            if next_start >= row['start_time']:
                end_time = next_start

        final_rows.append({
            'start_time': float(row['start_time']),
            'end_time': float(end_time),
            'text': row['text']
        })

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['start_time', 'end_time', 'text'])
        writer.writeheader()
        writer.writerows(final_rows)

    print(f"‚úÖ Converted {len(final_rows)} entries")
    print(f"üìÅ Saved to: {output_file}")


# ============= USAGE =============

if __name__ == "__main__":

    # Transcript to read and CSV to produce for this run.
    INPUT_FILE, OUTPUT_FILE = 'combine_text.csv', 'output_2.csv'

    # Method 1: every timestamped segment keeps its complete text.
    print("Converting with full text preservation...")
    convert_transcript_to_csv_full(input_file=INPUT_FILE, output_file=OUTPUT_FILE)

    print("\n‚úÖ Done! No text lost - everything preserved.")

Converting with full text preservation...
‚úÖ Converted 161 entries
üìÅ Saved to: output_2.csv

üìã Sample output:
0.0,4.0,"Okay, now let's make it more interactive...."
4.0,6.0,"Let's take input from the user..."
6.0,9.0,"rather than us deciding what we want..."

‚úÖ Done! No text lost - everything preserved.


In [2]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the (start_time, end_time, text) rows written by the conversion cell.
df = pd.read_csv('output_2.csv')

# Recursive splitter: tries paragraph breaks first, then progressively finer
# separators, with overlapping windows between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""],
    chunk_overlap=200,
    chunk_size=1000,
)

# One record per chunk; each chunk inherits the time span of its source row.
all_chunks = []

for idx, row in df.iterrows():
    text = str(row['text'])

    # Empty cells come back from pandas as NaN, which str() renders as 'nan'.
    if not text or text == 'nan':
        continue

    all_chunks.extend(
        {
            'chunk_text': chunk,
            'chunk_number': chunk_num,
            'source_row': idx,
            'start_time': row['start_time'],
            'end_time': row['end_time'],
        }
        for chunk_num, chunk in enumerate(splitter.split_text(text))
    )

# Persist the chunks for the embedding cells.
pd.DataFrame(all_chunks).to_csv('chunks_4.csv', index=False)
print(f"‚úì Created {len(all_chunks)} chunks")

  from .autonotebook import tqdm as notebook_tqdm


‚úì Created 399 chunks


In [3]:
import pandas as pd
import google.generativeai as genai
import time
from google.api_core import exceptions

# ================= CONFIG ================= #
import os

# The API key is read from the environment so the secret never lives in the
# notebook (or in its saved outputs / version history). Export GOOGLE_API_KEY
# before starting the kernel; a missing variable fails here with a KeyError.
# NOTE(review): the key that used to be hardcoded on this line has been
# exposed and should be revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

EMBEDDING_MODEL = "models/gemini-embedding-001"

# ================= LOAD DATA ================= #
df = pd.read_csv("chunks_4.csv")
# astype(str) guarantees plain strings even if pandas parsed a cell as NaN.
chunks_text = df["chunk_text"].astype(str).tolist()

print(f"Processing {len(chunks_text)} chunks...")
print("Generating embeddings using Gemini (with rate limiting)...\n")

# ================= EMBEDDING FUNCTION WITH RATE LIMITING ================= #
def generate_embeddings(texts, batch_size=5, max_retries=5, batch_delay=1.0,
                        default_retry_wait=60.0):
    """
    Embed ``texts`` with Gemini in small batches, backing off on rate limits.

    Each batch is sent with ``genai.embed_content`` using the module-level
    ``EMBEDDING_MODEL`` and ``task_type="retrieval_document"``. When the API
    raises ``ResourceExhausted`` the batch is retried after waiting for the
    delay announced in the error message ("retry in <N>s", plus a 2 second
    buffer) or ``default_retry_wait`` seconds when no delay can be parsed.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts per request. Must be >= 1.
        max_retries: Maximum number of attempts per batch. Must be >= 1.
        batch_delay: Seconds to sleep after every successful batch.
        default_retry_wait: Seconds to wait after a rate-limit error that
            carries no parseable retry hint.

    Returns:
        A list with one embedding per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` or ``max_retries`` is smaller than 1.
        google.api_core.exceptions.ResourceExhausted: If a batch is still
            rate limited on its last attempt.
        Exception: Any other error from the API is printed and re-raised.
            Embeddings gathered for earlier batches are not returned then.
    """
    import re  # local import: this cell's import block does not include `re`

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if max_retries < 1:
        # With zero attempts every batch would be skipped silently.
        raise ValueError("max_retries must be at least 1")

    all_embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size

    for batch_num, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]

        for attempt in range(1, max_retries + 1):
            try:
                response = genai.embed_content(
                    model=EMBEDDING_MODEL,
                    content=batch,
                    task_type="retrieval_document"
                )

                all_embeddings.extend(response["embedding"])
                print(f"‚úì Batch {batch_num}/{total_batches} completed ({len(all_embeddings)} embeddings generated)")

                # Pause between batches to stay under the rate limit.
                time.sleep(batch_delay)
                break  # Success, move on to the next batch

            except exceptions.ResourceExhausted as e:
                if attempt >= max_retries:
                    print("\n‚ùå Max retries reached. Stopping.")
                    raise

                # Use the server's retry hint when present; otherwise fall
                # back to the default wait.
                wait_time = default_retry_wait
                hint = re.search(r"retry in\s+(\d+(?:\.\d+)?)", str(e))
                if hint:
                    wait_time = float(hint.group(1)) + 2  # Add buffer

                print(f"‚ö†Ô∏è  Rate limit hit. Waiting {wait_time:.0f} seconds... (Retry {attempt}/{max_retries})")
                time.sleep(wait_time)

            except Exception as e:
                print(f"\n‚ùå Error: {e}")
                raise

    return all_embeddings

# ================= RUN ================= #
# One embedding per chunk, in the same order as `chunks_text`.
embeddings = generate_embeddings(texts=chunks_text)

print(f"\n‚úì Successfully generated {len(embeddings)} embeddings")
print(f"‚úì Embedding dimension: {len(embeddings[0])}")

# ================= SAVE EMBEDDINGS ================= #
# Store each vector next to its chunk, then write the combined table to disk.
df['embedding'] = embeddings
df.to_csv(path_or_buf="chunks_with_embeddings.csv", index=False)
print("‚úì Saved to chunks_with_embeddings.csv")


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


Processing 399 chunks...
Generating embeddings using Gemini (with rate limiting)...

‚úì Batch 1/80 completed (5 embeddings generated)
‚úì Batch 2/80 completed (10 embeddings generated)
‚úì Batch 3/80 completed (15 embeddings generated)
‚úì Batch 4/80 completed (20 embeddings generated)
‚úì Batch 5/80 completed (25 embeddings generated)
‚úì Batch 6/80 completed (30 embeddings generated)
‚úì Batch 7/80 completed (35 embeddings generated)
‚úì Batch 8/80 completed (40 embeddings generated)
‚úì Batch 9/80 completed (45 embeddings generated)
‚úì Batch 10/80 completed (50 embeddings generated)
‚úì Batch 11/80 completed (55 embeddings generated)
‚úì Batch 12/80 completed (60 embeddings generated)
‚úì Batch 13/80 completed (65 embeddings generated)
‚úì Batch 14/80 completed (70 embeddings generated)
‚úì Batch 15/80 completed (75 embeddings generated)
‚úì Batch 16/80 completed (80 embeddings generated)
‚úì Batch 17/80 completed (85 embeddings generated)
‚úì Batch 18/80 completed (90 embeddings 

In [None]:
import chromadb

# ================= SETUP CHROMADB ================= #
# NOTE: this cell relies on `chunks_text`, `embeddings` and `df` left in the
# kernel by the Gemini embedding cell above.
client = chromadb.PersistentClient(path="./chroma_db")

# Drop any previous collection so that vectors stored with a different
# embedding dimension cannot clash with the ones added below.
try:
    client.delete_collection(name="video_chunks_2")
    print("‚úì Deleted old collection")
except Exception as exc:
    # Best effort: this normally just means the collection does not exist yet.
    # Catching Exception (rather than a bare except) lets Ctrl-C through, and
    # printing the cause keeps real failures visible.
    print(f"No existing collection deleted ({type(exc).__name__}: {exc})")

# Create the fresh collection for the vectors added below.
collection = client.get_or_create_collection(
    name="video_chunks_2"
)

# The three inputs must line up one-to-one; fail early with a clear message
# if the embedding run was interrupted or `df` was replaced in between.
if not (len(chunks_text) == len(embeddings) == len(df)):
    raise ValueError(
        f"Mismatched inputs: {len(chunks_text)} chunks, "
        f"{len(embeddings)} embeddings, {len(df)} metadata rows"
    )

# ================= ADD TO DATABASE ================= #
collection.add(
    documents=chunks_text,
    embeddings=embeddings,  # vectors produced by the Gemini embedding cell
    ids=[f"chunk_{i}" for i in range(len(chunks_text))],
    # One {"start_time": ..., "end_time": ...} dict per chunk, in row order.
    metadatas=df[["start_time", "end_time"]].to_dict("records")
)

print(f"‚úì Stored {len(embeddings)} embeddings in ChromaDB")
print(f"‚úì Embedding dimension: {len(embeddings[0])}")

‚úì Deleted old collection
‚úì Stored 399 embeddings in ChromaDB
‚úì Embedding dimension: 3072


In [7]:
from google import genai
import chromadb

# ================= CONFIG ================= #
import os

# The API key is read from the environment so the secret never lives in the
# notebook. Export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded on this line has been
# exposed and should be revoked.
genai_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# NOTE(review): the indexing cells in this notebook embed documents with
# "models/gemini-embedding-001" and store them under "./chroma_db". This cell
# queries "./video_db_2" with a different embedding model. Query and document
# vectors must come from the same model - confirm that the collection in
# "./video_db_2" was built with the model named here.
EMBED_MODEL = "models/text-embedding-004"

client = chromadb.PersistentClient(path="./video_db_2")
collection = client.get_collection("video_chunks_2")

# ================= QUERY ================= #
query = "What is the example of arithmetic operator in Java? and PIR Motion"

# Generate query embedding (Gemini)
response = genai_client.models.embed_content(
    model=EMBED_MODEL,
    contents=query
)

# The embedding values are passed to Chroma directly, without a .tolist()
# conversion.
query_embedding = response.embeddings[0].values

# ================= SEARCH ================= #
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# ================= PRINT RESULTS ================= #
print(f"\nQuery: {query}\n")

# results["documents"] holds one list of hits per query embedding.
for i, doc in enumerate(results["documents"][0]):
    print(f"Result {i+1}:")
    print(doc)
    print("-" * 50)



Query: What is the example of arithmetic operator in Java? and PIR Motion

Result 1:
<--------------code-box video 24 ------------------------------------------------------------------->
PIR motion sensor. 
00:07
The PIR motion sensor uses the RE-200 BP sensing element. 
00:12
It works based on the pyroelectric effect, which means it can detect infrared radiation naturally released.
00:20
By a human body or an animal, with the help of a Fresnel lens, the sensor can detect motion from a greater distance and over a wider area. 
00:30
When a person or an animal moves within its sensing range, the PIR sensor outputs a high signal. 
00:37
When no motion is present, it outputs a low signal.
00:41
The programming blocks shown demonstrate how to read the PIR sensor value and use it inside an IF and else structure. 
00:49
The code checks whether motion is detected. 
00:51
If the condition is true, the instructions inside the IF section will run. 
00:56
If the condition is false, the code insid

In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from groq import Groq
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file (if present)
load_dotenv()

# Groq API key used by rag_query; None when GROQ_API_KEY is not set.
API_KEY = os.getenv('GROQ_API_KEY')

# ============= SETUP DATABASE =============

df = pd.read_csv('chunks_4.csv')

print("Loading model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings...")
# astype(str) keeps every entry a plain string even if pandas parsed a cell
# as NaN (same normalisation as the Gemini embedding cell).
chunks_text = df['chunk_text'].astype(str).tolist()
embeddings = model.encode(chunks_text, show_progress_bar=True)

# Drop any previous collection first: it may hold vectors of a different
# dimension (e.g. from the Gemini cells), which cannot be mixed with these.
client = chromadb.PersistentClient(path="./chroma_db")

try:
    client.delete_collection("video_chunks_2")
    print("‚úì Deleted old collection with wrong dimensions")
except Exception:
    # Best effort: normally this just means there was nothing to delete.
    # Catching Exception instead of a bare except lets Ctrl-C through.
    print("‚úì No existing collection to delete")

# Create the fresh collection for the vectors added below.
collection = client.get_or_create_collection("video_chunks_2")

collection.add(
    documents=chunks_text,
    embeddings=embeddings.tolist(),
    ids=[f"chunk_{i}" for i in range(len(chunks_text))],
    # One {"start_time": ..., "end_time": ...} dict per chunk, in row order.
    metadatas=df[['start_time', 'end_time']].to_dict('records')
)

print(f"‚úì Database ready with {len(embeddings)} embeddings!")
print(f"‚úì Embedding dimension: {embeddings.shape[1]}")

# ============= RAG QUERY FUNCTION =============

def rag_query(question, top_k=3):
    """
    Answer ``question`` from the transcript chunks stored in ``collection``.

    The question is embedded with the module-level ``model``, the ``top_k``
    closest chunks are fetched from ``collection``, and those chunks are sent
    as context to the Groq chat model together with the question. Progress,
    a preview of the retrieved chunks and the final answer are printed.

    Args:
        question: The question to answer.
        top_k: How many chunks to retrieve as context.

    Returns:
        The answer text produced by the chat model.
    """

    print(f"\nüîç Searching for: {question}")
    question_vector = model.encode([question])[0].tolist()

    hits = collection.query(query_embeddings=[question_vector], n_results=top_k)
    # One list of documents per query embedding; only one query was sent.
    retrieved_docs = hits['documents'][0]
    context = "\n\n".join(retrieved_docs)

    print("\nüìö Retrieved chunks:")
    for rank, doc in enumerate(retrieved_docs, start=1):
        print(f"  {rank}. {doc[:100]}...")

    print("\nü§ñ Generating answer...")

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions based on video transcripts."
    }
    user_message = {
        "role": "user",
        "content": f"""Based on the following video transcript excerpts, answer the question.

Context from video:
{context}

Question: {question}

Answer based only on the information provided above."""
    }

    completion = Groq(api_key=API_KEY).chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[system_message, user_message],
        temperature=0.3,
        max_tokens=1024
    )
    answer = completion.choices[0].message.content

    banner = "=" * 70
    print("\n" + banner)
    print("üí° ANSWER:")
    print(banner)
    print(answer)
    print(banner)

    return answer

# ============= USE IT =============

# Example question; the returned answer is echoed as the cell's output.
rag_query(question="Can you explain the Bluetooth topic mentioned in the video?")

Loading model...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 569.05it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Generating embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:08<00:00,  1.59it/s]


‚úì Deleted old collection with wrong dimensions
‚úì Database ready with 399 embeddings!
‚úì Embedding dimension: 384

üîç Searching for: Can you explain the Bluetooth topic mentioned in the video?

üìö Retrieved chunks:
  1. When we send one from the mobile phone, the Arduino turns the LED on. 
01:06
When we send 0, the Ard...
  2. Hello everyone, welcome to Wiz Robo. 
00:09
Today we are going to learn a very exciting project. 
00...
  3. 02:01
It helps students understand serial communication, Bluetooth technology, and digital output co...

ü§ñ Generating answer...

üí° ANSWER:
The video discusses Bluetooth technology in the context of controlling devices wirelessly using Arduino and a smartphone. Specifically, it mentions two projects: 

1. A Bluetooth controlled LED system, where sending "1" from a mobile phone turns the LED on and sending "0" turns it off. 
2. A Bluetooth controlled RGB LED system, where an HC05 Bluetooth module is used to send commands from a mobile phone to 

'The video discusses Bluetooth technology in the context of controlling devices wirelessly using Arduino and a smartphone. Specifically, it mentions two projects: \n\n1. A Bluetooth controlled LED system, where sending "1" from a mobile phone turns the LED on and sending "0" turns it off. \n2. A Bluetooth controlled RGB LED system, where an HC05 Bluetooth module is used to send commands from a mobile phone to the Arduino, allowing for wireless control of different colors.\n\nThe projects demonstrate how Bluetooth technology can be used for wireless communication and control of electronic devices, which has applications in real-life scenarios such as home automation, smart lights, and remote-controlled appliances.'

In [1]:
import google.generativeai as genai

import os

# The API key is read from the environment so the secret never lives in the
# notebook. Export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded on this line has been
# exposed and should be revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# List every available model together with the generation methods it supports.
models = genai.list_models()
# The loop variable is deliberately not called `model`: the RAG cell binds the
# global `model` to its SentenceTransformer and rag_query reads it, so
# overwriting it here would break later rag_query calls in the same kernel.
for model_info in models:
    print(model_info.name, model_info.supported_generation_methods)


  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


models/gemini-2.5-flash ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.5-pro ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-001 ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-exp-image-generation ['generateContent', 'countTokens', 'bidiGenerateContent']
models/gemini-2.0-flash-lite-001 ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-lite ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-exp-1206 ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.5-flash-preview-tts ['countTokens', 'generateContent']
models/gemini-2.5-pro-preview-tts ['countTokens', 'generateContent', 