In [3]:
import pandas as pd
import time
import json
from youtubesearchpython import VideosSearch
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import ollama
import logging
import requests

ERROR! Session/line number was not unique in database. History logging moved to new session 22


In [4]:
transcript_list = YouTubeTranscriptApi.list_transcripts('lbBDe4UJIgw')

In [11]:
full_transcript_segments = transcript_list.find_generated_transcript(['en', 'hi']).fetch()

In [12]:
full_transcript_segments

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='गाड़ी चली है सिर्फ 2,000 कि.मी. विद', start=0.08, duration=4.48), FetchedTranscriptSnippet(text='सर्विस रिकॉर्ड 12,000 चली है कंपनी', start=2.72, duration=3.76), FetchedTranscriptSnippet(text='वारंटी के साथ। गाड़ी चली है सिर्फ 3,000', start=4.56, duration=4.72), FetchedTranscriptSnippet(text='कि.मी. बस मात्र सर्विस रिकॉर्ड ये पंच', start=6.48, duration=5.199), FetchedTranscriptSnippet(text='में हमारे पास टोटल तीन कार हैं। सारे ही', start=9.28, duration=4.64), FetchedTranscriptSnippet(text='लगभग 5 7000 के आसपास चली है। भाई इतनी', start=11.679, duration=4.241), FetchedTranscriptSnippet(text='ज्यादा मॉडिफाइड civic आपने एक बारी सर', start=13.92, duration=5.359), FetchedTranscriptSnippet(text='इसकी भाई न्यू शेप Tata Nexon पे 6.5 लाख', start=15.92, duration=5.76), FetchedTranscriptSnippet(text='बचा दी है आपके सिर्फ एक से डेढ़ सालों के', start=19.279, duration=4.16), FetchedTranscriptSnippet(text='अंदर। Baleno सर हम बजट में लेके आए है

In [13]:
transcript_text = " ".join([segment.text for segment in full_transcript_segments])

In [14]:
transcript_text

'गाड़ी चली है सिर्फ 2,000 कि.मी. विद सर्विस रिकॉर्ड 12,000 चली है कंपनी वारंटी के साथ। गाड़ी चली है सिर्फ 3,000 कि.मी. बस मात्र सर्विस रिकॉर्ड ये पंच में हमारे पास टोटल तीन कार हैं। सारे ही लगभग 5 7000 के आसपास चली है। भाई इतनी ज्यादा मॉडिफाइड civic आपने एक बारी सर इसकी भाई न्यू शेप Tata Nexon पे 6.5 लाख बचा दी है आपके सिर्फ एक से डेढ़ सालों के अंदर। Baleno सर हम बजट में लेके आए हैं 5,15,000 की रेंज में। हर गाड़ी के ऊपर ज़ूम व्हील्स की 3 मंथ्स की वारंटी मिलती है। 5 लाख की रेंज में आपको इस लेवल की गाड़ी नहीं मिलेगी। ₹13,75,000 में आपको 4 बाय4 ब्लैक ब्यूटी की Thar मिल रही है। थ्री डोर अनरजिस्टर्ड गाड़ी है। Slaviया है। फर्स्ट ओनर आप बनोगे। Pan ओवर इंडिया अवेलेबल है। आपकी प्रोफाइल में अगर है दम तो ज़ूम व्यू करवा के देगा उसको फाइनेंस। ₹6400 में। ₹6,40,000 में टॉप वेरिएंट। 5200 कि.मी. कई लाखों बचा देंगे। [संगीत] भाई पूरे दिल्ली की सबसे बढ़िया-बढ़िया गाड़ियां दिखाने वाला हूं आज आपको इस वीडियो में। मोस्टली गाड़ियां होने वाली है। न्यू न्यू शेप की पैसे बचने वाले हैं। बहुत सारे आपके तो ये वीडियो बहुत 

In [17]:
def create_llm_prompt(query, transcript_text):
    """Creates the detailed prompt for the LLM (Llama 3 focus)."""
    parts = query.split(" in ")
    product_desc = parts[0]
    location_context = parts[1] if len(parts) > 1 else "the specified location"

    # Llama 3 works well with clear instructions and JSON format requests in the prompt
    prompt = f"""
You are a meticulous Information Extraction Specialist tasked with analyzing YouTube video transcripts. Your goal is to identify and extract comprehensive details about specific products mentioned.

The user is specifically interested in: **"{product_desc}"**
Optional Location Context: **"{location_context}"** (Focus extraction on items relevant to this location if provided and mentioned).

**Core Task:**
Analyze the provided **{transcript_text}** below. Identify every distinct instance of **"{product_desc}"** mentioned. For each distinct item found that matches the description and location context (if provided), extract the following details **ONLY IF explicitly mentioned in the transcript**:

1.  **Specific Item Identification:** The exact name or description used for the item in the transcript (e.g., "2018 Maruti Swift VXI", "Blue Honda Activa 5G"). This should be specific enough to distinguish it from other similar items if possible.
2.  **Detailed Specifications & Features:** Capture ALL mentioned characteristics relevant to the item. Examples include, but are not limited to:
    *   Model Year / Registration Year
    *   Kilometers Driven (Mileage)
    *   Fuel Type (Petrol, Diesel, CNG, EV)
    *   Transmission (Manual, Automatic, AMT)
    *   Variant / Trim Level (e.g., LXI, VXI, ZXI, SX(O))
    *   Condition (e.g., "excellent condition," "minor scratches," "single owner," "non-accidental")
    *   Color
    *   Engine Size/Type
    *   Insurance Details (e.g., "valid insurance," "insurance expired")
    *   Tyre Condition
    *   Any other unique features highlighted.
3.  **Price Information:** The asking price or price range. Note the currency if explicitly stated (default to INR if context strongly implies India, otherwise state the mentioned currency or leave blank if ambiguous). Mention if "negotiable" is stated.
4.  **Availability & Location:** Confirm if the item is explicitly mentioned as "for sale," "available," or similar. Capture dealership names or specific locations mentioned *if they relate to the {location_context} or provide specific sourcing*. Note: If an item is mentioned as "sold," do *not* include it.
5.  **Contact Details:** Extract any phone numbers, email addresses, physical addresses, website URLs, or explicit calls to action (e.g., "Call Mr. Sharma at...") associated *directly* with the item or seller.

**Input Transcript Snippet:**
{transcript_text[:15000]}

**Output Instructions:**

*   **Consolidate Information:** Combine all extracted details (Specifications, Price, Availability, Contact) for the *same specific item* into a *single* JSON object, even if mentioned across multiple sentences.
*   **Focus:** Extract information ONLY relevant to **"{product_desc}"** and potentially filtered by **"{location_context}"**. Ignore other product types or irrelevant chatter.
*   **Accuracy:** Only include details explicitly stated in the transcript. Do not infer or add information not present.
*   **Format:** Respond ONLY with a valid JSON list of objects. Each object represents one distinct item identified.
*   **Schema:** Each JSON object MUST use the following keys:
    *   `item_description`: (String) The specific item identified (Point 1).
    *   `specification`: (String) A consolidated summary of all extracted features and specifications (Point 2). Be comprehensive.
    *   `price`: (String) The extracted price information (Point 3). Include currency/negotiability if mentioned.
    *   `dealer_location`: (String) Availability status and seller/location details (Point 4).
    *   `contact`: (String) Associated contact information (Point 5).
*   **Empty Result:** If NO relevant items matching **"{product_desc}"** (and **"{location_context}"** if applicable) are found with explicitly mentioned details, output an empty JSON list: `[]`.
*   **Strict Formatting:** Ensure the entire output starts *exactly* with `[` and ends *exactly* with `]`. No introductory text, explanations, or summaries before or after the JSON list.
"""
    return prompt

In [18]:
def analyze_transcript_with_ollama(query, transcript_text):
    """Sends transcript to local Ollama model and parses the structured response."""
    if not transcript_text:
        return []

    # Generate the user prompt content
    user_prompt_content = create_llm_prompt(query, transcript_text)

    try:
        # Use ollama.chat for better control with system/user roles
        response = ollama.chat(
            model='llama3.1:latest',
            messages=[
                {
                    'role': 'system',
                    # Define the expected behavior clearly for Llama 3
                    'content': 'You are an expert information extractor. Your goal is to analyze the user-provided transcript based on their query and return ONLY a valid JSON list of findings as specified in the user prompt. Do not copy the example JSON directly. Use it only as a reference for formatting. Analyze the transcript and generate a unique response based on the input. Do not include any introductory text, explanations, or markdown formatting around the JSON.',
                },
                {
                    'role': 'user',
                    'content': user_prompt_content,
                },
            ],
            format='json',  # Instruct Ollama to attempt generating valid JSON
            options={       # Optional: Adjust generation parameters if needed
                'temperature': 0.2,
                # 'num_predict': 1024 # Limit output length if necessary
            }
        )

        content = response['message']['content'].strip()

        # --- JSON Parsing Logic (mostly the same as before) ---
        # Debug: Print raw response
        # print("--- Ollama Raw Response ---")
        # print(content)
        # print("---------------------------")

        if not content:
            logging.warning("Ollama returned an empty response.")
            return []

        # Basic cleanup: sometimes models still wrap in markdown
        if content.startswith("```json"):
            content = content.strip("```json").strip("`").strip()
        elif content.startswith("```"):
            content = content.strip("```").strip()

        # Attempt to parse the JSON
        try:
            # Find the start, handle potential leading text if model ignored instruction
            json_start = content.find('[')
            if json_start == -1:
                # Maybe it returned a single object instead of a list? Less likely with prompt.
                json_start = content.find('{')
                if json_start == -1:
                    logging.warning("Ollama response did not contain detectable JSON start '[' or '{'.")
                    print(f"Ollama Response (no JSON start): {content}")
                    return []

            # Find the corresponding end bracket/brace (crude matching)
            if content[json_start] == '[':
                json_end = content.rfind(']')
            else:
                json_end = content.rfind('}')

            if json_end == -1 or json_end < json_start:
                logging.warning("Ollama response JSON structure incomplete (missing end bracket/brace).")
                print(f"Ollama Response (incomplete JSON): {content}")
                return []

            json_string = content[json_start : json_end + 1]

            # print(f"Attempting to parse JSON from Ollama: {json_string}") # Debugging
            extracted_data = json.loads(json_string)

            # Validate structure (same as before)
            if isinstance(extracted_data, dict): extracted_data = [extracted_data] # Wrap single object
            validated_data = []
            if isinstance(extracted_data, list):
                for item in extracted_data:
                    if isinstance(item, dict) and all(k in item for k in ['specification', 'price', 'dealer', 'contact']):
                        if 'item_description' not in item: item['item_description'] = "Unknown Item"
                        validated_data.append(item)
                    else:
                        logging.warning(f"Ollama returned item with unexpected structure: {item}")
                        print(f"Ollama returned item with unexpected structure: {item}") # Debug
            return validated_data

        except json.JSONDecodeError as json_err:
            logging.error(f"Failed to parse Ollama response as JSON.")
            logging.error(f"Error: {json_err}")
            logging.error(f"Ollama Raw Response Content was:\n```\n{content}\n```")
            return []
        except Exception as e: # Catch other potential parsing errors
            logging.error(f"Error processing Ollama JSON response: {e}")
            logging.error(f"Ollama Raw Response Content was:\n```\n{content}\n```")
            return []

    # Specific Ollama connection errors (using requests check earlier helps)
    except requests.exceptions.ConnectionError: # Catch if check failed somehow
        logging.error(f"Ollama server connection failed at http://localhost:11434. Is it running?")
        return []
    except Exception as e: # Catch errors from ollama library or other issues
        logging.error(f"An error occurred during Ollama analysis: {e}")
        # Potentially log the full exception traceback here for debugging
        # import traceback
        # st.exception(e)
        return []

In [19]:
analyze_transcript_with_ollama("car in delhi", transcript_text)



Ollama returned item with unexpected structure: {}


[]