In [1]:
#!/usr/bin/env python3
"""
Filter out transcript entries with unavailable transcripts message
"""

import json
from pathlib import Path

def filter_transcripts(input_file, output_file):
    """Remove entries with unavailable transcript message"""
    
    # The text to look for and remove
    unavailable_text = "TranscriptCOPYTranscripts aren't available for this video. The publisher may"
    
    # Read the input JSON
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"📊 Original entries: {len(data)}")
    
    # Filter out entries containing the unavailable text
    filtered_data = []
    removed_count = 0
    
    for entry in data:
        transcript = entry.get('transcript', '')
        
        if unavailable_text in transcript:
            print(f"❌ Removing: {entry.get('title', 'Unknown')[:50]}...")
            removed_count += 1
        else:
            filtered_data.append(entry)
    
    # Save filtered data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
    
    print(f"✅ Filtered entries: {len(filtered_data)}")
    print(f"🗑️  Removed entries: {removed_count}")
    print(f"💾 Saved to: {output_file}")

# Usage
if __name__ == "__main__":
    input_file = "kome_transcripts.json"  # Your current transcript file
    output_file = "kome_transcripts_filtered.json"  # Filtered output
    
    filter_transcripts(input_file, output_file)


📊 Original entries: 33
❌ Removing: LabelGPT: The Ultimate Auto Annotation Tool...
❌ Removing: LabelGPT: Turns raw images into labeled images in ...
❌ Removing: Labellerr: Unleash the power of automation in trai...
❌ Removing: Labellerr Feature demo 4: Copy Previous file to sp...
❌ Removing: Labellerr Feature Demo 3: Autolabel...
❌ Removing: Labellerr feture demo 2: Quick Review...
❌ Removing: Labellerr Feature demo 1: Search filters for files...
❌ Removing: Track Safety project: Project setup and Image anno...
❌ Removing: Image Annotation for Hard Hat Object Detection: De...
❌ Removing: Data Annotation for fitness AI model training: pos...
❌ Removing: How Conversational AI and IOT are Redefining the F...
❌ Removing: Text Annotation for Named Entity Recognition using...
❌ Removing: Intelligent Document Processing Automation with Da...
❌ Removing: Revolutionizing entertainment industry with Labell...
❌ Removing: Labellerr, AI in Retail - Supply Chain Machine Lea...
❌ Removing: Attribute 

In [7]:
#!/usr/bin/env python3
"""
Translate transcripts using Deep Translator (more reliable)
"""

import json
from pathlib import Path
from deep_translator import GoogleTranslator
import time

def translate_with_deep_translator(text, source='hi', target='en'):
    """Translate using Deep Translator"""
    try:
        translator = GoogleTranslator(source=source, target=target)
        
        # Handle long texts
        if len(text) > 4500:
            chunks = [text[i:i+4500] for i in range(0, len(text), 4500)]
            translated_chunks = []
            
            for chunk in chunks:
                translated = translator.translate(chunk)
                translated_chunks.append(translated)
                time.sleep(0.3)
            
            return ' '.join(translated_chunks)
        else:
            return translator.translate(text)
            
    except Exception as e:
        print(f"Translation error: {e}")
        return text

def batch_translate_transcripts(input_file, output_file):
    """Batch translate all transcripts"""
    
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"🔄 Translating {len(data)} transcripts...")
    
    for i, entry in enumerate(data, 1):
        transcript = entry.get('transcript', '')
        
        if transcript and len(transcript.strip()) > 20:
            print(f"[{i}/{len(data)}] {entry.get('title', 'Unknown')[:40]}...")
            
            # Clean up the transcript text
            cleaned_text = transcript.replace('\\n', ' ').strip()
            
            # Translate
            english_translation = translate_with_deep_translator(cleaned_text)
            
            # Update entry
            entry['transcript_english'] = english_translation
            entry['transcript_original'] = transcript
            entry['translation_status'] = 'success'
            
            print(f"  ✅ Translated {len(english_translation)} characters")
        else:
            entry['transcript_english'] = transcript
            entry['transcript_original'] = transcript
            entry['translation_status'] = 'skipped'
        
        time.sleep(0.5)  # Rate limiting
    
    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    
    print(f"💾 Saved to: {output_file}")

# Install deep-translator if needed
try:
    from deep_translator import GoogleTranslator
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "deep-translator"])
    from deep_translator import GoogleTranslator

# Usage
if __name__ == "__main__":
    batch_translate_transcripts("kome_transcripts_filtered.json", "kome_transcripts_english.json")


🔄 Translating 4 transcripts...
[1/4] Top Object Annotation Services | AI-Powe...
  ✅ Translated 939 characters
[2/4] Enhance Accuracy with Advanced Image Dis...
  ✅ Translated 2171 characters
[3/4] Labellerr Podcast: Insights from Soma Dh...
Translation error: Request exception can happen due to an api connection error. Please check your connection and try again
  ✅ Translated 7272 characters
[4/4] AI-Powered Data Annotation Services and ...
  ✅ Translated 549 characters
💾 Saved to: kome_transcripts_english.json


In [1]:
import json

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def remove_duplicates(json1, json2, key='url'):
    combined = json1 + json2
    seen = set()
    unique_entries = []
    for entry in combined:
        identifier = entry.get(key)
        if identifier and identifier not in seen:
            seen.add(identifier)
            unique_entries.append(entry)
    return unique_entries

# Usage Example
if __name__ == "__main__":
    json_file_1 = "data_ingest/raw/documentation/doc_headings_simple.json"  # Replace with your first JSON file path
    json_file_2 = "data_ingest/raw/documentation/labellerr_documentation_headings.json"  # Replace with your second JSON file path
    output_file = "combined_unique.json"
    
    data1 = load_json(json_file_1)
    data2 = load_json(json_file_2)
    
    unique_data = remove_duplicates(data1, data2, key='url')  # Use 'url' or other unique key relevant for your data
    
    save_json(unique_data, output_file)
    
    print(f"Duplicates removed. Combined {len(unique_data)} unique entries saved in {output_file}.")


Duplicates removed. Combined 4062 unique entries saved in combined_unique.json.
