## AWS S3 Audio Upload

In [None]:
import boto3
import os
import pandas as pd
from pydub import AudioSegment

# Create a low-level S3 client (credentials/region are resolved by boto3's
# default configuration chain; nothing is configured explicitly here).
s3 = boto3.client('s3')

# Name of the S3 bucket that receives the audio files
bucket_name = 'ecocaudio'

# Local directory containing the audio files to upload.
# NOTE(review): absolute, machine-specific path — the cell only runs on this workstation.
local_directory = '/Users/vrajpatel/Desktop/work/IBS/Komal_Kaur/HindiSTT/Code/ECOC/ElectionConduct_Audio'

# Path to the video_info.xlsx tracking spreadsheet (read below, then overwritten in place).
# NOTE(review): absolute, machine-specific path — see above.
excel_file_path = '/Users/vrajpatel/Desktop/work/IBS/Komal_Kaur/HindiSTT/Code/ECOC/video_info.xlsx'

def list_s3_files(bucket_name):
    """Return the keys of every object in the S3 bucket.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the
    listing is paginated to make sure large buckets are reported in full.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        A list of object keys (strings); empty when the bucket has no objects.
    """
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name):
        # Pages of an empty bucket carry no 'Contents' entry at all.
        keys.extend(item['Key'] for item in page.get('Contents', []))
    return keys

def list_local_files(directory):
    """Return the names of the regular files found directly in ``directory``.

    Sub-directories are skipped; the listing is not recursive and the names
    are returned without the directory prefix.
    """
    regular_files = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isfile(full_path):
            regular_files.append(entry)
    return regular_files

def strip_extension(file_name):
    """Reduce a file name to a comparable stem.

    Everything from the last '.' onwards is dropped, then everything from the
    last '_16000hz' marker onwards is dropped. Names containing neither are
    returned unchanged.
    """
    stem, dot, _ = file_name.rpartition('.')
    if not dot:
        # No '.' at all: rpartition puts the whole name in the tail.
        stem = file_name

    base, marker, _ = stem.rpartition('_16000hz')
    return base if marker else stem

def convert_to_16000hz(file_path):
    """Resample the audio file at ``file_path`` to 16000 Hz, in place.

    Files already at 16000 Hz are left untouched. Otherwise the file is
    overwritten with the resampled audio.

    NOTE(review): the export always uses format "wav" and writes back to the
    same path, whatever the original extension was — confirm inputs are WAV.
    """
    segment = AudioSegment.from_file(file_path)

    if segment.frame_rate == 16000:
        print(f'{file_path} is already in 16000 Hz')
        return

    resampled = segment.set_frame_rate(16000)
    resampled.export(file_path, format="wav")
    print(f'Converted {file_path} to 16000 Hz')

def upload_new_files(local_directory, bucket_name, df):
    """Upload local files that are missing from the bucket and flag them in ``df``.

    Every regular file in ``local_directory`` whose name is not already a key
    in the bucket is resampled to 16000 Hz (in place), uploaded under its own
    file name, and the rows of ``df`` whose 'Title' stem matches the file stem
    get 'Uploaded' set to True. ``df`` is modified in place.

    Args:
        local_directory: Directory holding the audio files.
        bucket_name: Destination S3 bucket.
        df: DataFrame with a 'Title' column; 'Uploaded' is written to.

    Returns:
        None. Conversion/upload failures are printed and the file is skipped.
    """
    # Set membership keeps the per-file "already uploaded?" check O(1).
    existing_keys = set(list_s3_files(bucket_name))

    # Title stems do not change while uploading, so compute them once.
    # Non-string titles (e.g. NaN from empty cells) can never match a file and
    # must not abort the bookkeeping for files that did upload.
    if 'Title' in df.columns:
        title_stems = [
            strip_extension(title) if isinstance(title, str) else None
            for title in df['Title']
        ]
    else:
        print("Warning: no 'Title' column found; uploaded files will not be marked")
        title_stems = [None] * len(df)

    for file_name in list_local_files(local_directory):
        if file_name in existing_keys:
            continue

        file_path = os.path.join(local_directory, file_name)
        try:
            # Convert the file to 16000 Hz if necessary, then upload it.
            convert_to_16000hz(file_path)
            s3.upload_file(file_path, bucket_name, file_name)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f'Error uploading {file_name}: {e}')
            continue

        print(f'Successfully uploaded {file_name}')

        # Mark the matching spreadsheet rows. This is kept outside the try so
        # a bookkeeping problem is never reported as an upload error.
        file_stem = strip_extension(file_name)
        matches = pd.Series(
            [stem == file_stem for stem in title_stems],
            index=df.index,
            dtype=bool,
        )
        df.loc[matches, 'Uploaded'] = True

# Load the tracking spreadsheet into a DataFrame
df = pd.read_excel(excel_file_path)

# Ensure the 'Uploaded' column exists; a freshly added column starts as all False
if 'Uploaded' not in df.columns:
    df['Uploaded'] = False

# Upload new files; upload_new_files mutates df in place to flag uploaded rows
upload_new_files(local_directory, bucket_name, df)

# Save the updated DataFrame back to the same Excel file (overwrites the input;
# index=False keeps the row index out of the sheet)
df.to_excel(excel_file_path, index=False)

print(f"Uploaded files have been marked in {excel_file_path}")


In [2]:
import json
import pandas as pd
import boto3
from io import BytesIO
from fuzzywuzzy import fuzz

# AWS S3 configuration: bucket and key prefix under which the transcription
# JSON files are looked up
s3_bucket_name = 'ecocaudio'
s3_prefix = 'transcriptions/'  # If your files are in a specific folder

# Path to your Excel file (local, relative to the notebook's working directory)
excel_file = 'video_info.xlsx'

# Initialize S3 client (rebinds the `s3` name used by the upload cell)
s3 = boto3.client('s3')

# Read the Excel file (rebinds `df`; the upload cell's DataFrame is replaced)
df = pd.read_excel(excel_file)

# Function to clean filename
def clean_filename(filename):
    """Normalise a transcription file name so it can be matched against a title.

    Strips one known job-name prefix and the '.json' suffix, turns underscores
    into spaces, trims surrounding whitespace and lower-cases the result.

    Args:
        filename: Base name of the transcription JSON file.

    Returns:
        The cleaned, lower-case name.
    """
    prefixes = ['transcribe_16HzAudio_LIVE__', 'transcribe_16HzAudio_LIVE___']
    suffixes = ['.json']

    # Try the longest prefix first: the two-underscore variant is itself a
    # prefix of the three-underscore one, so testing it first would leave a
    # stray '_' (and therefore a leading space) on three-underscore names.
    for prefix in sorted(prefixes, key=len, reverse=True):
        if filename.startswith(prefix):
            filename = filename[len(prefix):]
            break

    for suffix in suffixes:
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
            break

    # strip() removes padding left by leading/trailing underscores.
    return filename.replace('_', ' ').strip().lower()

# Create a dictionary to store filename: (transcript, original_filename) pairs.
# Keys are the cleaned (lower-cased, de-underscored) file names.
transcripts = {}

# List objects in the S3 bucket under the configured prefix, page by page
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=s3_prefix):
    # Pages without a 'Contents' entry are treated as empty
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.json'):
            # Get the JSON file content
            response = s3.get_object(Bucket=s3_bucket_name, Key=obj['Key'])
            file_content = response['Body'].read().decode('utf-8')
            data = json.loads(file_content)
            
            # Only the first entry of results.transcripts is used.
            # NOTE(review): raises KeyError/IndexError if a JSON file lacks this
            # structure — presumably every file is a completed transcription job; confirm.
            transcript = data['results']['transcripts'][0]['transcript']
            # Base name of the key (text after the last '/')
            original_filename = obj['Key'].split('/')[-1]
            cleaned_filename = clean_filename(original_filename)
            # Two files that clean to the same name overwrite each other here
            transcripts[cleaned_filename] = (transcript, original_filename)

# Function to find best match and return transcript and matched filename
def get_transcript_and_filename(row):
    """Find the transcript whose cleaned file name best matches the row's title.

    Args:
        row: Mapping/Series with a 'Title' entry.

    Returns:
        ``(transcript, original_filename)`` of the best fuzzy match when its
        ``fuzz.ratio`` score is above 70, otherwise ``('', '')``. Rows without
        a string title, or an empty ``transcripts`` table, also give ``('', '')``.
    """
    title = row['Title']
    # Empty spreadsheet cells arrive as NaN (a float), and max() cannot work on
    # an empty table; both simply mean "no match".
    if not isinstance(title, str) or not transcripts:
        return '', ''

    title = title.lower()
    # Score every candidate once; max() keeps the first of any tied candidates.
    best_match, best_score = max(
        ((name, fuzz.ratio(name, title)) for name in transcripts),
        key=lambda scored: scored[1],
    )
    if best_score > 70:  # You can adjust this threshold
        return transcripts[best_match]
    return '', ''

# Add transcripts and matched filenames to the DataFrame: apply() yields one
# (transcript, filename) pair per row and zip(*...) transposes them into two columns.
# NOTE(review): with zero rows zip(*...) produces nothing to unpack and this
# line would raise ValueError — confirm the sheet is never empty.
df['Transcript'], df['Matched Filename'] = zip(*df.apply(get_transcript_and_filename, axis=1))

# Save the updated DataFrame to a new file named 'updated_<base name>' in the
# current working directory (the input file is left untouched)
df.to_excel('updated_' + excel_file.split('/')[-1], index=False)

print("Process completed. Updated Excel file saved.")

Process completed. Updated Excel file saved.


In [4]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError

# Set up AWS Translate client (module-level; used by translate_text below)
translate = boto3.client('translate')

def _split_oversized(segment, max_bytes):
    """Cut one segment into pieces of at most ``max_bytes`` UTF-8 bytes.

    Cuts are made at the last space inside the byte window when there is one
    (the space stays at the start of the following piece); otherwise at the
    nearest character boundary at or before the window end. The pieces
    concatenate back to ``segment``.
    """
    data = segment.encode('utf-8')
    pieces = []
    while len(data) > max_bytes:
        cut = data.rfind(b' ', 0, max_bytes)
        if cut <= 0:
            # No space, or only a leading one: cutting there would make no
            # progress, so hard-cut at the window end instead, backing off
            # over UTF-8 continuation bytes (0b10xxxxxx) so that a multi-byte
            # character is never split.
            cut = max_bytes
            while cut > 0 and (data[cut] & 0xC0) == 0x80:
                cut -= 1
            if cut == 0:
                # Window is narrower than the first character: emit that one
                # character whole rather than producing invalid UTF-8.
                cut = 1
                while cut < len(data) and (data[cut] & 0xC0) == 0x80:
                    cut += 1
        pieces.append(data[:cut].decode('utf-8'))
        data = data[cut:]
    pieces.append(data.decode('utf-8'))
    return pieces


def split_text(text, max_bytes=9000, delimiter='|'):
    """Split text into chunks of maximum size in bytes, using the delimiter.

    Segments (the pieces between delimiters) are packed greedily into chunks
    whose UTF-8 encoding fits in ``max_bytes``. A chunk that is closed because
    the next segment does not fit ends with the delimiter. A single segment
    larger than ``max_bytes`` is cut further, preferably at spaces and never
    in the middle of a character.

    Args:
        text: Text to split.
        max_bytes: Maximum UTF-8 size of a chunk's content; must be >= 1.
        delimiter: Separator that marks preferred split points.

    Returns:
        List of non-empty chunk strings (empty list for empty text).

    Raises:
        ValueError: If ``max_bytes`` is smaller than 1.
    """
    if max_bytes < 1:
        raise ValueError("max_bytes must be at least 1")

    chunks = []
    current_chunk = ""

    for segment in text.split(delimiter):
        candidate = current_chunk + delimiter + segment if current_chunk else segment
        if len(candidate.encode('utf-8')) <= max_bytes:
            current_chunk = candidate
            continue

        # The segment does not fit: close the chunk built so far ...
        if current_chunk:
            chunks.append(current_chunk + delimiter)

        # ... and start over with the segment, cutting it down first if it is
        # itself too large (whether or not a chunk preceded it).
        pieces = _split_oversized(segment, max_bytes)
        chunks.extend(pieces[:-1])
        current_chunk = pieces[-1]

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def translate_text(text, source_lang='hi', target_lang='en', delimiter='|'):
    """Translate text using AWS Translate, handling large texts with delimiter.

    The text is cut into chunks with ``split_text``, each chunk is translated
    separately, and the translations are joined so that exactly one delimiter
    separates neighbouring chunks.

    Args:
        text: Text to translate.
        source_lang: Source language code passed to AWS Translate.
        target_lang: Target language code passed to AWS Translate.
        delimiter: Separator used for chunking and for re-joining.

    Returns:
        The joined translation ('' when there was nothing to translate), or
        None if AWS Translate reported a ClientError for any chunk.
    """
    try:
        translated_chunks = []
        for chunk in split_text(text, delimiter=delimiter):
            response = translate.translate_text(
                Text=chunk,
                SourceLanguageCode=source_lang,
                TargetLanguageCode=target_lang
            )
            translated_chunks.append(response['TranslatedText'])
    except ClientError as e:
        print(f"Error translating text: {e}")
        return None

    if not translated_chunks:
        return ''

    # Chunks closed at a delimiter boundary already end with the delimiter;
    # only add one where it is missing so it is never doubled.
    parts = [
        piece if piece.endswith(delimiter) else piece + delimiter
        for piece in translated_chunks[:-1]
    ]
    parts.append(translated_chunks[-1])
    return ''.join(parts)

# Load the Excel file produced by the transcript-matching cell
excel_file = 'updated_video_info.xlsx'  # Replace with your actual file name
df = pd.read_excel(excel_file)

# Add a new column for translations; rows that are skipped or fail keep ''
df['Translation'] = ''

# Translate each transcript, one row at a time
for index, row in df.iterrows():
    # Rows whose 'Transcript' cell is missing (NaN) are skipped
    if pd.notna(row['Transcript']):
        translation = translate_text(row['Transcript'])
        # translate_text returns None on a ClientError; an empty string is
        # treated as a failure here as well
        if translation:
            df.at[index, 'Translation'] = translation
            print(f"Successfully translated transcript for {row['Title']}")
        else:
            print(f"Failed to translate transcript for {row['Title']}")
    else:
        print(f"No transcript found for {row['Title']}")

# Save the updated DataFrame to a new Excel file; with the name above this is
# 'updated_updated_video_info.xlsx'
output_file = 'updated_' + excel_file
df.to_excel(output_file, index=False)

print(f"Processing completed. Updated Excel file saved as '{output_file}'")

Successfully translated transcript for PM Modi's speech at BJP HQ after NDA win in 2024 Lok Sabha Elections
Successfully translated transcript for PM Modi addresses a public meeting in Hoshiarpur, Punjab
Successfully translated transcript for PM Modi addresses a public meeting in Kendrapara, Odisha
Successfully translated transcript for PM Modi addresses a public meeting in Balasore, Odisha
Successfully translated transcript for PM Modi addresses a public meeting in Mayurbhanj, Odisha
Successfully translated transcript for PM Modi addresses a public meeting in Mathurapur, West Bengal
Successfully translated transcript for PM Modi addresses a public meeting in Jadavpur, West Bengal
Successfully translated transcript for PM Modi addresses a public meeting in Barasat, West Bengal
Successfully translated transcript for PM Modi addresses a public meeting in Dumka, Jharkhand
Successfully translated transcript for PM Modi addresses a public meeting in Bansgaon, Uttar Pradesh
Successfully tran