In [2]:
#| default_exp audio

In [3]:

%load_ext autoreload
%autoreload 2

# Prepend /app to the module search path so code located there can be imported.
import sys
sys.path.insert(0,'/app')

In [4]:
##
import asyncio
from pydub import AudioSegment
import io
import subprocess
import json

In [5]:
##

class AudioSlicer:
    """In-memory audio clip backed by a pydub ``AudioSegment``.

    Blocking work (file reads, ffmpeg invocations, encoding and decoding) is
    pushed to worker threads with ``asyncio.to_thread`` so the event loop is
    not stalled.

    Attributes:
        format: Format name handed to pydub when decoding input bytes and when
            exporting with ``export2file`` (e.g. ``"mp3"``, ``"webm"``).
        audio: The decoded ``AudioSegment``, or ``None`` when no data was given.
    """

    def __init__(self, data=None, format="mp3"):
        """Decode ``data`` (encoded audio bytes) as ``format``.

        Args:
            data: Encoded audio bytes, or ``None`` to start without audio.
            format: Format name passed to ``AudioSegment.from_file``.
        """
        self.format = format
        self.audio = (
            AudioSegment.from_file(io.BytesIO(data), format=format)
            if data is not None
            else None
        )

    @classmethod
    async def from_file(cls, file_path, format="mp3"):
        """Read ``file_path`` and build an instance from its contents.

        Both the read and the decode happen in a worker thread; decoding in the
        coroutine itself would block the event loop.
        """
        def load(file_path):
            with open(file_path, "rb") as file:
                return cls(file.read(), format)

        return await asyncio.to_thread(load, file_path)

    @classmethod
    async def from_ffmpeg_slice(cls, path, start, duration, format="mp3"):
        """Cut ``duration`` seconds starting at ``start`` out of ``path`` with ffmpeg.

        Args:
            path: Input media file.
            start: Offset passed to ffmpeg's ``-ss``.
            duration: Length passed to ffmpeg's ``-t``.
            format: Output container passed to ``-f`` and used to decode the result.

        Raises:
            RuntimeError: ffmpeg exited with a non-zero status; the message
                carries ffmpeg's stderr.
        """
        def slice_and_load(path, start, duration):
            command = ['ffmpeg', '-ss', str(start), '-t', str(duration),
                       '-i', str(path), '-f', format]
            # libmp3lame only makes sense for mp3 output; for any other
            # container let ffmpeg pick that container's default codec.
            if format == 'mp3':
                command += ['-acodec', 'libmp3lame']
            command.append('-')
            result = subprocess.run(
                command,
                stdin=subprocess.DEVNULL,  # ffmpeg must not wait on our stdin
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if result.returncode != 0:
                raise RuntimeError(
                    f"ffmpeg exited with status {result.returncode}: "
                    f"{result.stderr.decode(errors='replace')}"
                )
            return cls(result.stdout, format)

        return await asyncio.to_thread(slice_and_load, path, start, duration)

    async def export2file(self, export_path, start=None, end=None):
        """Write the clip (or the ``start``..``end`` part, in seconds) to ``export_path``.

        The output is encoded with ``self.format``.
        """
        def export(segment, export_path):
            segment.export(export_path, format=self.format)

        segment = self.slice(start, end)
        await asyncio.to_thread(export, segment, export_path)

    async def export_data(self, start=None, end=None,format='mp3'):
        """Return the clip (or the ``start``..``end`` part, in seconds) as encoded bytes.

        Note that ``format`` defaults to ``'mp3'`` and is independent of
        ``self.format``.
        """
        def export(segment, buffer):
            segment.export(buffer, format=format)
            return buffer.getvalue()

        segment = self.slice(start, end)
        buffer = io.BytesIO()
        return await asyncio.to_thread(export, segment, buffer)

    # slice remains synchronous as it's a simple in-memory operation
    def slice(self, start=None, end=None):
        """Return the part of the audio between ``start`` and ``end`` (seconds).

        Either bound may be omitted: a missing ``start`` means the beginning, a
        missing ``end`` means the end of the clip. With neither bound the stored
        audio object itself is returned.
        """
        if start is None and end is None:
            return self.audio

        # The audio object is indexed in milliseconds; None leaves a bound open.
        start_millis = None if start is None else start * 1000
        end_millis = None if end is None else end * 1000
        return self.audio[start_millis:end_millis]

    async def append(self, additional_data):
        """Decode ``additional_data`` with ``self.format`` and add it to the end of the clip.

        Decoding runs in a worker thread; ``self.audio`` is only reassigned back
        in the coroutine so the instance is never mutated from another thread.
        An instance created without data simply adopts the new segment.
        """
        def decode(additional_data):
            return AudioSegment.from_file(io.BytesIO(additional_data), format=self.format)

        new_segment = await asyncio.to_thread(decode, additional_data)
        self.audio = new_segment if self.audio is None else self.audio + new_segment




In [6]:
##

# NOTE(review): this cell re-declares AudioSlicer and shadows the definition
# from the previous cell; keep a single copy of the class in the notebook.
class AudioSlicer:
    """In-memory audio clip backed by a pydub ``AudioSegment``.

    Blocking work (file reads, ffmpeg invocations, encoding and decoding) is
    pushed to worker threads with ``asyncio.to_thread`` so the event loop is
    not stalled.

    Attributes:
        format: Format name handed to pydub when decoding input bytes and when
            exporting with ``export2file`` (e.g. ``"mp3"``, ``"webm"``).
        audio: The decoded ``AudioSegment``, or ``None`` when no data was given.
    """

    def __init__(self, data=None, format="mp3"):
        """Decode ``data`` (encoded audio bytes) as ``format``.

        Args:
            data: Encoded audio bytes, or ``None`` to start without audio.
            format: Format name passed to ``AudioSegment.from_file``.
        """
        self.format = format
        self.audio = (
            AudioSegment.from_file(io.BytesIO(data), format=format)
            if data is not None
            else None
        )

    @classmethod
    async def from_file(cls, file_path, format="mp3"):
        """Read ``file_path`` and build an instance from its contents.

        Both the read and the decode happen in a worker thread; decoding in the
        coroutine itself would block the event loop.
        """
        def load(file_path):
            with open(file_path, "rb") as file:
                return cls(file.read(), format)

        return await asyncio.to_thread(load, file_path)

    @classmethod
    async def from_ffmpeg_slice(cls, path, start, duration, format="mp3"):
        """Cut ``duration`` seconds starting at ``start`` out of ``path`` with ffmpeg.

        Args:
            path: Input media file.
            start: Offset passed to ffmpeg's ``-ss``.
            duration: Length passed to ffmpeg's ``-t``.
            format: Output container passed to ``-f`` and used to decode the result.

        Raises:
            RuntimeError: ffmpeg exited with a non-zero status; the message
                carries ffmpeg's stderr.
        """
        def slice_and_load(path, start, duration):
            command = ['ffmpeg', '-ss', str(start), '-t', str(duration),
                       '-i', str(path), '-f', format]
            # libmp3lame only makes sense for mp3 output; for any other
            # container let ffmpeg pick that container's default codec.
            if format == 'mp3':
                command += ['-acodec', 'libmp3lame']
            command.append('-')
            result = subprocess.run(
                command,
                stdin=subprocess.DEVNULL,  # ffmpeg must not wait on our stdin
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if result.returncode != 0:
                raise RuntimeError(
                    f"ffmpeg exited with status {result.returncode}: "
                    f"{result.stderr.decode(errors='replace')}"
                )
            return cls(result.stdout, format)

        return await asyncio.to_thread(slice_and_load, path, start, duration)

    async def export2file(self, export_path, start=None, end=None):
        """Write the clip (or the ``start``..``end`` part, in seconds) to ``export_path``.

        The output is encoded with ``self.format``.
        """
        def export(segment, export_path):
            segment.export(export_path, format=self.format)

        segment = self.slice(start, end)
        await asyncio.to_thread(export, segment, export_path)

    async def export_data(self, start=None, end=None,format='mp3'):
        """Return the clip (or the ``start``..``end`` part, in seconds) as encoded bytes.

        Note that ``format`` defaults to ``'mp3'`` and is independent of
        ``self.format``.
        """
        def export(segment, buffer):
            segment.export(buffer, format=format)
            return buffer.getvalue()

        segment = self.slice(start, end)
        buffer = io.BytesIO()
        return await asyncio.to_thread(export, segment, buffer)

    # slice remains synchronous as it's a simple in-memory operation
    def slice(self, start=None, end=None):
        """Return the part of the audio between ``start`` and ``end`` (seconds).

        Either bound may be omitted: a missing ``start`` means the beginning, a
        missing ``end`` means the end of the clip. With neither bound the stored
        audio object itself is returned.
        """
        if start is None and end is None:
            return self.audio

        # The audio object is indexed in milliseconds; None leaves a bound open.
        start_millis = None if start is None else start * 1000
        end_millis = None if end is None else end * 1000
        return self.audio[start_millis:end_millis]

    async def append(self, additional_data):
        """Decode ``additional_data`` with ``self.format`` and add it to the end of the clip.

        Decoding runs in a worker thread; ``self.audio`` is only reassigned back
        in the coroutine so the instance is never mutated from another thread.
        An instance created without data simply adopts the new segment.
        """
        def decode(additional_data):
            return AudioSegment.from_file(io.BytesIO(additional_data), format=self.format)

        new_segment = await asyncio.to_thread(decode, additional_data)
        self.audio = new_segment if self.audio is None else self.audio + new_segment




In [7]:
##
async def writestream2file(conn_id, redis_client, audio_dir='/audio'):
    """Drain a connection's queued audio chunks from Redis into a WebM file.

    Items are popped with ``rpop`` from the list ``initialFeed_audio:{conn_id}``
    until it yields nothing. Each item is JSON whose ``chunk`` field holds the
    hex-encoded bytes, which are appended to ``{audio_dir}/{conn_id}.webm``.
    No file is created when the list is already empty.

    Args:
        conn_id: Connection identifier used in both the Redis key and the file name.
        redis_client: Client object exposing an awaitable ``rpop(key)``.
        audio_dir: Directory that receives the file (defaults to ``/audio``).
    """
    path = f'{audio_dir}/{conn_id}.webm'
    queue_key = f'initialFeed_audio:{conn_id}'

    def append_chunk(chunk):
        # Append mode: successive chunks extend the same file.
        with open(path, 'ab') as file:
            file.write(chunk)

    while True:
        item = await redis_client.rpop(queue_key)
        if not item:
            break
        chunk = bytes.fromhex(json.loads(item)['chunk'])
        # Disk writes block; run them off the event loop like the rest of this module.
        await asyncio.to_thread(append_chunk, chunk)

In [8]:
from pathlib import Path

# Convenience monkeypatch: Path(...).ls() lists a directory's entries.
# iterdir() yields entries in arbitrary order, so sort them to make results
# (and anything indexing into them, such as ls()[0]) reproducible between runs.
Path.ls = lambda self: sorted(self.iterdir())

In [9]:
# Take the first entry listed under /audio; raises IndexError if the directory is empty.
path = Path('/audio').ls()[0]

In [10]:
# Load the selected file as WebM (top-level await: requires an IPython/Jupyter kernel).
a =await AudioSlicer.from_file(path,  format='webm')


In [11]:
async def audio_len(path):
    """Return the length of the WebM file at ``path`` divided by 1000, or 0 on failure.

    The file is loaded through ``AudioSlicer.from_file`` and measured with
    ``len()`` on the decoded audio (pydub reports that in milliseconds, so the
    result is presumably seconds). Any ordinary error while reading or decoding
    yields 0, so unreadable files show up as zero-length entries.

    Only ``Exception`` is caught: task cancellation (``asyncio.CancelledError``)
    and ``KeyboardInterrupt`` must propagate instead of being reported as 0.
    """
    try:
        audio = await AudioSlicer.from_file(path,  format='webm')
        return len(audio.audio)/1000
    except Exception:
        return 0

In [12]:
# audio_len result for every entry under /audio, measured one file at a time
# (each await completes before the next file is started).
file_lengths = [await audio_len(path) for path in  Path('/audio').ls()]

In [13]:
# Combined length of all measured files.
sum(file_lengths)

400590.89999999997

In [15]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [20]:
# Single-column frame (column label 0) holding the per-file lengths.
df = pd.DataFrame(file_lengths)

In [24]:
# Summary statistics of the per-file lengths, including the 95th percentile.
df.describe(percentiles=[0.95])

Unnamed: 0,0
count,308.0
mean,1300.619805
std,1526.039979
min,0.0
50%,698.43
95%,3977.943
max,8863.02


In [27]:
# Column total; should agree with sum(file_lengths).
df.sum()

0    400590.9
dtype: float64

In [28]:
# NOTE(review): magic numbers — presumably about 300000 seconds converted to minutes; confirm and name the constants.
300000/60

5000.0

In [29]:
# NOTE(review): unexplained ratio (100 over the 5000 from the previous cell); document what it measures.
100/5000

0.02

In [26]:
# Combined length of the files shorter than 3977, roughly the 95% value (3977.943) reported by describe().
# NOTE(review): the threshold is hard-coded rather than computed from the data.
df[df[0]<3977].sum()

0    305711.1
dtype: float64

In [22]:
# Number of entries that were measured.
len(file_lengths)

308

In [7]:
# Decode the complete test recording as WebM.
path = '/app/testdata/david_audio.webm'
#path = '/home/dima/0/1_audio/testdata/david_audio.webm'
audio_obj = await AudioSlicer.from_file(path,'webm')

In [14]:
# Decode only a 10-second cut starting at offset 0, produced by ffmpeg (format left at its 'mp3' default).
# NOTE(review): rebinds audio_obj, which another cell assigns from the full file;
# later cells depend on which of the two ran last.
path = '/app/testdata/david_audio.webm'
#path = '/home/dima/0/1_audio/testdata/david_audio.webm'
audio_obj = await AudioSlicer.from_ffmpeg_slice(path,0,10)

In [15]:
# Duration of whatever audio_obj currently holds, in seconds.
audio_obj.audio.duration_seconds

10.032

In [9]:
# Same check as the previous cell; the value depends on which audio_obj assignment ran last.
audio_obj.audio.duration_seconds

3893.82

In [12]:
# Encode the final second of audio_obj as WebM bytes.
extra_segment = await audio_obj.export_data(audio_obj.audio.duration_seconds-1,audio_obj.audio.duration_seconds,'webm')

In [13]:
# NOTE(review): displays the entire byte string; consider showing only a prefix, e.g. extra_segment[:64].
extra_segment

b'\x1aE\xdf\xa3\x9fB\x86\x81\x01B\xf7\x81\x01B\xf2\x81\x04B\xf3\x81\x08B\x82\x84webmB\x87\x81\x04B\x85\x81\x02\x18S\x80g\x01\x00\x00\x00\x00\x00%\n\x11M\x9bt\xacM\xbb\x8bS\xab\x84\x15I\xa9fS\xac\x81\xe5M\xbb\x8cS\xab\x84\x16T\xaekS\xac\x82\x01\x1cM\xbb\x8cS\xab\x84\x12T\xc3gS\xac\x82\x01\x83\xec\x01\x00\x00\x00\x00\x00\x00\xab\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [15]:
def extract_webm_header(data: bytes) -> bytes:
    """
    Return the bytes of a WebM blob that precede its Segment element.

    Args:
    - data (bytes): Raw WebM content; it must begin with the EBML magic bytes.

    Returns:
    - bytes: Everything before the first occurrence of the Segment ID
      (0x18538067). The Segment ID itself is not part of the result.

    Raises:
    - ValueError: If `data` does not begin with the EBML magic bytes, or if no
      Segment ID occurs anywhere in `data`.
    """
    ebml_magic = b'\x1a\x45\xdf\xa3'
    segment_id = b'\x18\x53\x80\x67'

    if not data.startswith(ebml_magic):
        raise ValueError("Data does not start with a standard WebM header")

    # The EBML header has no fixed size, so it is delimited by searching for
    # the ID that opens the Segment element. partition() splits at the first
    # match and leaves the separator empty when there is none.
    header, found_id, _remainder = data.partition(segment_id)
    if not found_id:
        raise ValueError("Segment start marker not found in the data")

    return header

# Example usage: pull the header out of the one-second WebM clip held in
# `extra_segment` and show it.
webm_data = extra_segment  # Binary data of the WebM file
header = extract_webm_header(webm_data)
print(header)

b'\x1aE\xdf\xa3\x9fB\x86\x81\x01B\xf7\x81\x01B\xf2\x81\x04B\xf3\x81\x08B\x82\x84webmB\x87\x81\x04B\x85\x81\x02'


In [22]:
# Decode extra_segment using audio_obj.format and add it to the end of audio_obj.audio (mutates audio_obj).
await audio_obj.append(extra_segment)

In [20]:
# Confirm that export_data returned raw bytes.
type(extra_segment)

bytes

In [26]:
# Last 20 seconds of the clip; the returned segment is not assigned to anything.
audio_obj.slice(audio_obj.audio.duration_seconds-20,audio_obj.audio.duration_seconds)

In [34]:
# Write the whole clip to test.webm in the working directory, encoded with audio_obj.format.
await audio_obj.export2file('test.webm')

None
