## Extra Utility Scripts
Helper snippets for organizing files.

### Imports and Variables

In [1]:
import os
import re

# Folder that the file-organizing cells below operate on.
# Alternative location, kept for quick switching:
# directory = 'Extra_Codes_Directory'
directory = 'Voices'
# Create the folder up front; exist_ok=True makes this a no-op if it already exists.
os.makedirs(directory, exist_ok=True)

### File Renaming

In [None]:
def rename_files(start_number, directory, extension=".m4a"):
    """Shift a run of consecutively numbered files down by one.

    Starting at ``start_number``, every ``<n><extension>`` in ``directory`` is
    renamed to ``<n - 1><extension>``. The loop ends at the first number for
    which no file exists, or as soon as a rename would replace an existing
    file.

    Args:
        start_number: Number of the first file to rename.
        directory: Folder that contains the numbered files.
        extension: File suffix including the leading dot (default ``".m4a"``).

    Returns:
        None. Progress is reported with ``print``.
    """
    current = start_number

    while True:
        old_filename = os.path.join(directory, f"{current}{extension}")
        new_filename = os.path.join(directory, f"{current - 1}{extension}")

        if not os.path.exists(old_filename):
            print(f"No file found: {old_filename}. Stopping.")
            break

        # os.rename silently replaces an existing destination on POSIX, which
        # would destroy the file already numbered start_number - 1.
        if os.path.exists(new_filename):
            print(f"Target already exists: {new_filename}. Stopping.")
            break

        print(f"Renaming {old_filename} → {new_filename}")
        os.rename(old_filename, new_filename)
        current += 1

# Example usage: starting at 8.m4a, move each consecutive file in "Voices" down by one number.
rename_files(8, "Voices")

### Missing Numbers in Voice Files

In [3]:
# Filename pattern; group 1 captures the sequence number.
# pattern = re.compile(r'^(\d+)\.m4a$') # 1.m4a, 2.m4a, etc.
pattern = re.compile(r'^Voice (\d+)\.m4a$') # Voice 1.m4a, Voice 2.m4a, etc.

numbers = []

# Collect the number of every file in `directory` whose name fits the pattern.
for filename in os.listdir(directory):
    match = pattern.match(filename)
    if match:
        numbers.append(int(match.group(1)))

if not numbers:
    print("No .m4a files with numeric names found.")
else:
    numbers = sorted(numbers)
    # Set lookup keeps the gap scan linear instead of rescanning the list
    # for every candidate number.
    present = set(numbers)
    missing = [
        num for num in range(numbers[0], numbers[-1] + 1)
        if num not in present
    ]

    if missing:
        print("Missing numbers:", missing)
    else:
        print("No missing numbers. Sequence is complete!")


No missing numbers. Sequence is complete!


### Shift Numbers in File Names

In [None]:
# Groups: fixed prefix, three-digit number, extension.
pattern = re.compile(r'(Voice )(\d{3})(\.m4a)')
# Amount added to every matched file number.
SHIFT_BY = 410

# fullmatch rejects names with trailing text (e.g. "Voice 001.m4a.bak"),
# which an unanchored match would accept and then rename without that suffix.
matches = [m for m in map(pattern.fullmatch, os.listdir(directory)) if m]

# Rename in the direction of the shift (highest number first for a positive
# shift) so a new name never lands on a file that is still waiting to be moved.
matches.sort(key=lambda m: int(m.group(2)), reverse=SHIFT_BY > 0)

for match in matches:
    filename = match.group(0)
    prefix, number_str, suffix = match.groups()
    new_number = int(number_str) + SHIFT_BY
    new_filename = f"{prefix}{new_number:03d}{suffix}"
    old_path = os.path.join(directory, filename)
    new_path = os.path.join(directory, new_filename)

    # os.rename can silently replace an existing file; never overwrite.
    if os.path.exists(new_path):
        print(f"Skipped: {filename} → {new_filename} (target already exists)")
        continue

    os.rename(old_path, new_path)
    print(f"Renamed: {filename} → {new_filename}")

print("✅ Done renaming all matching files!")


### Remove All Empty Lines

In [None]:
def remove_empty_lines(input_file, output_file, encoding='utf-8'):
    """Copy ``input_file`` to ``output_file`` without its blank lines.

    A line is dropped when it is empty or contains only whitespace. The
    input is read completely before the output file is opened.

    Args:
        input_file: Path of the text file to read.
        output_file: Path the filtered text is written to.
        encoding: Text encoding used for both files.

    Returns:
        None. A ``UnicodeDecodeError`` while reading is reported with
        ``print`` rather than raised.
    """
    try:
        with open(input_file, 'r', encoding=encoding) as source:
            kept = [row for row in source if row.strip()]

        with open(output_file, 'w', encoding=encoding) as target:
            for row in kept:
                target.write(row)

        print(f"Empty lines removed and saved to {output_file}")
    except UnicodeDecodeError:
        print(f"Error: Unable to read the file with the {encoding} encoding. Please check the file encoding.")

input_file = 'Labels/Kurdi.txt'
output_file = 'Labels/Kurdi_cleared.txt'

# remove_empty_lines prints its own success or error message, so no second
# confirmation is printed here; it would also appear after a failed read.
remove_empty_lines(input_file, output_file)

### Convert all `m4a` audio tracks to `mp3`

In [None]:
%pip install pydub

In [None]:
import os
from pydub import AudioSegment


# Folder whose .m4a files are converted to .mp3; originals are deleted afterwards.
directory = './Voices'

for filename in os.listdir(directory):
    # Extension check is case-insensitive.
    if not filename.lower().endswith('.m4a'):
        continue

    stem, _ = os.path.splitext(filename)
    mp3_filename = stem + '.mp3'
    m4a_path = os.path.join(directory, filename)
    mp3_path = os.path.join(directory, mp3_filename)

    # Decode the m4a and write it straight back out as mp3.
    AudioSegment.from_file(m4a_path, format='m4a').export(mp3_path, format='mp3')
    print(f'Converted: {filename} → {mp3_filename}')

    # Reached only when the export above did not raise.
    os.remove(m4a_path)
    print(f'Deleted original: {filename}')

print('✅ All m4a files converted and cleaned up!')


### Convert all `ogg` audio tracks to `m4a`

In [None]:
import os
import subprocess

# Folder whose .ogg files are re-encoded to AAC (.m4a) with ffmpeg.
directory = 'Voices'

for filename in os.listdir(directory):
    # Split off the real extension: str.replace would also rewrite ".ogg"
    # occurring in the middle of a name. The comparison is case-insensitive.
    stem, ext = os.path.splitext(filename)
    if ext.lower() != '.ogg':
        continue

    ogg_path = os.path.join(directory, filename)
    m4a_path = os.path.join(directory, stem + '.m4a')

    # -nostdin / -n: never wait for an interactive "overwrite?" answer; if the
    # target already exists ffmpeg fails and the original is kept.
    result = subprocess.run(
        [
            'ffmpeg', '-nostdin', '-n', '-i', ogg_path,
            '-c:a', 'aac', '-b:a', '192k', m4a_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if result.returncode == 0:
        print(f"✅ Converted: {filename} → {os.path.basename(m4a_path)}")
        # Delete the source only after ffmpeg reported success.
        os.remove(ogg_path)
        print(f"🗑️ Deleted original: {filename}")
    else:
        print(f"❌ Failed to convert: {filename}")
        # ffmpeg output is not guaranteed to be valid UTF-8.
        print(result.stderr.decode(errors='replace'))


### Gather all extracted labels to one file

In [2]:
import os

directory_path = './Extracted_Voices'
output_file = f'{directory_path}/all_labels.txt'
# Only segments strictly longer than this many seconds are kept.
MIN_DURATION = 3.0

all_labels = []

total_time = 0

# The output lives in the scanned folder and also ends in ".txt"; it must not
# be read back as input when the cell is re-run.
output_name = os.path.basename(output_file)

# os.listdir returns names in arbitrary order; sorting makes the label order
# reproducible between runs and machines.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.txt') or filename == output_name:
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Expected layout per line: start <TAB> end <TAB> label
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            try:
                start_time = float(parts[0])
                end_time = float(parts[1])
            except ValueError:
                continue  # Skip malformed lines

            duration = end_time - start_time
            if duration > MIN_DURATION:
                total_time += duration
                all_labels.append(parts[2])

with open(output_file, 'w', encoding='utf-8') as out:
    out.writelines(label + '\n' for label in all_labels)

print(f"✅ Extracted {len(all_labels)} labels longer than {MIN_DURATION:g}s into '{output_file}'")
print(f"Total Time: {total_time}")


✅ Extracted 589 labels longer than 3s into './Extracted_Voices/all_labels.txt'
Total Time: 4073.837131999995


### Extract the text from quoted filename–text pairs

Converting text like this:
```txt
"01.wav" "Text of the 01.wav"
"02.wav" "Text of the 02.wav"
...
```

to this:
```txt
Text of the 01.wav
Text of the 02.wav
```

In [1]:
def extract_text_from_file(input_file, output_file=None):
    """Pull the text part out of lines shaped like ``"name" "text"``.

    Each line is stripped of surrounding whitespace and double quotes and
    then divided at the first ``" "`` separator. Lines without that
    separator are ignored.

    Args:
        input_file: Path of the UTF-8 file to read.
        output_file: Optional path; when given (and truthy) the extracted
            texts are written there, one per line.

    Returns:
        The list of extracted texts, in file order.
    """
    separator = '" "'
    clean_lines = []

    with open(input_file, 'r', encoding='utf-8') as src:
        for raw in src:
            _name, found, remainder = raw.strip().strip('"').partition(separator)
            if found:
                # Drop any quote characters still wrapping the text.
                clean_lines.append(remainder.strip('"'))

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as dst:
            dst.writelines(f'{text}\n' for text in clean_lines)

    return clean_lines


# Source file of quoted "name" "text" lines and the destination for the extracted text.
input_path = 'Labels/Tehraani-Text.txt'
output_path = 'Labels/Tehraani-Text-Cleaned.txt'
texts = extract_text_from_file(input_path, output_path)

# Print the first three extracted texts as a quick sanity check.
for t in texts[:3]:
    print(t)


رييس سازمان سنجش آموزش كشور همزمان با نزديك شدن به زمان ثبت‌نام كنكور و آغاز تبليغات موسسات كنكور، به داوطلبان و خانواده‌ها نسبت به سؤء استفاده برخي از اين موسسات با دادن وعده‌هاي دروغين همچون تضمين قبولي، فروش سوالات و ادعاي همكاري با سازمان هشدار داد.
به گزارش سرويس صنفي آموزشي خبرگزاري دانشجويان ايران (ايسنا)، نشست « نقش دانشگاهها و مراکز پژوهشي در آموزش و تحقيقات فناوري اطلاعات» با حضور وزراي «علوم، تحقيقات و فناوري» و «ارتباطات و فنآوري اطلاعات» روز شنبه 23 دي ماه در مجتمع تحقيقاتي عصر انقلاب برگزار مي‌شود.
وي با بيان اين‌كه برعكس آنچه شايع شده، دانشگاه برنامه‌ گسترده‌اي براي ايجاد كرسي‌هاي نظريه‌پردازي در دانشكده‌هاي مختلف با محوريت قطب‌هاي علمي و با حضئر اساتيد تمام دانشگاه در دست اجرا دارد، اظهار داشت: اميدواريم بتوانيم تا پايان سال جاري در برخي رشته‌ها نسبت به تاسيس اين كرسي‌ها اقدام كنيم. 
