In [None]:
import os
import pandas as pd
import numpy as np
import librosa
from google.colab import drive, files
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive under /content/drive so the Drive paths used by the
# later cells resolve. force_remount=True is passed so that re-running this
# cell requests a fresh mount (Colab-specific behaviour — see the Colab docs).
drive.mount("/content/drive", force_remount=True)

In [None]:
# Drive folder that is scanned for .wav recordings by the extraction loop.
trans_folder = "/content/drive/MyDrive/Trans total converted_wav/Trans total converted_wav"

# Previously extracted feature spreadsheet; when it exists it is loaded and
# the newly extracted rows are appended to it.
existing_excel = "/content/drive/MyDrive/PAS_Features(new).xlsx"

In [None]:
def get_pitch(y, sr=16000, fmin=75.0, fmax=400.0):
    """Estimate one representative pitch value (Hz) for an audio signal.

    The signal is passed to ``librosa.yin`` (frame length 2048, hop 256) and
    the median of the resulting per-frame f0 estimates is returned, after
    dropping any NaN entries.

    Parameters
    ----------
    y : np.ndarray
        Audio samples, forwarded unchanged to ``librosa.yin``.
    sr : int, default 16000
        Sampling rate of ``y`` in Hz.
    fmin, fmax : float
        Lower / upper bound of the f0 search range in Hz.

    Returns
    -------
    float
        Median f0, or ``0.0`` when there is no usable estimate or when the
        estimator raises an ordinary exception.
    """
    try:
        f0 = librosa.yin(y, fmin=fmin, fmax=fmax, sr=sr, frame_length=2048, hop_length=256)
        voiced = f0[~np.isnan(f0)]
        return float(np.median(voiced)) if len(voiced) > 0 else 0.0
    except Exception as e:
        # Best-effort fallback is kept (callers expect a float), but only
        # ordinary errors are caught: KeyboardInterrupt / SystemExit must be
        # able to stop a long extraction run. The failure is reported so a
        # 0.0 pitch in the dataset can be traced back to its cause.
        print(f"Pitch estimation failed, using 0.0: {e}")
        return 0.0


In [None]:
def get_mfcc(y, sr=16000, n_mfcc=10, fixed_length=40000, hop_length=4000, desired_len=110):
    """Return a fixed-size, flattened MFCC feature vector for an audio signal.

    The signal is converted to mono, truncated or zero-padded to exactly
    ``fixed_length`` samples, run through ``librosa.feature.mfcc`` and the
    resulting 2-D coefficient matrix is flattened. The flat vector is then
    zero-padded or truncated so that it always has ``desired_len`` entries.

    Parameters
    ----------
    y : np.ndarray
        Audio samples.
    sr : int, default 16000
        Sampling rate of ``y`` in Hz.
    n_mfcc : int, default 10
        Number of MFCC coefficients requested per frame.
    fixed_length : int, default 40000
        Number of samples the signal is cropped / zero-padded to.
    hop_length : int, default 4000
        Hop between MFCC frames, in samples (previously hard-coded).
    desired_len : int, default 110
        Length of the returned vector (previously hard-coded). With the
        defaults this is presumably 10 coefficients x 11 frames — the final
        pad/truncate step enforces the length either way.

    Returns
    -------
    np.ndarray
        1-D array of length ``desired_len``.
    """
    y = librosa.to_mono(y)
    y = y[:fixed_length]
    if len(y) < fixed_length:
        # Zero-pad short clips at the end so every clip has the same duration.
        y = np.pad(y, (0, fixed_length - len(y)), 'constant')
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    mfcc_flat = mfcc.flatten()
    if len(mfcc_flat) < desired_len:
        # Guarantee a constant feature width for the downstream table.
        mfcc_flat = np.pad(mfcc_flat, (0, desired_len - len(mfcc_flat)), 'constant')
    else:
        mfcc_flat = mfcc_flat[:desired_len]
    return mfcc_flat

In [None]:
# Extract one feature row per .wav file found in `trans_folder`.
# Each row is: [file path, pitch, <MFCC values...>, 'trans'].
trans_features = []
counter = 0

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting dataset reproducible across runs.
for audio_file in sorted(os.listdir(trans_folder)):
    # Case-insensitive extension check so files named *.WAV are not silently skipped.
    if not audio_file.lower().endswith('.wav'):
        continue
    file_path = os.path.join(trans_folder, audio_file)
    try:
        y, sr = librosa.load(file_path, sr=16000, mono=True)
        pitch = get_pitch(y, sr)
        mfcc_features = get_mfcc(y, sr)
        trans_features.append([file_path, pitch] + mfcc_features.tolist() + ['trans'])
        counter += 1
        print(f"Processed {counter} trans files")
    except Exception as e:
        # A single unreadable file should not abort the whole run; report it and move on.
        print(f"Error processing {file_path}: {e}")

print(f"\n Total trans features extracted: {len(trans_features)}")

In [None]:
# Column layout of one feature row: file path, pitch, 110 MFCC values, label.
# Defined once and used for both the fallback empty frame and the new rows.
columns = ['audio_file', 'pitch'] + [f'mfcc{i}' for i in range(1, 111)] + ['gender']

if os.path.exists(existing_excel):
    existing_df = pd.read_excel(existing_excel)
    print(f"Loaded existing dataset with {len(existing_df)} samples.")
    # pd.concat aligns frames by column name, so any naming difference between
    # the spreadsheet and `columns` would silently produce NaN-filled cells.
    # Surface that instead of letting it pass unnoticed.
    missing = [c for c in columns if c not in existing_df.columns]
    unexpected = [c for c in existing_df.columns if c not in columns]
    if missing or unexpected:
        print(
            "Warning: existing dataset columns differ from the expected layout "
            f"(missing: {missing}, unexpected: {unexpected}); "
            "combined rows will contain NaN in the non-shared columns."
        )
else:
    print("Existing Excel file not found. Creating new one.")
    existing_df = pd.DataFrame(columns=columns)

trans_df = pd.DataFrame(trans_features, columns=columns)

# Append the new rows after the existing ones and renumber the index.
combined_df = pd.concat([existing_df, trans_df], ignore_index=True)
print(f"\n Final dataset size: {len(combined_df)} samples")

In [None]:
# Write the combined table to a new spreadsheet on Drive. The target differs
# from `existing_excel`, so the source dataset is not overwritten.
# index=False keeps the DataFrame index out of the sheet.
save_path = '/content/drive/MyDrive/extracted_features_withtrans(new).xlsx'
combined_df.to_excel(save_path, index=False)
print(f"\n Saved combined features to Drive at: {save_path}")
