In [None]:
import os
import subprocess
import librosa
import numpy as np
import pandas as pd
from scipy.stats import variation
from sklearn.preprocessing import StandardScaler
import concurrent.futures

# Location of the talk metadata workbook and the sheet to read from it
excel_file = '/home/rudra/Documents/GitHub/Articulation-Meter/Project Files/ted _data.xlsx'
sheet_name = 'Sheet1'

# Read the sheet, discard its two trailing columns, and keep at most the
# first 200 rows.
# NOTE(review): the previous comment named three columns (duration, likes,
# views) for this two-column drop, and 'likes' / 'views' are read later in the
# notebook - confirm which columns are actually removed here.
df = (
    pd.read_excel(excel_file, sheet_name=sheet_name)
    .iloc[:, :-2]
    .head(200)
)

# Convert an abbreviated like count (e.g. '1.2K', '3M') to a plain number
def convert_likes(like):
  """Convert an abbreviated count string to a float.

  Surrounding whitespace and thousands separators (``,``) are ignored, and the
  magnitude suffix is matched case-insensitively, so ``'1.2K'``, ``'1.2k'`` and
  ``' 1,200 '`` all yield ``1200.0``.

  Args:
    like: The raw cell value. Strings are parsed; any other type (numbers,
      NaN, None) is returned unchanged.

  Returns:
    A float for string input, otherwise ``like`` itself.

  Raises:
    ValueError: If a string is not a valid number once the suffix is removed
      (including the empty string).
  """
  if not isinstance(like, str):
    return like

  text = like.strip().replace(',', '')
  multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

  # text[-1:] (rather than text[-1]) is safe on an empty string.
  suffix = text[-1:].upper()
  if suffix in multipliers:
    return float(text[:-1]) * multipliers[suffix]
  return float(text)

# Replace the 'likes' column with the result of running convert_likes on every
# value. Assumes df still has a 'likes' column after the two-column drop
# earlier in this cell - TODO confirm.
df['likes'] = df['likes'].apply(convert_likes)

# Summarise one audio file as a handful of scalar descriptors
def extract_audio_features(audio_path):
  """Load an audio file and return a dict of scalar audio features.

  The file is decoded with ``librosa.load`` (default arguments). Each spectral
  feature array and the zero-crossing-rate array returned by librosa is
  reduced to a single number with ``np.mean``; ``'pitch'`` is reduced with
  ``np.max`` instead.

  Args:
    audio_path: Path passed straight to ``librosa.load``.

  Returns:
    dict with the keys ``'spectral_centroid'``, ``'spectral_bandwidth'``,
    ``'rolloff'``, ``'zero_crossing_rate'`` and ``'pitch'``, in that order.
  """
  signal, sample_rate = librosa.load(audio_path)

  # Earlier experiments (chroma_stft, rmse, energy, mfccs, speech-rate
  # variation, articulation rate, frequency) are intentionally not part of
  # the returned feature set.
  return {
      'spectral_centroid': np.mean(
          librosa.feature.spectral_centroid(y=signal, sr=sample_rate)),
      'spectral_bandwidth': np.mean(
          librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate)),
      'rolloff': np.mean(
          librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)),
      'zero_crossing_rate': np.mean(
          librosa.feature.zero_crossing_rate(signal)),
      # NOTE(review): this is the single largest value anywhere in the first
      # array returned by piptrack, not an average like the other features -
      # confirm that is the intended meaning of "pitch".
      'pitch': np.max(librosa.piptrack(y=signal, sr=sample_rate)[0]),
  }
# Directory where the downloaded audio files are stored (absolute path on the
# author's machine).
audio_dir = '/home/rudra/Documents/GitHub/Articulation-Meter/Project Files/audio/audio_analysis/'

# Create the directory; exist_ok=True means no error is raised when it is
# already there.
os.makedirs(audio_dir, exist_ok=True)

In [None]:
# df is expected to hold the YouTube video IDs in a 'youtube_video_code' column.
# Example:
# df = pd.DataFrame({'youtube_video_code': ['dQw4w9WgXcQ', '3JZ_D3ELwOQ', ...]})

# Directory to save audio files. This re-assigns audio_dir: the value is the
# same path as the assignment in the first cell, minus the trailing slash.
audio_dir = '/home/rudra/Documents/GitHub/Articulation-Meter/Project Files/audio/audio_analysis'

# Download and convert the audio track of a single video
def process_video(video_code):
    """Download one YouTube video's audio with yt-dlp and convert it to WAV.

    The result is written to ``<audio_dir>/<video_code>.wav``. The function is
    run from worker threads, so failures are reported by printing a message
    and returning ``False`` instead of raising: an exception raised inside a
    worker would otherwise be easy to lose.

    Args:
        video_code: YouTube video ID; it is interpolated into the watch URL
            and into the output file name.

    Returns:
        bool: ``True`` when the WAV file is available (already on disk or
        downloaded just now), ``False`` when yt-dlp failed or could not be
        started.
    """
    url = f"https://www.youtube.com/watch?v={video_code}"
    output_template = os.path.join(audio_dir, f"{video_code}.%(ext)s")
    wav_path = os.path.join(audio_dir, f"{video_code}.wav")

    # Re-running the notebook should not download everything again.
    if os.path.exists(wav_path):
        print(f"Already downloaded, skipping: {url}")
        return True

    # Define the command for downloading and converting audio
    command = [
        "yt-dlp",
        "-f", "bestaudio",
        "--extract-audio",
        "--audio-format=wav",
        "-o", output_template,
        url
    ]

    try:
        # check=True turns a non-zero yt-dlp exit status into CalledProcessError
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to download {url}: {e}")
        return False
    except OSError as e:
        # Raised when the yt-dlp executable is missing or cannot be started.
        print(f"Failed to download {url}: could not run yt-dlp ({e})")
        return False

    print(f"Successfully downloaded and converted: {url}")
    return True

# Process the first 200 video links concurrently
if __name__ == '__main__':
    # Rows without a video code would only produce a bogus "v=nan" download.
    video_codes = df['youtube_video_code'][:200].dropna()

    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        # submit() + as_completed() is used instead of executor.map() because
        # map() only re-raises a worker's exception when its result iterator
        # is consumed; the iterator was previously discarded, so worker errors
        # disappeared without a trace.
        futures = {
            executor.submit(process_video, code): code for code in video_codes
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                # Report and carry on so one bad video does not hide the rest.
                print(f"Unexpected error while processing {futures[future]}: {exc}")


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Parallel accumulators: position i in every list describes the same df row
pitches, spectral_centroids, spectral_bandwidths = [], [], []
rolloffs, zero_crossing_rates = [], []
likes_list, views_list = [], []

# One (video code, likes, views, expected .wav path) tuple per row of df
audio_files_info = [
    (
        row['youtube_video_code'],
        row['likes'],
        row['views'],
        os.path.join(audio_dir, f"{row['youtube_video_code']}.wav"),
    )
    for _, row in df.iterrows()
]

for video_code, like, view, audio_path in audio_files_info:
    if not os.path.exists(audio_path):
        # No audio on disk for this row: every list, likes and views included,
        # receives a None placeholder so all lists stay the same length.
        for accumulator in (pitches, spectral_centroids, spectral_bandwidths,
                            rolloffs, zero_crossing_rates, likes_list, views_list):
            accumulator.append(None)
        continue

    # Audio is present: compute its features and record them with the targets
    features = extract_audio_features(audio_path)

    pitches.append(features.get('pitch', None))  # None if 'pitch' is absent
    spectral_centroids.append(features['spectral_centroid'])
    spectral_bandwidths.append(features['spectral_bandwidth'])
    rolloffs.append(features['rolloff'])
    zero_crossing_rates.append(features['zero_crossing_rate'])
    likes_list.append(like)
    views_list.append(view)


In [None]:
# Columns holding audio features (these are the ones that get rescaled)
feature_columns = [
    'Pitch',
    'Spectral Centroid',
    'Spectral Bandwidth',
    'Rolloff',
    'Zero Crossing Rate',
]

# Assemble the per-row lists collected above into a single table
audio_df = pd.DataFrame({
    'Likes': likes_list,
    'Views': views_list,
    'Pitch': pitches,
    'Spectral Centroid': spectral_centroids,
    'Spectral Bandwidth': spectral_bandwidths,
    'Rolloff': rolloffs,
    'Zero Crossing Rate': zero_crossing_rates,
})

# Rescale the feature columns with scikit-learn's MinMaxScaler (default
# settings); 'Likes' and 'Views' are left as they are.
scaler_minmax = MinMaxScaler()
scaled_features = scaler_minmax.fit_transform(audio_df[feature_columns])
audio_df[feature_columns] = scaled_features

# Save the table (without the index) to an Excel file in the working directory
audio_df.to_excel('merged_excel_file.xlsx', index=False)

# Show the first rows as a quick sanity check
print(audio_df.head())

: 