In [11]:
import librosa
import tqdm

In [12]:
# --- Analysis / rendering configuration -------------------------------------

FPS = 30  # video frames per second (audio samples per frame is derived as fs / FPS)
FFT_WINDOW_SECONDS = 0.25  # duration, in seconds, of the audio slice fed to each FFT

# Frequency range shown on the spectrum plot's x axis
FREQ_MIN, FREQ_MAX = 10, 1000

# How many notes to display
TOP_NOTES = 3

# Pitch-class names, indexed by (note number % 12), starting at C
NOTE_NAMES = "C C# D D# E F F# G G# A A# B".split()

# Output size. Prefer changing SCALE for higher resolution; only touch RESOLUTION
# when a non-standard aspect ratio is needed.
RESOLUTION = (1920, 1080)
SCALE = 2  # 0.5=QHD(960x540), 1=HD(1920x1080), 2=4K(3840x2160)

In [13]:
# !ffmpeg -i muthai_tharu_slow.wav -filter:a "atempo=0.5" muthai_tharu_slow2.wav


In [14]:
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile # get the api
import os


AUDIO_FILE = "muthai_tharu_slow2.wav"

# wavfile.read returns (sample_rate, samples).
fs, data = wavfile.read(AUDIO_FILE)

# A mono file is returned as a 1-D array, a multi-channel file as
# (n_samples, n_channels). Indexing `data.T[0]` on a mono file would yield a
# single scalar sample, so only select a channel when there is a channel axis.
audio = data if data.ndim == 1 else data[:, 0]

FRAME_STEP = (fs / FPS) # audio samples per video frame
FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS) # FFT window length in samples
AUDIO_LENGTH = len(audio)/fs # track duration in seconds

In [15]:
import numpy as np


# See https://newt.phys.unsw.edu.au/jw/notes.html
def freq_to_number(f):
  """Convert a frequency in Hz to a (possibly fractional) note number, with 440 Hz mapping to 69."""
  octaves_from_reference = np.log2(f/440.0)
  return 69 + 12*octaves_from_reference
def number_to_freq(n):
  """Convert a note number to its frequency in Hz (inverse of freq_to_number; 69 maps to 440 Hz)."""
  semitones_from_reference = n - 69
  return 440 * 2.0**(semitones_from_reference/12.0)
def note_name(n):
  """Return the name of integer note number `n` as pitch class plus octave, e.g. 69 -> "A4".

  The octave is computed with floor division so that note numbers below 12
  (including negative ones) land in the correct octave; truncating
  `n/12 - 1` toward zero would label 11 as "B0" instead of "B-1".
  """
  octave_index, pitch_class = divmod(n, 12)
  return NOTE_NAMES[pitch_class] + str(int(octave_index) - 1)

# Hann taper applied to every FFT window: a raised cosine sampled at
# FFT_WINDOW_SIZE points over [0, 2*pi) with the endpoint excluded.
window = 0.5 * (1 - np.cos(np.linspace(0, 2*np.pi, FFT_WINDOW_SIZE, endpoint=False)))

# Frequency (Hz) of each rfft bin for a window of FFT_WINDOW_SIZE samples at rate fs.
xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, d=1/fs)

# Whole number of video frames covering the track, and the integer number of
# audio samples to advance between consecutive frames.
FRAME_COUNT = int(AUDIO_LENGTH*FPS)
FRAME_OFFSET = int(len(audio)/FRAME_COUNT)




In [16]:
import plotly.graph_objects as go

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
  """Build a plotly figure of one spectrum frame with note labels.

  Parameters:
      p: y values of the spectrum trace (the y axis is fixed to [0, 1]).
      xf: x values of the spectrum trace (the x axis is fixed to
          [FREQ_MIN, FREQ_MAX]).
      fs: accepted for interface compatibility; not used by this function.
      notes: iterable of indexable items where item[0] is the x position,
          item[1] the label text and item[2] the y position.
      dimensions: (width, height) of the figure in pixels.

  Returns:
      The constructed plotly Figure.
  """
  width, height = dimensions[0], dimensions[1]

  fig = go.Figure(
      layout=go.Layout(
          title="frequency spectrum",
          autosize=False,
          width=width,
          height=height,
          xaxis_title="Frequency (note)",
          yaxis_title="Magnitude",
          font=dict(size=24),
      ),
      layout_xaxis_range=[FREQ_MIN, FREQ_MAX],
      layout_yaxis_range=[0, 1],
  )

  fig.add_trace(go.Scatter(x=xf, y=p))

  for note in notes:
    position, label, height_on_plot = note[0], note[1], note[2]
    # Shift the label 10 x-units to the right of its position.
    fig.add_annotation(
        x=position+10,
        y=height_on_plot,
        text=label,
        font=dict(size=48),
        showarrow=False,
    )
  return fig

def extract_sample(audio, frame_number):
  """Return the FFT_WINDOW_SIZE samples of `audio` that end at the given frame.

  The window ends at sample `frame_number * FRAME_OFFSET`. When the window
  would start before the beginning of the track, the missing leading samples
  are filled with zeros.
  """
  stop = frame_number * FRAME_OFFSET
  start = int(stop - FFT_WINDOW_SIZE)

  # Common case: the window lies at or after sample 0, so a plain slice suffices.
  if stop != 0 and start >= 0:
    return audio[start:stop]

  # Otherwise the window reaches back before the first sample.
  leading_silence = np.zeros((np.abs(start)), dtype=float)
  if stop == 0:
    # Very first frame: no audio has been consumed yet, so it is all zeros.
    return leading_silence

  # Some audio is available: zero-pad in front of what we have.
  return np.concatenate([leading_silence, audio[0:stop]])

def find_top_notes(fft,num):
  """Return up to `num` distinct notes with the strongest magnitude in a spectrum.

  Parameters:
      fft: per-bin magnitudes, aligned index-for-index with the module-level
          `xf` bin frequencies.
      num: maximum number of notes to return.

  Returns:
      A list of [frequency, note_name, magnitude] entries ordered from the
      strongest bin down. A note name appears at most once (its strongest bin
      wins). An empty list is returned when the spectrum is empty or its
      maximum is below 0.001.
  """
  magnitudes = np.asarray(fft).real
  if magnitudes.size == 0 or np.max(magnitudes) < 0.001:
    return []

  # Bin indices from strongest to weakest; a stable sort keeps equal
  # magnitudes in ascending bin order.
  strongest_first = np.argsort(-magnitudes, kind="stable")

  found = []
  seen_names = set()
  for bin_index in strongest_first:
    if len(found) >= num:
      break

    f = xf[bin_index]
    if f <= 0:
      # The DC bin has no pitch: log2(0) is -inf and cannot be rounded to a
      # note number, so skip it instead of crashing.
      continue

    name = note_name(int(round(freq_to_number(f))))
    if name in seen_names:
      continue

    seen_names.add(name)
    found.append([f, name, magnitudes[bin_index]])

  return found

In [17]:
# Pass 1: scan every frame and record the largest spectral magnitude seen,
# so later passes can scale spectra by it.
mx = 0
for frame_number in range(FRAME_COUNT):
  sample = extract_sample(audio, frame_number)

  # Magnitude spectrum of the windowed frame.
  fft = np.abs(np.fft.rfft(sample * window)).real
  mx = max(np.max(fft),mx)

print(f"Max amplitude: {mx}")



Max amplitude: 54067799.11678628


In [18]:
# Reference frequency of Sa (the tonic) in each of the three octaves
LOW_SA_FREQ = 164.81  # Sa in Mandra octave (E3 in Hz)
MIDDLE_SA_FREQ = 329.63  # Sa in Madhya octave (E4 in Hz)
HIGH_SA_FREQ = 659.25  # Sa in Taara octave (E5 in Hz)

# Frequency ratio of each swara relative to Sa (ratios specific to the
# Shanmukhapriya scale).
# NOTE(review): 5/4 and 4/3 are the just-intonation major third and perfect
# fourth; confirm these are the intended ratios for the swaras labelled
# 'Ga1' and 'Ma2'.
RATIOS = dict(
    Sa=1,
    Ri2=9/8,
    Ga1=5/4,
    Ma2=4/3,
    Pa=3/2,
    Da1=8/5,
    Ni1=9/5,
    Sa_high=2,  # Sa in the higher octave
)

# Scale every ratio in RATIOS by a given Sa frequency
def calculate_swara_frequencies(base_freq):
    """Return a dict mapping each swara name in RATIOS to `base_freq * ratio`."""
    frequencies = {}
    for swara, ratio in RATIOS.items():
        frequencies[swara] = base_freq * ratio
    return frequencies

# Swara frequency table for each octave, derived from that octave's Sa
mandra_frequencies, madhya_frequencies, taara_frequencies = (
    calculate_swara_frequencies(sa_freq)
    for sa_freq in (LOW_SA_FREQ, MIDDLE_SA_FREQ, HIGH_SA_FREQ)
)

# Octave name -> {swara name -> frequency in Hz}
all_frequencies = dict(
    Mandra=mandra_frequencies,
    Madhya=madhya_frequencies,
    Taara=taara_frequencies,
)

# Show the combined table as the cell output
all_frequencies


{'Mandra': {'Sa': 164.81,
  'Ri2': 185.41125,
  'Ga1': 206.0125,
  'Ma2': 219.74666666666667,
  'Pa': 247.215,
  'Da1': 263.696,
  'Ni1': 296.658,
  'Sa_high': 329.62},
 'Madhya': {'Sa': 329.63,
  'Ri2': 370.83375,
  'Ga1': 412.0375,
  'Ma2': 439.50666666666666,
  'Pa': 494.445,
  'Da1': 527.408,
  'Ni1': 593.3340000000001,
  'Sa_high': 659.26},
 'Taara': {'Sa': 659.25,
  'Ri2': 741.65625,
  'Ga1': 824.0625,
  'Ma2': 879.0,
  'Pa': 988.875,
  'Da1': 1054.8,
  'Ni1': 1186.65,
  'Sa_high': 1318.5}}

In [19]:
def freq_to_swara(frequency, tolerance=30):
    """
    Map a frequency to the nearest Carnatic swara across all octaves.

    The nearest entry of the module-level `all_frequencies` table is chosen by
    absolute distance in Hz; when two entries are equally close, the one that
    comes first in the table wins.

    Parameters:
        frequency (float): The frequency in Hz to map.
        tolerance (float, optional): Largest allowed distance in Hz between
            `frequency` and the matched swara. Defaults to 30.

    Returns:
        str: The swara formatted as "<swara> (<octave>)", or None if the
        frequency is not positive, the table is empty, or the nearest swara
        is further away than `tolerance`.
    """
    # Zero (no pitch) and negative values cannot correspond to a swara.
    if frequency <= 0:
        return None

    # (distance in Hz, label) for every swara in every octave, in table order.
    candidates = [
        (abs(frequency - freq), f"{swara} ({octave})")
        for octave, swaras in all_frequencies.items()
        for swara, freq in swaras.items()
    ]
    if not candidates:
        return None

    # min() returns the first of several equally small entries.
    min_diff, closest_swara = min(candidates, key=lambda candidate: candidate[0])
    return closest_swara if min_diff <= tolerance else None

def detect_swaras(audio_path, output_txt_path, sr=None, hop_length=512, n_fft=2048):
    """
    Detect swaras from an audio file and write them to a text file.

    For every analysis frame, the pitch candidate with the largest magnitude is
    mapped to a swara with `freq_to_swara`. Frames with no positive pitch or no
    matching swara are written as '---'. The output file contains one token per
    frame, each followed by a single space.

    Parameters:
        audio_path (str): Path to the input audio file.
        output_txt_path (str): Path to the output text file.
        sr (int, optional): Sampling rate. If None, uses the file's original rate.
        hop_length (int, optional): Number of samples between successive frames.
        n_fft (int, optional): Length of the FFT window.
    """
    # Load audio file (mixed down to mono)
    print("Loading audio file...")
    audio, sr = librosa.load(audio_path, sr=sr, mono=True)
    print(f"Audio loaded. Duration: {librosa.get_duration(y=audio, sr=sr):.2f} seconds.")

    # Peak-normalize. A silent (all-zero) or empty signal has no peak to divide
    # by, so it is left untouched rather than turned into NaNs.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    # Perform pitch tracking
    print("Performing pitch detection...")
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr, hop_length=hop_length, n_fft=n_fft)

    # Row index of the strongest candidate in each frame (column), computed once.
    strongest_bins = magnitudes.argmax(axis=0)

    # Map each frame's strongest pitch to a swara
    detected_swaras = []
    print("Mapping frequencies to swaras...")
    for i in tqdm.tqdm(range(pitches.shape[1])):
        pitch = pitches[strongest_bins[i], i]
        swara = freq_to_swara(pitch) if pitch > 0 else None
        # '---' marks both "no pitch detected" and "no swara within tolerance".
        detected_swaras.append(swara if swara else '---')

    # Write swaras to text file, each token followed by a space
    print(f"Writing detected swaras to {output_txt_path}...")
    with open(output_txt_path, 'w') as f:
        f.write(''.join(swara + ' ' for swara in detected_swaras))
    print("Swara detection and writing completed.")


In [41]:

if __name__ == "__main__":
    # Input recording and destination text file (edit these for a different track)
    AUDIO_FILE = "muthai_tharu_slow2.wav"  # Replace with your audio file path
    OUTPUT_TXT = "music_notes_swaras.txt"  # Output text file path

    # Run the swara detection and write the result to OUTPUT_TXT
    detect_swaras(audio_path=AUDIO_FILE, output_txt_path=OUTPUT_TXT)

Loading audio file...
Audio loaded. Duration: 524.32 seconds.
Performing pitch detection...
Mapping frequencies to swaras...


100%|██████████| 49155/49155 [00:01<00:00, 33542.69it/s]

Writing detected swaras to music_notes_swaras.txt...
Swara detection and writing completed.





In [21]:
# import tqdm
# import numpy as np
# import matplotlib.pyplot as plt  # Using matplotlib for efficiency
# import os

# # Ensure the output directory exists
# output_dir = "C:\\DEV\\SangeethamAI\\content"
# os.makedirs(output_dir, exist_ok=True)
# with open('music_notes.txt', 'w+') as musicfile:

#     # Pass 2, produce the animation
#     for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
#         #print(f"Processing frame {frame_number + 1}/{FRAME_COUNT}")  # Debug print

#         # Extract the sample
#         sample = extract_sample(audio, frame_number)

#         # Compute FFT
#         fft = np.fft.rfft(sample * window)
#         fft = np.abs(fft) / mx 

#         # Find top notes
#         s = find_top_notes(fft, 4)

#         # Plot FFT using matplotlib for faster rendering
#         plt.figure()
#         plt.plot(xf, fft.real)  # Assuming `xf` and `fft` dimensions match
#         plt.title(f"FFT Frame {frame_number}")
#         plt.xlabel('Frequency (Hz)')
#         plt.ylabel('Amplitude')
#         plt.ylim(0,1)
#         plt.xlim(0,1000)

#         # Annotate the top notes on the FFT plot
#         for note in s:
#             frequency, note_label, magnitude = note
#             #note_label = western_to_carnatic[note_label[:-1]]
#             plt.annotate(note_label, 
#                         xy=(frequency, magnitude), 
#                         xytext=(frequency + 10, magnitude + 0.05), 
#                         fontsize=9, color='red', 
#                         arrowprops=dict(arrowstyle="->", color='red'))
#             musicfile.write(note_label + " ")
#         # # Save the frame
#         frame_path = os.path.join(output_dir, f"frame{frame_number}.png")
#         plt.savefig(frame_path, dpi=100)  # Lower DPI for faster saving
#         plt.close()  # Close the figure to release memory

In [22]:
# !ffmpeg -y -r {FPS} -f image2 -s 1920x1080 -i content/frame%d.png -i {AUDIO_FILE} -c:v libx264 -c:a aac -pix_fmt yuv420p -shortest movie.mp4
