In [3]:
import librosa
import librosa.display
import numpy as np

In [4]:
# Load audio file
audio_path = 'output.wav'
y, sr = librosa.load(audio_path)

# Extract MFCCs
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

In [12]:
import librosa
import numpy as np
from scipy.stats import kurtosis

def detect_violence_with_timestamps(audio_file, pitch_threshold=300, tone_kurtosis_threshold=5):
    """
    Detects early signs of violence based on pitch and tone analysis and provides timestamps.

    Parameters:
        audio_file (str): Path to the WAV file.
        pitch_threshold (int): Threshold for detecting high-pitched aggression (default: 300 Hz).
        tone_kurtosis_threshold (float): Threshold for tone roughness (default: 5).

    Returns:
        dict: Detection results with timestamps, confidence score, and description.
    """
    # Load audio file
    y, sr = librosa.load(audio_file, sr=None)
    
    # Frame length and hop length for timestamp calculation
    frame_length = 2048
    hop_length = 512
    
    # Extract pitch using Librosa's autocorrelation method
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
    
    # Extract MFCCs for tone analysis
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    
    # Initialize variables for tracking timestamps
    timestamps = []
    
    # Analyze each frame for pitch and tone thresholds
    for i in range(pitches.shape[1]):
        avg_pitch = np.mean(pitches[:, i][pitches[:, i] > 0])  # Average pitch ignoring zero values
        tone_kurtosis = kurtosis(mfccs[:, i])  # Kurtosis of MFCCs for roughness
        
        # Check if thresholds are exceeded
        if avg_pitch > pitch_threshold or tone_kurtosis > tone_kurtosis_threshold:
            timestamp = librosa.frames_to_time(i, sr=sr, hop_length=hop_length)  # Convert frame index to time
            confidence_score = round((avg_pitch / pitch_threshold + tone_kurtosis / tone_kurtosis_threshold) / 2, 2)
            description = f"Detected high pitch ({avg_pitch:.2f} Hz) or rough tone (kurtosis={tone_kurtosis:.2f})."
            timestamps.append({
                "timestamp": timestamp,
                "confidence_score": confidence_score,
                "description": description
            })
    
    # Return results
    if timestamps:
        return {
            "timestamps": timestamps,
            "overall_score": round(np.mean([t["confidence_score"] for t in timestamps]), 2),
            "description": "Signs of violence detected at specific timestamps."
        }
    else:
        return {
            "timestamps": [],
            "overall_score": 0.0,
            "description": "No signs of violence detected."
        }

# Example usage
audio_file_path = "output.wav"  # Replace with your WAV file path
result = detect_violence_with_timestamps(audio_file_path)
print(result)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'timestamps': [{'timestamp': 0.0, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.032, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.064, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.096, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.128, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.16, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.192, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtosis=8.08).'}, {'timestamp': 0.224, 'confidence_score': nan, 'description': 'Detected high pitch (nan Hz) or rough tone (kurtos

In [6]:
mfccs

array([[-534.3328, -534.3328, -534.3328, ..., -534.3328, -534.3328,
        -534.3328],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       ...,
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ]], dtype=float32)

In [7]:
mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)

In [8]:
mel_spec

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)