<a href="https://colab.research.google.com/github/KavyaD02/Continuous_Speech_Stress_Detection/blob/main/Stress_Level_Diagnosis_SVM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
from pydub.silence import split_on_silence
from pydub import AudioSegment, effects 
from scipy.io.wavfile import read, write



def preprocess_audio(audio_file_name):
    """Load an audio file and clean it up for feature extraction.

    The signal is loaded with ``librosa.load`` (library defaults), passed
    through a pre-emphasis filter, trimmed of leading/trailing audio that is
    more than 20 dB below the peak, and finally normalized.

    Args:
        audio_file_name: Path of the audio file to load.

    Returns:
        A ``(sample_rate, samples)`` tuple. Note the order: the sample rate
        comes first, which is the reverse of what ``librosa.load`` returns.
    """
    samples, sample_rate = librosa.load(audio_file_name)

    emphasized = librosa.effects.preemphasis(samples)
    # trim() returns (trimmed_signal, interval); only the signal is needed.
    trimmed, _ = librosa.effects.trim(emphasized, top_db=20)
    normalized = librosa.util.normalize(trimmed)

    return sample_rate, normalized

In [None]:
# Default directory that is listed when no explicit path is given.
DATA_PATH = "./Data/"

def get_file_names(path = DATA_PATH):
    """Return the names of every entry found directly inside ``path``.

    Args:
        path: Directory to list; defaults to ``DATA_PATH``.

    Returns:
        list[str]: Entry names (not full paths), in the order ``os.listdir``
        reports them.
    """
    return os.listdir(path)

def process_audio(audio_file_name, input_dir="./Data/", output_dir="./Processed Data/"):
    """Remove long silences from one WAV recording and save the result.

    The file is read from ``input_dir``, split wherever at least 1000 ms of
    audio stays below -50 dBFS, and the remaining chunks (each keeping 500 ms
    of padding) are concatenated and written to ``output_dir`` as
    ``<stem>_PROCESSED.wav`` at the original sample rate.

    Args:
        audio_file_name: File name (not a path) of a WAV file in ``input_dir``.
        input_dir: Directory that contains the raw recording.
        output_dir: Directory that receives the processed file. It is created
            if it does not exist yet.

    Returns:
        None. The processed audio is written to disk. If the recording holds
        no non-silent audio, nothing is written and a message is printed.
    """
    rate, audio = read(os.path.join(input_dir, audio_file_name))

    # scipy returns shape (n_samples,) for mono and (n_samples, n_channels)
    # for multi-channel files. tobytes() on the latter yields interleaved
    # frames, so the channel count must be passed on instead of assuming mono.
    # NOTE(review): assumes integer PCM samples; float WAVs would be
    # reinterpreted as integers by pydub -- confirm the dataset is PCM.
    channels = 1 if audio.ndim == 1 else audio.shape[1]

    aud = AudioSegment(audio.tobytes(), frame_rate = rate,
                       sample_width = audio.dtype.itemsize, channels = channels)

    audio_chunks = split_on_silence(
        aud,
        min_silence_len = 1000,
        silence_thresh = -50,
        keep_silence = 500,)

    # An all-silent recording yields no chunks; sum([]) would then be the int
    # 0 and the sample extraction below would fail with AttributeError.
    if not audio_chunks:
        print(f"Skipping {audio_file_name}: no non-silent audio found")
        return

    # audio chunks are combined here
    audio_processed = sum(audio_chunks)
    samples = np.array(audio_processed.get_array_of_samples())
    if channels > 1:
        # Restore the (n_samples, n_channels) layout expected by wavfile.write.
        samples = samples.reshape(-1, channels)

    # splitext only strips the final extension, so dotted names stay intact.
    audio_file = os.path.splitext(audio_file_name)[0]
    os.makedirs(output_dir, exist_ok=True)
    write(os.path.join(output_dir, f"{audio_file}_PROCESSED.wav"), rate, samples)

file_names = get_file_names()
for file_name in file_names:
    # process_audio reads its input with scipy's WAV reader, so anything that
    # is not a .wav file (e.g. .ipynb_checkpoints, .DS_Store) would abort the
    # whole run; skip such entries instead.
    if not file_name.lower().endswith(".wav"):
        continue
    process_audio(file_name)
print("Done")

In [None]:
def get_label_dict(df, file_list, test=False):
    """Map each audio file to a severity class derived from its PHQ score.

    The participant ID is the leading ``<id>_`` part of each file's base name.
    It is looked up in ``df['Participant_ID']`` and the matching raw score is
    binned as follows:

        0-5   -> 0
        6-14  -> 1
        15-21 -> 2
        22-27 -> 3

    Args:
        df: DataFrame with a ``Participant_ID`` column and a score column.
        file_list: Iterable of file names/paths shaped like ``.../<id>_*``.
        test: When True the score is read from the ``PHQ_Score`` column,
            otherwise from ``PHQ8_Score``.

    Returns:
        dict: ``{file: class}`` with one entry per element of ``file_list``.

    Raises:
        ValueError: If a participant ID is missing from ``df`` or a score lies
            outside 0-27 (previously such a score silently reused the label
            of the preceding file).
    """
    score_column = 'PHQ_Score' if test else 'PHQ8_Score'

    # Build the ID -> score lookup once. Zipping the two columns pairs values
    # by position, so the result does not depend on the DataFrame's index
    # labels. setdefault keeps the first row of a repeated ID, which matches
    # the first-match behaviour of list.index().
    score_by_patient = {}
    for patient_id, raw_score in zip(df['Participant_ID'], df[score_column]):
        score_by_patient.setdefault(patient_id, raw_score)

    label_dict = {}
    for file in file_list:
        patient_num = int(file.split("/")[-1].split("_")[0])

        if patient_num not in score_by_patient:
            raise ValueError(f"{patient_num} is not in df['Participant_ID']")

        phq8_score = int(score_by_patient[patient_num])

        if 0 <= phq8_score < 6:
            score = 0
        elif 6 <= phq8_score < 15:
            score = 1
        elif 15 <= phq8_score < 22:
            score = 2
        elif 22 <= phq8_score < 28:
            score = 3
        else:
            raise ValueError(
                f"Score {phq8_score} for participant {patient_num} is outside the expected 0-27 range"
            )

        label_dict[f"{file}"] = score

    return label_dict

In [None]:
def get_set(df):
    """Return the unique processed-audio paths for the participants in ``df``.

    Args:
        df: DataFrame with a ``Participant_ID`` column.

    Returns:
        list[str]: One ``./Processed Data/<id>_AUDIO_PROCESSED.wav`` path per
        distinct participant, in order of first appearance in ``df``.
    """
    df_files = [f"./Processed Data/{x}_AUDIO_PROCESSED.wav" for x in df['Participant_ID']]

    # dict.fromkeys drops duplicates while keeping first-seen order. The
    # iteration order of a set of strings varies between interpreter sessions,
    # which made every list derived from this one non-reproducible.
    return list(dict.fromkeys(df_files))

In [None]:
# Load the three split definitions, one CSV per split.
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")
val_df = pd.read_csv("./val.csv")

# Unique processed-audio paths for the participants of each split.
train = get_set(train_df)
test = get_set(test_df)
val = get_set(val_df)

In [None]:
# Per-split {file path: class} mappings; the test CSV is read with test=True,
# which makes get_label_dict use its 'PHQ_Score' column.
train_dict = get_label_dict(train_df, train)
test_dict = get_label_dict(test_df, test, True)
val_dict = get_label_dict(val_df, val)

# Merge in train -> test -> val order; on a duplicate path the later split wins.
data_dict = {}
for split_labels in (train_dict, test_dict, val_dict):
    data_dict.update(split_labels)

In [None]:
PROCESSED_DATA_PATH = "./Processed Data/"

data_list = list(data_dict.keys())
label_list = list(data_dict.values())

# List the directory once (it was previously re-read for every file) and keep
# the names in a set so each membership test is O(1).
available_audio = set(os.listdir(PROCESSED_DATA_PATH))

# Keep only the (path, label) pairs whose processed audio file exists on disk.
# zip pairs each path with its own label directly, replacing the quadratic
# label_list[data_list.index(data)] lookup.
new_data_list = []
new_label_list = []
for data, label in zip(data_list, label_list):
    if os.path.basename(data) in available_audio:
        new_data_list.append(data)
        new_label_list.append(label)

In [None]:
csv_path = "./Combined Features.csv"
combined_df = pd.read_csv(csv_path)
combined = list(combined_df["Combined"])

# Each "Combined" cell is a string; presumably the printed form of a NumPy
# vector (TODO confirm against whatever wrote the CSV). Newlines become
# spaces and the square brackets are dropped so that the remaining
# whitespace-separated numbers can be parsed as float64.
combined_features = [
    np.fromstring(
        raw_vector.replace('\n', ' ').replace('[', '').replace(']', ''),
        dtype=np.float64,
        sep=' ',
    )
    for raw_vector in combined
]
print(combined_features)
print(len(combined_features[0]))

[array([-4.79988800e+02,  8.19064180e+01,  2.37880670e+01,  2.76477850e+01,
        1.64174310e+01,  1.18718340e+01, -1.46540220e+00,  6.03091720e-01,
        3.43655110e+00, -2.21614880e-01,  1.62698010e+00, -2.90776350e-03,
        1.44289350e+00,  4.51081140e-04,  6.64304240e-04,  7.05572430e-04,
        6.93348760e-04,  6.08195260e-04,  4.96753200e-04,  4.12559780e-04,
        3.60709500e-04,  3.11995860e-04,  2.56050230e-04,  2.16459030e-04,
        2.10880550e-04,  2.17300180e-04, -8.55024120e-05, -9.66644000e-05,
       -4.22024570e-05,  6.02277810e-06,  2.73243350e-05,  2.84198010e-05,
        2.49642820e-05,  2.08131610e-05,  9.92893820e-06, -7.61972980e-06,
       -2.14398770e-05, -2.43459530e-05, -2.29571520e-05,  3.92782060e-01,
        4.06451580e-01,  4.09212560e-01,  4.32155340e-01,  4.71119400e-01,
        5.26509300e-01,  5.63314200e-01,  5.53642150e-01,  4.79845640e-01,
        4.36706070e-01,  4.09527900e-01,  3.95022150e-01,  1.00000000e+00,
       -3.20140654e+00, 

In [None]:
# Feature matrix: one row per parsed feature vector in combined_features.
X = np.asarray(combined_features)

In [None]:
# Label vector built from the classes of the files that exist on disk.
# NOTE(review): X is read from "Combined Features.csv" while Y comes from
# new_label_list; nothing here checks that both are in the same row order.
# Confirm the CSV was generated in new_data_list order.
Y = np.asarray(new_label_list)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and every metric computed from it, is the
# same on each Restart & Run All instead of changing with every execution.
RANDOM_STATE = 42
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size = 0.2, random_state = RANDOM_STATE)

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The feature columns differ by many orders of magnitude (roughly 1e-6 up to
# several hundred in the printed vectors), and an SVM's kernel is driven by
# distances, so the largest columns would dominate. Standardize each feature
# first; putting the scaler in the pipeline means it is fitted on the training
# split only and then re-applied to the test split.
clf = make_pipeline(StandardScaler(), svm.SVC())

clf.fit(train_X, train_Y)

# Make predictions on the test set
y_pred = clf.predict(test_X)

# Evaluate the model
accuracy = accuracy_score(test_Y, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5263157894736842
