In [4]:
#to convert audio to text
import speech_recognition as sr

def transcribe_audio(mp3_file):
    # Initialize the recognizer
    r = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(mp3_file) as source:
        audio = r.record(source)

    # Perform speech recognition
    try:
        transcription = r.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))

    return ""

# Example usage
mp3_file = "output.mp3"
transcription = transcribe_audio(mp3_file)
print("Transcription:", transcription)


Transcription: this is a very sad drawing


In [5]:
#emotion prediction for tts
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV data
df = pd.read_csv('emotions.csv')

# Replace missing values with an empty string
df['Text'].fillna('', inplace=True)

# Extract input features (X) and labels (y) from the CSV
X = df['Text'].values
y = df['Emotion'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a model on the training data
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Make predictions on new text inputs

text_vectorized = vectorizer.transform([transcription])
predicted_emotion = model.predict(text_vectorized)
print("Predicted emotion:", predicted_emotion[0])

# Evaluate the model on the testing set
y_pred = model.predict(X_test_vectorized)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-Score:', f1)



Predicted emotion: Sad
Accuracy: 0.9937007874015747
Precision: 0.9936991049195774
Recall: 0.9937007874015747
F1-Score: 0.9936895763446699


In [6]:
#Quality Output


import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import librosa
import pyttsx3
import matplotlib.pyplot as plt


def evaluate_text_to_speech_system(text):
    # Initialize the pyttsx3 engine
    engine = pyttsx3.init()

    # Set the rate and volume of the speech
    engine.setProperty('rate', 150)
    engine.setProperty('volume', 1.0)

    # Convert text to speech
    engine.say(text)
    engine.runAndWait()

    """  # Get MOS score from user input
    mos_score = int(input("Please rate the audio quality on a scale of 1 to 5: "))

    return mos_score """
    return 


# Load the model
loaded_model = load_model("E:/Samsung-Test/models/model.h5")

# Load the necessary variables
max_length = 99840  # Max sequence length
num_mfcc = 8  # Number of MFCC coefficients

# Load and preprocess the new audio sample
new_audio_path = 'E:\Samsung\output.mp3'
new_audio, sr = librosa.load(new_audio_path, sr=None, mono=True)
new_audio = librosa.resample(new_audio, orig_sr=sr, target_sr=16000)

# Extract MFCC features for the new audio sample
mfcc = librosa.feature.mfcc(y=new_audio, sr=16000, n_mfcc=num_mfcc)

# Pad or truncate the MFCC features to match the expected shape
if mfcc.shape[1] < max_length:
    mfcc = pad_sequences([mfcc.T], padding='post', maxlen=max_length, dtype='float32').T
elif mfcc.shape[1] > max_length:
    mfcc = mfcc[:, :max_length]

# Reshape the MFCC features to match the model's input shape
mfcc = np.expand_dims(mfcc, axis=0)
mfcc = np.swapaxes(mfcc, 1, 2)  # Swap axes to match the expected shape




# Perform prediction
predictions = loaded_model.predict(mfcc)
print(predictions)

class_names = ['Very Unnatural', 'Unnatural', 'Neutral', 'Natural', 'Completely Natural']
predicted_class_index = np.argmax(predictions[0])
predicted_class_name = class_names[predicted_class_index]

print(" ")
print("The provided audio's naturalness output is: ",predicted_class_name)
print("Predicted emotion:", predicted_emotion[0])


if (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Excited'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is filled with joy and excitement."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Excited'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is filled with joy and excitement."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Excited'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is filled with joy and excitement."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Excited'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is filled with joy and excitement."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Excited'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is filled with joy and excitement."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Happy'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is happy."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Happy'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is happy."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Happy'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is happy."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Happy'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is happy."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Happy'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is happy."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Anger'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is anger."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Anger'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is anger."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Anger'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is anger."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Anger'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is anger."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Anger'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is anger."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Base'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is base."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Base'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is base."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Base'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is base."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Base'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is base."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Base'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is base."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Calm'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is calm."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Calm'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is calm."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Calm'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is calm."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Calm'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is calm."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Calm'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is calm."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Surprise'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is surprise."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Surprise'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is surprise."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Surprise'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is surprise."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Surprise'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is surprise."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Surprise'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is surprise."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Fear'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is fear."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Fear'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is fear."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Fear'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is fear."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Fear'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is fear."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Fear'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is fear."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Apologetic'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is apologetic."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Apologetic'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is apologetic."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Apologetic'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is apologetic."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Apologetic'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is apologetic."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Apologetic'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is apologetic."
elif (predicted_class_name, predicted_emotion[0]) == ('Very Unnatural', 'Sad'):
    text = "The naturalness of the provided audio is unauthentic and bad, the predicted emotion is Sad."
elif (predicted_class_name, predicted_emotion[0]) == ('Unnatural', 'Sad'):
    text = "The naturalness of the provided audio is unauthentic and ok, the predicted emotion is Sad."
elif (predicted_class_name, predicted_emotion[0]) == ('Neutral', 'Sad'):
    text = "The naturalness of the provided audio is not great, the predicted emotion is Sad."
elif (predicted_class_name, predicted_emotion[0]) == ('Natural', 'Sad'):
    text = "The naturalness of the provided audio is authentic and good, the predicted emotion is Sad."
elif (predicted_class_name, predicted_emotion[0]) == ('Completely Natural', 'Sad'):
    text = "The naturalness of the provided audio is authentic and very natural, the predicted emotion is Sad."
else:
    text = "No text available for the given combination."

print (text)
evaluate_text_to_speech_system(text)

[[0.131831   0.10057315 0.2651936  0.29380453 0.20859776]]
 
The provided audio's naturalness output is:  Natural
Predicted emotion: Sad
The naturalness of the provided audio is authentic and good, the predicted emotion is Sad.


In [2]:
# First Trial
from flask import Flask, request, jsonify
import os
import librosa
import numpy as np
from tensorflow import keras
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import speech_recognition as sr
import os
from pydub import AudioSegment


# Load the emotion recognition model
model = keras.models.load_model('E:\Samsung-Test\TEST\models\model.h5')

# Initialize the Flask app
app = Flask(__name__)


""" def transcribe_audio(mp3_file):
    wav_file = 'temp_audio.wav'

    # Convert MP3 to WAV using pydub
    audio = AudioSegment.from_mp3(mp3_file)
    audio.export(wav_file, format='wav')
    
    wav_file = 'temp_audio.wav'
    # Transcribe the audio file using SpeechRecognition
    r = sr.Recognizer()
    with sr.AudioFile(wav_file) as source:
        audio_data = r.record(source)
        transcription = r.recognize_google(audio_data)

    # Remove the temporary WAV file
    os.remove(wav_file)

    return transcription """






""" v11def transcribe_audio(mp3_file):
    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(mp3_file) as source:
        audio = recognizer.record(source)

    # Perform speech recognition
    try:
        transcription = recognizer.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))
    return "" """

def transcribe_audio(mp3_file):
    # Convert audio file to PCM WAV format using FFmpeg
    wav_file = 'temp_audio.wav'
    subprocess.run(['ffmpeg', '-i', mp3_file, '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wav_file], check=True)

    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(wav_file) as source:
        audio = recognizer.record(source)

    # Perform speech recognition
    try:
        transcription = recognizer.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))
    finally:
        # Clean up the temporary WAV file
        os.remove(wav_file)

    return 


def predict_emotion(text):
    # Load the CSV data
    df = pd.read_csv('E:\Samsung-Test\TEST\emotions.csv')

    # Replace missing values with an empty string
    df['Text'].fillna('', inplace=True)

    # Extract input features (X) and labels (y) from the CSV
    X = df['Text'].values
    y = df['Emotion'].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train a model on the training data
    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)

    # Make predictions on new text inputs
    text_vectorized = vectorizer.transform([text])
    predicted_emotion = model.predict(text_vectorized)
    
    return predicted_emotion[0]

def predict_naturalness(audio_path):
    # Load the necessary variables
    max_length = 99840  # Max sequence length
    num_mfcc = 8  # Number of MFCC coefficients

    # Load and preprocess the new audio sample
    new_audio, sr = librosa.load(audio_path, sr=None, mono=True)
    new_audio = librosa.resample(new_audio, orig_sr=sr, target_sr=16000)

    # Extract MFCC features for the new audio sample
    mfcc = librosa.feature.mfcc(y=new_audio, sr=16000, n_mfcc=num_mfcc)

    # Pad or truncate the MFCC features to match the expected shape
    if mfcc.shape[1] < max_length:
        mfcc = pad_sequences([mfcc.T], padding='post', maxlen=max_length, dtype='float32').T
    elif mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]

    # Reshape the MFCC features to match the model's input shape
    mfcc = np.expand_dims(mfcc, axis=0)
    mfcc = np.swapaxes(mfcc, 1, 2)  # Swap axes to match the expected shape

    # Perform prediction
    predictions = model.predict(mfcc)
    class_names = ['Very Unnatural', 'Unnatural', 'Neutral', 'Natural', 'Completely Natural']
    predicted_class_index = np.argmax(predictions[0])
    predicted_class_name = class_names[predicted_class_index]

    return predicted_class_name

@app.route('/process-audio', methods=['POST'])
def process_audio():
    # Check if the audio file is present in the request
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file found'})

    audio_file = request.files['audio']

    # Save the audio file temporarily
    audio_path = 'temp_audio.wav'
    audio_file.save(audio_path)

    # Perform speech recognition
    transcription = transcribe_audio(audio_path)

    # Perform emotion recognition
    predicted_emotion = predict_emotion(transcription)

    # Perform naturalness classification
    predicted_naturalness = predict_naturalness(audio_path)

    # Clean up the temporary audio file
    os.remove(audio_path)

    # Return the results as JSON
    result = {
        'naturalness': predicted_naturalness,
        'emotion': predicted_emotion
    }

    return jsonify(result)

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [1]:
from flask import Flask, request, jsonify
import os
import numpy as np
from tensorflow import keras
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import speech_recognition as sr
from pydub import AudioSegment
from flask import Flask, request, jsonify
import os
from pydub import AudioSegment
import speech_recognition as sr
import librosa

# Load the emotion recognition model
model = keras.models.load_model('E:\Samsung-Test\TEST\models\model.h5')


# Initialize the Flask app
app = Flask(__name__)

def transcribe_audio(mp3_file):
    # Convert audio file to WAV format using FFmpeg
    wav_file = 'temp_audio.wav'
    ffmpeg_path = 'ffmpeg'  # Update with the correct path to ffmpeg if necessary
    command = [ffmpeg_path, '-i', mp3_file, '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wav_file]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, error = process.communicate()

    if process.returncode != 0:
        print(f"Audio conversion failed: {error.decode('utf-8').strip()}")
        return None

    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(wav_file) as source:
        audio = recognizer.record(source)

    # Perform speech recognition
    try:
        transcription = recognizer.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))
    finally:
        # Clean up the temporary WAV file
        os.remove(wav_file)

    return "Function passed"



def predict_emotion(text):
    # Load the CSV data
    df = pd.read_csv('E:\Samsung-Test\TEST\emotions.csv')

    # Replace missing values with an empty string
    df['Text'].fillna('', inplace=True)

    # Extract input features (X) and labels (y) from the CSV
    X = df['Text'].values
    y = df['Emotion'].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train a model on the training data
    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)

    # Make predictions on new text inputs
    text_vectorized = vectorizer.transform([text])
    predicted_emotion = model.predict(text_vectorized)

    return predicted_emotion[0]


def predict_naturalness(audio_path):
    # Load the necessary variables
    max_length = 99840  # Max sequence length
    num_mfcc = 8  # Number of MFCC coefficients

    # Load and preprocess the new audio sample
    new_audio, sr = librosa.load(audio_path, sr=None, mono=True)
    new_audio = librosa.resample(new_audio, orig_sr=sr, target_sr=16000)

    # Extract MFCC features for the new audio sample
    mfcc = librosa.feature.mfcc(y=new_audio, sr=16000, n_mfcc=num_mfcc)

    # Pad or truncate the MFCC features to match the expected shape
    if mfcc.shape[1] < max_length:
        mfcc = pad_sequences([mfcc.T], padding='post', maxlen=max_length, dtype='float32').T
    elif mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]

    # Reshape the MFCC features to match the model's input shape
    mfcc = np.expand_dims(mfcc, axis=0)
    mfcc = np.swapaxes(mfcc, 1, 2)  # Swap axes to match the expected shape

    # Perform prediction
    predictions = model.predict(mfcc)
    class_names = ['Very Unnatural', 'Unnatural', 'Neutral', 'Natural', 'Completely Natural']
    predicted_class_index = np.argmax(predictions[0])
    predicted_class_name = class_names[predicted_class_index]

    return predicted_class_name


@app.route('/process-audio', methods=['POST'])
def process_audio():
    # Check if the audio file is present in the request
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file found'})

    audio_file = request.files['audio']

    # Save the audio file temporarily
    audio_path = 'temp_audio.mp3'
    audio_file.save(audio_path)

    # Perform speech recognition
    transcription = transcribe_audio(audio_path)

    # Perform emotion recognition
    predicted_emotion = predict_emotion(transcription)

    # Perform naturalness classification
    predicted_naturalness = predict_naturalness(audio_path)

    # Clean up the temporary audio file
    os.remove(audio_path)

    # Return the results as JSON
    result = {
        'naturalness': predicted_naturalness,
        'emotion': predicted_emotion
    }

    return jsonify(result)


if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit




127.0.0.1 - - [16/Jun/2023 17:50:41] "POST /process-audio HTTP/1.1" 200 -




127.0.0.1 - - [16/Jun/2023 17:52:04] "POST /process-audio HTTP/1.1" 200 -




127.0.0.1 - - [16/Jun/2023 17:53:26] "POST /process-audio HTTP/1.1" 200 -


In [10]:
# uding the ffmpeg udating the code
#Latest half working code

from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import numpy as np
from tensorflow import keras
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import speech_recognition as sr
from pydub import AudioSegment
import librosa

# Load the emotion recognition model
model = keras.models.load_model('E:\Samsung-Test\TEST\models\model.h5')

# Initialize the Flask app
app = Flask(__name__)
CORS(app)  # Enable CORS

def transcribe_audio(mp3_file):
    # Convert audio file to WAV format using FFmpeg
    wav_file = 'temp_audio.wav'
    ffmpeg_path = 'ffmpeg'  # Update with the correct path to ffmpeg if necessary
    command = [ffmpeg_path, '-i', mp3_file, '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wav_file]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, error = process.communicate()

    if process.returncode != 0:
        print(f"Audio conversion failed: {error.decode('utf-8').strip()}")
        return None

    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(wav_file) as source:
        audio = recognizer.record(source)

    # Perform speech recognition
    try:
        transcription = recognizer.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))
    finally:
        # Clean up the temporary WAV file
        os.remove(wav_file)

    return "Function passed"


def predict_emotion(text):
    # Load the CSV data
    df = pd.read_csv('E:\Samsung-Test\TEST\emotions.csv')

    # Replace missing values with an empty string
    df['Text'].fillna('', inplace=True)

    # Extract input features (X) and labels (y) from the CSV
    X = df['Text'].values
    y = df['Emotion'].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train a model on the training data
    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)

    # Make predictions on new text inputs
    text_vectorized = vectorizer.transform([text])
    predicted_emotion = model.predict(text_vectorized)

    return predicted_emotion[0]


def predict_naturalness(audio_path):
    # Load the necessary variables
    max_length = 99840  # Max sequence length
    num_mfcc = 8  # Number of MFCC coefficients

    # Load and preprocess the new audio sample
    new_audio, sr = librosa.load(audio_path, sr=None, mono=True)
    new_audio = librosa.resample(new_audio, orig_sr=sr, target_sr=16000)

    # Extract MFCC features for the new audio sample
    mfcc = librosa.feature.mfcc(y=new_audio, sr=16000, n_mfcc=num_mfcc)

    # Pad or truncate the MFCC features to match the expected shape
    if mfcc.shape[1] < max_length:
        mfcc = pad_sequences([mfcc.T], padding='post', maxlen=max_length, dtype='float32').T
    elif mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]

    # Reshape the MFCC features to match the model's input shape
    mfcc = np.expand_dims(mfcc, axis=0)
    mfcc = np.swapaxes(mfcc, 1, 2)  # Swap axes to match the expected shape

    # Perform prediction
    predictions = model.predict(mfcc)
    class_names = ['Very Unnatural', 'Unnatural', 'Neutral', 'Natural', 'Completely Natural']
    predicted_class_index = np.argmax(predictions[0])
    predicted_class_name = class_names[predicted_class_index]

    return predicted_class_name


@app.route('/process-audio', methods=['POST'])
def process_audio():
    # Check if the audio file is present in the request
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file found'})

    audio_file = request.files['audio']

    # Save the audio file temporarily
    audio_path = 'temp_audio.mp3'
    audio_file.save(audio_path)

    # Perform speech recognition
    transcription = transcribe_audio(audio_path)

    # Perform emotion recognition
    predicted_emotion = predict_emotion(transcription)

    # Perform naturalness classification
    predicted_naturalness = predict_naturalness(audio_path)

    # Clean up the temporary audio file
    os.remove(audio_path)

    # Return the results as JSON
    result = {
        'naturalness': predicted_naturalness,
        'emotion': predicted_emotion
    }

    return jsonify(result)


if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit




127.0.0.1 - - [16/Jun/2023 18:08:55] "POST /process-audio HTTP/1.1" 200 -


In [1]:
#Using flask updated
#Trying to debug

from flask import Flask, request, jsonify, render_template
import os
import numpy as np
from tensorflow import keras
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import speech_recognition as sr
from pydub import AudioSegment
import librosa

# Load the emotion recognition model
model = keras.models.load_model('E:\Samsung-Test\TEST\models\model.h5')


# Initialize the Flask app
app = Flask(__name__)

def transcribe_audio(mp3_file):
    # Convert audio file to WAV format using FFmpeg
    wav_file = 'temp_audio.wav'
    ffmpeg_path = 'ffmpeg'  # Update with the correct path to ffmpeg if necessary
    command = [ffmpeg_path, '-i', mp3_file, '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wav_file]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, error = process.communicate()

    if process.returncode != 0:
        print(f"Audio conversion failed: {error.decode('utf-8').strip()}")
        return None

    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Load the audio file
    with sr.AudioFile(wav_file) as source:
        audio = recognizer.record(source)

    # Perform speech recognition
    try:
        transcription = recognizer.recognize_google(audio)
        return transcription
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from speech recognition service; {0}".format(e))
    finally:
        # Clean up the temporary WAV file
        os.remove(wav_file)

    return "Function passed"



def predict_emotion(text):
    # Load the CSV data
    df = pd.read_csv('E:\Samsung-Test\TEST\emotions.csv')

    # Replace missing values with an empty string
    df['Text'].fillna('', inplace=True)

    # Extract input features (X) and labels (y) from the CSV
    X = df['Text'].values
    y = df['Emotion'].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train a model on the training data
    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)

    # Make predictions on new text inputs
    text_vectorized = vectorizer.transform([text])
    predicted_emotion = model.predict(text_vectorized)

    return predicted_emotion[0]


def predict_naturalness(audio_path):
    # Load the necessary variables
    max_length = 99840  # Max sequence length
    num_mfcc = 8  # Number of MFCC coefficients

    # Load and preprocess the new audio sample
    new_audio, sr = librosa.load(audio_path, sr=None, mono=True)
    new_audio = librosa.resample(new_audio, orig_sr=sr, target_sr=16000)

    # Extract MFCC features for the new audio sample
    mfcc = librosa.feature.mfcc(y=new_audio, sr=16000, n_mfcc=num_mfcc)

    # Pad or truncate the MFCC features to match the expected shape
    if mfcc.shape[1] < max_length:
        mfcc = pad_sequences([mfcc.T], padding='post', maxlen=max_length, dtype='float32').T
    elif mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]

    # Reshape the MFCC features to match the model's input shape
    mfcc = np.expand_dims(mfcc, axis=0)
    mfcc = np.swapaxes(mfcc, 1, 2)  # Swap axes to match the expected shape

    # Perform prediction
    predictions = model.predict(mfcc)
    class_names = ['Very Unnatural', 'Unnatural', 'Neutral', 'Natural', 'Completely Natural']
    predicted_class_index = np.argmax(predictions[0])
    predicted_class_name = class_names[predicted_class_index]

    return predicted_class_name


# UPLOAD_FOLDER = r'E:\Samsung-Test\TEST\backend.ipynb'
# app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

@app.route('/home')
def index():
    return render_template('index.html')


@app.route('/process-audio', methods=['POST'])
def process_audio():
    # Check if the audio file is present in the request
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file found'})

    audio_file = request.files['audio']

    # Save the audio file temporarily
    audio_path = 'temp_audio.mp3'
    audio_file.save(audio_path)

    try:
        # Perform speech recognition
        transcription = transcribe_audio(audio_path)

        # Perform emotion recognition
        predicted_emotion = predict_emotion(transcription)

        # Perform naturalness classification
        predicted_naturalness = predict_naturalness(audio_path)

        # Clean up the temporary audio file
        os.remove(audio_path)

        # Return the results as JSON
        result = {
            'naturalness': predicted_naturalness,
            'emotion': predicted_emotion
        }

        return jsonify(result)
    except Exception as e:
        return jsonify({'error': str(e)})


if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
