# Assignment 1 - Speech Emotion Recognition

In this assignment, I used the libraries **librosa, soundfile, glob, pickle and sklearn** to build a model based on an **MLPClassifier**, and trained it on the **RAVDESS dataset** (Ryerson Audio-Visual Database of Emotional Speech and Song). Any other pre-trained machine learning model that can classify emotions based on the extracted features could also be used.

### Importing libraries

In [None]:
import librosa #to extract the audio data from the MP3 file and perform speech processing techniques on it to identify the emotions
import librosa.display
import soundfile
import os, glob, pickle
import matplotlib.pyplot as plt #to plot a graph to visualize the emotional changes throughout the file
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

### Extracting relevant features from sound file (mfcc, chroma, mel)

In [None]:
# Extract features (mfcc, chroma, mel) from a sound file
def feature_extraction(audio_file, mfcc, chroma, mel):
    """Build a single feature vector describing one sound file.

    The file is read with ``soundfile`` as float32 samples at its own
    sample rate. Every enabled feature is averaged over time and the
    averages are concatenated in the fixed order MFCC, chroma, mel.

    Args:
        audio_file: Path of the sound file to read.
        mfcc: If truthy, include the time-averaged 40 MFCCs.
        chroma: If truthy, include the time-averaged chromagram computed
            from the magnitude STFT.
        mel: If truthy, include the time-averaged mel spectrogram.

    Returns:
        A 1-D NumPy array; it is empty when all three flags are falsy.
    """
    with soundfile.SoundFile(audio_file) as sound_file:
        samples = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels), which is
    # not the layout librosa expects; downmix to mono so every file is
    # processed the same way.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Start from an empty float64 array so the stacked result keeps the same
    # dtype as before, and so "no features requested" still yields an empty
    # array.
    parts = [np.array([])]
    if mfcc:
        mfcc_frames = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfcc_frames.T, axis=0))
    if chroma:
        stft = np.abs(librosa.stft(samples))
        chroma_frames = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        # The signal must be passed as ``y=``: recent librosa releases no
        # longer accept it positionally.
        mel_frames = librosa.feature.melspectrogram(y=samples, sr=sample_rate)
        parts.append(np.mean(mel_frames.T, axis=0))
    return np.hstack(parts)

### Selecting emotions 

In [None]:
# RAVDESS labels listed in code order: the first entry is code '01',
# the last one is code '08'.
_EMOTION_NAMES = (
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
)
# Two-digit emotion code -> emotion label
emotions = {
    f'{code:02d}': name
    for code, name in enumerate(_EMOTION_NAMES, start=1)
}
# Subset of emotions kept for training and evaluation
observed_emotions = ['happy', 'sad', 'angry', 'neutral']

### Loading the data and extracting features for each sound file

In [None]:
def data_loading(test_size=0.2, dataset_dir="/content/drive/MyDrive/Colab Notebooks/Dataset"):
    """Load the dataset, extract features and split into train/test sets.

    Every ``Actor_*/*.wav`` file below ``dataset_dir`` is considered. The
    emotion code is the third dash-separated field of the file name; files
    whose emotion is unknown or not listed in ``observed_emotions`` are
    skipped.

    Args:
        test_size: Fraction of the samples reserved for the test set.
        dataset_dir: Directory that contains the ``Actor_*`` folders.

    Returns:
        The ``train_test_split`` result: ``x_train, x_test, y_train, y_test``.

    Raises:
        ValueError: If no usable sound file was found.
    """
    # Join the path components with the platform separator. The previous
    # pattern hard-coded "\\" inside a POSIX path, which matches nothing on
    # Linux (e.g. Colab).
    pattern = os.path.join(dataset_dir, "Actor_*", "*.wav")

    x, y = [], []
    # glob returns files in arbitrary order; sorting keeps the split driven by
    # random_state reproducible across machines.
    for file in sorted(glob.glob(pattern)):
        fields = os.path.basename(file).split("-")
        # Tolerate files that do not follow the expected naming scheme
        # instead of failing with KeyError/IndexError.
        emotion = emotions.get(fields[2]) if len(fields) > 2 else None
        if emotion not in observed_emotions:
            continue
        x.append(feature_extraction(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)

    if not x:
        raise ValueError(f"No usable .wav files matched {pattern!r}")
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

### Training and testing

In [None]:
# Split the dataset into training and test partitions
# (a quarter of the samples is held out for testing)
TEST_FRACTION = 0.25
x_train, x_test, y_train, y_test = data_loading(test_size=TEST_FRACTION)

In [None]:
# Number of samples in the training and testing datasets
n_train_samples = x_train.shape[0]
n_test_samples = x_test.shape[0]
print((n_train_samples, n_test_samples))

In [None]:
# Width of one feature vector (second dimension of the training matrix)
n_features = x_train.shape[1]
print(f'Features extracted: {n_features}')

### Building model

In [None]:
# Configure the Multi Layer Perceptron classifier: one hidden layer of
# 300 units with an adaptive learning rate.
model = MLPClassifier(
    hidden_layer_sizes=(300,),
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    learning_rate='adaptive',
    max_iter=500,
)

In [None]:
# Fit the classifier on the training partition
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted emotion label for every sample of the test partition
y_pred = model.predict(X=x_test)

### Accuracy

In [None]:
# Share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
# Report the accuracy as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")

## Input audio file

In [None]:
# Location of the recording to analyse (placeholder path)
audio_path = 'path/to/mp3/file.mp3'
# Decode at most the first 60 seconds; y holds the samples, sr the sample rate
y, sr = librosa.load(path=audio_path, duration=60)

### Model prediction

In [None]:
# Extract frame-level features from the audio file
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
# librosa renamed ``feature.rmse`` to ``feature.rms`` and later removed the
# old name; prefer the new name and fall back only on very old releases.
if hasattr(librosa.feature, 'rms'):
    rmse = librosa.feature.rms(y=y)
else:
    rmse = librosa.feature.rmse(y=y)
spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
zcr = librosa.feature.zero_crossing_rate(y)

In [None]:
# Compute the emotions from the features
# The classifier was trained on one vector per clip made of time-averaged
# MFCC (40 coefficients), chroma and mel features, so the input built here
# must have exactly that layout. The recording is cut into fixed-length
# segments and each segment is described the same way, which gives one
# prediction per segment (i.e. emotion as a function of time).
# NOTE(review): librosa.load resamples to its default rate, whereas training
# read every file at its native rate - confirm the two are compatible.
segment_seconds = 3
segment_length = int(segment_seconds * sr)
min_samples = 2048  # librosa's default STFT window; skip shorter leftovers

segment_features = []
for start in range(0, len(y), segment_length):
    segment = y[start:start + segment_length]
    if len(segment) < min_samples:
        continue
    segment_stft = np.abs(librosa.stft(segment))
    segment_mfcc = np.mean(librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=40).T, axis=0)
    segment_chroma = np.mean(librosa.feature.chroma_stft(S=segment_stft, sr=sr).T, axis=0)
    segment_mel = np.mean(librosa.feature.melspectrogram(y=segment, sr=sr).T, axis=0)
    segment_features.append(np.hstack((segment_mfcc, segment_chroma, segment_mel)))

features = np.array(segment_features)
emotion_labels = ['happy', 'sad', 'angry', 'neutral']
# predict() already returns the emotion names the model was fitted on, so no
# argmax / index lookup is needed.
emotion_predictions = model.predict(features) if segment_features else np.array([])
predicted_emotions = list(emotion_predictions)

### Plotting

In [None]:
# Plot the emotions as a function of time
fig, wave_ax = plt.subplots(figsize=(10, 4))
# librosa.display.waveplot was removed in favour of waveshow; use waveshow
# when it exists and fall back to waveplot on older installations.
if hasattr(librosa.display, 'waveshow'):
    librosa.display.waveshow(y, sr=sr, alpha=0.5, ax=wave_ax)
else:
    librosa.display.waveplot(y, sr=sr, alpha=0.5, ax=wave_ax)
# The predictions are categorical labels while the waveform is numeric, so
# the labels get their own y-axis instead of sharing the amplitude axis.
emotion_ax = wave_ax.twinx()
emotion_times = np.linspace(0, len(y) / sr, len(predicted_emotions))
emotion_ax.plot(emotion_times, predicted_emotions, color='r')
wave_ax.set_title('Emotional Changes in the Audio File')
wave_ax.set_xlabel('Time (s)')
wave_ax.set_ylabel('Amplitude')
emotion_ax.set_ylabel('Emotion')
plt.show()

# Assignment 2 - Analysing Company Earnings Calls

### Importing libraries

In [1]:
import pandas as pd
import re
import requests

# For web scraping
from bs4 import BeautifulSoup

### Reading excel file

In [2]:
# Load the corporate announcements workbook into a DataFrame
announcements_path = '/content/drive/MyDrive/Colab Notebooks/corpo_announcements.xlsx'
df = pd.read_excel(announcements_path)

### Regular expression

In [3]:
# Define regular expression patterns for transcript and audio links.
# The URL body excludes whitespace, quotes and angle brackets so that a match
# inside free text or HTML stops at the end of that one link. The previous
# ".*" was greedy across spaces and could swallow everything between the
# first "http" and the last ".pdf"/".mp3" on a line.
transcript = re.compile(r"https?://[^\s<>'\"]+\.pdf")
audio = re.compile(r"https?://[^\s<>'\"]+\.mp3")

In [4]:
# Add the two output columns, initialised to empty strings
for link_column in ('Transcript Link', 'Audio Link'):
    df[link_column] = ''

In [6]:
def _as_text(value):
    """Return a cell value as a string; missing cells (None/NaN) become ''."""
    return '' if pd.isna(value) else str(value)


# Loop through each row of the DataFrame and fill in the link columns
for i, row in df.iterrows():
    # Normalise the cells first: empty Excel cells are read as NaN (a float),
    # which the regex functions cannot search.
    url = _as_text(row['SOURCE'])
    desc = _as_text(row['NEWSSUB'])
    info = _as_text(row['MORE'])

    # Check if the transcript link is in the URL column
    if transcript.search(url):
        df.at[i, 'Transcript Link'] = url
    elif url:
        # If not, scrape the page behind the URL for a transcript link.
        # This stays best-effort: a failed download is reported and the row
        # is simply left without a transcript link.
        try:
            r = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f'Row {i}: could not fetch {url!r}: {exc}')
        else:
            soup = BeautifulSoup(r.content, 'html.parser')
            # href=True skips anchors without an href, which previously
            # raised inside the loop and silently ended the scan.
            for link in soup.find_all('a', href=True):
                href = link['href']
                if transcript.search(href):
                    df.at[i, 'Transcript Link'] = href
                    break

    # Check if the audio link is in the description or info column
    audio_match = audio.search(desc) or audio.search(info)
    if audio_match:
        df.at[i, 'Audio Link'] = audio_match.group()

### Final new excel sheet

In [7]:
# Write the enriched table (original data plus link columns) to a new
# workbook, leaving out the DataFrame index
output_path = 'earnings_calls.xlsx'
df.to_excel(output_path, index=False)