## Importing Data

In [17]:
import librosa
import os
import numpy as np

In [18]:
# Path to the Morse code audio files
DATA_PATH = './morse_code_train_data/'

In [19]:
# Function to extract MFCC features from an audio file
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path)
    # Extract 40 MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Take the mean of the MFCCs over time to reduce the feature dimensionality
    mfccs_mean = np.mean(mfccs.T, axis=0)
    return mfccs_mean

In [20]:
# Load the dataset and extract features
def load_data():
    features = []  # To store MFCC features
    labels = []    # To store the corresponding labels (letters/numbers)
    
    # Loop through all files in the data directory
    for file_name in os.listdir(DATA_PATH):
        if file_name.endswith('.wav'):
            # Extract the label from the file name (e.g., 'A.wav' -> label 'A')
            label = file_name[0]  # Assuming the first character of the filename is the label
            file_path = os.path.join(DATA_PATH, file_name)
            
            # Extract features from the audio file
            mfccs = extract_features(file_path)
            
            # Append the features and label to their respective lists
            features.append(mfccs)
            labels.append(label)
    
    return np.array(features), np.array(labels)

In [21]:
# Call the load_data function to get the features and labels
X, y = load_data()

# Print shapes of X and y to confirm
print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

Features shape: (36, 40), Labels shape: (36,)


## Splitting Data

In [30]:
import librosa
import os
import numpy as np

# Path to the Morse code audio files
DATA_PATH = './morse_code_train_data/'

# Function to extract MFCC features from an audio file
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)

# Load the dataset and extract features
def load_data():
    features = []  # To store MFCC features
    labels = []    # To store the corresponding labels (letters/numbers)
    
    # Loop through all files in the data directory
    for file_name in os.listdir(DATA_PATH):
        if file_name.endswith('.wav'):
            label = file_name[0]  # Assuming the first character of the filename is the label
            file_path = os.path.join(DATA_PATH, file_name)
            mfccs = extract_features(file_path)
            features.append(mfccs)
            labels.append(label)
    
    return np.array(features), np.array(labels)

# Load the full dataset
X, y = load_data()

# Manually specify which files to include in the testing set
# Example: Let's say we want to test on these letters
testing_files = ['A.wav', 'B.wav', 'C.wav', 'Z.wav', 'Y.wav', 'X.wav', '1.wav', '0.wav', 'H.wav']  # Add any letters you want to test

# Initialize lists for training and testing data
X_train, y_train = [], []
X_test, y_test = [], []

# Split the data
for file_name in os.listdir(DATA_PATH):
    if file_name.endswith('.wav'):
        # If the file is in the testing set, add to test data
        if file_name in testing_files:
            index = np.where(y == file_name[0])[0][0]  # Get index of the label in original labels
            X_test.append(X[index])  # Add corresponding features to test set
            y_test.append(y[index])  # Add corresponding label to test set
        
        index = np.where(y == file_name[0])[0][0]  # Get index of the label in original labels
        X_train.append(X[index])  # Add corresponding features to train set
        y_train.append(y[index])  # Add corresponding label to train set

# Convert lists back to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Print shapes to confirm split
print(f"Training Features shape: {X_train.shape}, Training Labels shape: {y_train.shape}")
print(f"Testing Features shape: {X_test.shape}, Testing Labels shape: {y_test.shape}")

Training Features shape: (36, 40), Training Labels shape: (36,)
Testing Features shape: (9, 40), Testing Labels shape: (9,)


## Training the model and testing it

### Support Vector Machine

In [31]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize the SVM classifier
classifier = SVC(kernel='linear')

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Get unique labels from the test data for classification report
target_names = np.unique(y_test)

# Generate classification report
report = classification_report(y_test, y_pred, target_names=target_names)

# Print results
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

Model Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



### Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
report = classification_report(y_test, y_pred)

# Print results
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

Model Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



### K Nearest Neighbours

In [35]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors based on your needs

# Train the classifier on the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
report = classification_report(y_test, y_pred)

# Print results
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

Model Accuracy: 22.22%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.50      1.00      0.67         1
           2       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           A       0.00      0.00      0.00         1
           B       0.00      0.00      0.00         1
           C       0.00      0.00      0.00         1
           H       0.00      0.00      0.00         1
           X       0.00      0.00      0.00         1
           Y       0.00      0.00      0.00         1
           Z       0.00      0.00      0.00         1

    accuracy                           0.22         9
   macro avg       0.12      0.15      0.13         9
weighted avg       0.17      0.22      0.19         9



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [36]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_dt = dt_classifier.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Generate classification report
report_dt = classification_report(y_test, y_pred_dt)

# Print results
print(f"Decision Tree Model Accuracy: {accuracy_dt * 100:.2f}%")
print("Decision Tree Classification Report:")
print(report_dt)

Decision Tree Model Accuracy: 100.00%
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



### Logistic Regression

In [37]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_dt = dt_classifier.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Generate classification report
report_dt = classification_report(y_test, y_pred_dt)

# Print results
print(f"Decision Tree Model Accuracy: {accuracy_dt * 100:.2f}%")
print("Decision Tree Classification Report:")
print(report_dt)

Decision Tree Model Accuracy: 100.00%
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



### Gradient Boosting Classifier

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier()

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_gb = gb_classifier.predict(X_test)

# Evaluate the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Generate classification report
report_gb = classification_report(y_test, y_pred_gb)

# Print results
print(f"Gradient Boosting Model Accuracy: {accuracy_gb * 100:.2f}%")
print("Gradient Boosting Classification Report:")
print(report_gb)

Gradient Boosting Model Accuracy: 100.00%
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



### Naive Bayes Classifier

In [39]:
from sklearn.naive_bayes import GaussianNB

# Initialize the Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier on the training data
nb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_nb = nb_classifier.predict(X_test)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Generate classification report
report_nb = classification_report(y_test, y_pred_nb)

# Print results
print(f"Naive Bayes Model Accuracy: {accuracy_nb * 100:.2f}%")
print("Naive Bayes Classification Report:")
print(report_nb)

Naive Bayes Model Accuracy: 100.00%
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         1
           B       1.00      1.00      1.00         1
           C       1.00      1.00      1.00         1
           H       1.00      1.00      1.00         1
           X       1.00      1.00      1.00         1
           Y       1.00      1.00      1.00         1
           Z       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



## Testing Data on Words

In [32]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import librosa
import numpy as np

# Train the classifier
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Function to predict letters using SVM
def predict_svm(features, classifier, label_encoder):
    # Predict using the trained classifier
    predictions = classifier.predict(features)
    
    # Decode the predictions to actual letter labels
    predicted_letters = label_encoder.inverse_transform(predictions)
    
    return predicted_letters

# Load and preprocess the audio file (user input)
def load_audio(file_path):
    audio, sample_rate = librosa.load(file_path)
    return audio, sample_rate

# Detect pauses based on silence (using RMS method)
def detect_pauses(audio, sample_rate, threshold=0.01):
    rms = librosa.feature.rms(y=audio)[0]
    pauses = np.where(rms < threshold)[0]
    return pauses

# Split the audio into segments by detected pauses
def split_audio_by_pauses(audio, sample_rate, pauses, min_segment_duration=0.2):
    segments = []
    start = 0
    for pause in pauses:
        end = pause * (512 / sample_rate)  # Convert pause index to time
        if end - start >= min_segment_duration:  # Ensure we only take longer segments
            segments.append(audio[int(start * sample_rate):int(end * sample_rate)])
        start = end
    return segments

# Extract MFCC features from each audio segment
def extract_features_from_segments(segments, sample_rate):
    features = []
    for segment in segments:
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=40)
        features.append(np.mean(mfccs.T, axis=0))  # Take mean across MFCCs
    return np.array(features)

# Main function to handle the entire flow
def process_word_audio(file_path, classifier, label_encoder):
    # Step 1: Load the audio file
    audio, sr = load_audio(file_path)

    # Step 2: Detect pauses in the audio
    pauses = detect_pauses(audio, sr)

    # Step 3: Split the audio into segments based on the pauses
    segments = split_audio_by_pauses(audio, sr, pauses)

    # Step 4: Extract MFCC features from each audio segment
    features = extract_features_from_segments(segments, sr)

    # Step 5: Predict the letters using the SVM classifier
    predicted_letters = predict_svm(features, classifier, label_encoder)

    # Step 6: Combine the predicted letters into a word
    predicted_word = ''.join(predicted_letters)

    return predicted_word

# Example usage
file_path = './morse_code_testing/HI.wav'  # Replace with the actual file path
predicted_word = process_word_audio(file_path, classifier, label_encoder)
print(f"Predicted Word: {predicted_word}")

ValueError: y contains previously unseen labels: ['H']

### 