# Automatic Speech Recognition 
 

## First step:
Extract the audio track from the video.

In [None]:
import moviepy.editor as mp
import librosa as lr
import matplotlib.pyplot as plt
import numpy as np
import librosa.display as display
import scipy
from IPython.display import Audio
import pydub
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr
import os
import jiwer
print("Done! necessary libraries are imported")



### Extracting the audio from the video file

In [None]:
# Pull the audio track out of the video and store it as a WAV file.
clip = mp.VideoFileClip(r"iceland.mp4")
try:
    # A video without an audio track exposes `audio` as None; fail with a
    # clear message instead of an AttributeError on None.
    if clip.audio is None:
        raise ValueError("iceland.mp4 has no audio track to extract")
    clip.audio.write_audiofile(r"converted.wav")
finally:
    # Release the reader resources held by the clip, even if writing failed.
    clip.close()
# Last expression of the cell: renders an audio player for the extracted file.
Audio("converted.wav")

### Signal visualization

In [None]:
# Load the extracted audio: waveform samples plus the sampling rate.
samples, sample_rate = lr.load("converted.wav")
n_samples = len(samples)
print("number of samples is:", n_samples)

# Plot a fixed excerpt (samples 500000 to 600000) so the waveform is readable.
excerpt = samples[500000:600000]
plt.figure(figsize=(20, 6))
plt.plot(excerpt)



### Plot the audio in the time domain

In [None]:

# Time stamp of every sample: sample index divided by the sampling rate.
time = np.arange(len(samples)) / sample_rate
print("n_time_points:", len(time))

# Full waveform against its time axis.
plt.figure(figsize=(20, 6))
plt.plot(time, samples)


In [None]:
# Duration = number of time points divided by the sampling rate.
n_time_points = time.shape[0]
total_duration = n_time_points / sample_rate
print("total_duration in seconds :", total_duration)



### Short-time Fourier transform

In [None]:
# Compute the STFT magnitude and convert it to dB relative to the peak value.
y = samples
hop_length = 512
# `window="hann"` lets librosa build the Hann window itself. The previous
# `scipy.signal.hanning` callable is no longer available in current SciPy
# releases, and a bare `import scipy` does not guarantee that the `signal`
# submodule is loaded.
stft_magnitude = np.abs(
    lr.stft(y, n_fft=1024, hop_length=hop_length, window="hann")
)
stft_feature = lr.amplitude_to_db(stft_magnitude, ref=np.max)

# Plot Spectrogram
plt.figure(figsize=(20, 3))
# Pass the real sampling rate and hop length so the axes stay correct even if
# the audio is not loaded at specshow's default settings.
display.specshow(
    stft_feature,
    sr=sample_rate,
    hop_length=hop_length,
    y_axis='log',
    x_axis='time',
)
plt.colorbar(format='%+2.0f dB')
plt.tight_layout()
plt.show()

### Split the audio into segments of 3-5 seconds

In [None]:
# Split the extracted audio on silence, merge segments that are too short,
# and export the merged segments as individual WAV files.
sound = AudioSegment.from_wav(file="converted.wav")
print("total duration:", sound.duration_seconds)

# split on silence
chunks = split_on_silence(sound, min_silence_len=200, silence_thresh=-34, keep_silence=True)

# durations of the raw segments, kept for the report printed below
durations_b_m = [chunk.duration_seconds for chunk in chunks]

# remerge segments if duration less than 3 second
duration_minimun = 3
chunks_merged = []
merging_chunk = None  # audio accumulated so far that is still too short

for chunk in chunks:
    current_chunk = chunk if merging_chunk is None else merging_chunk + chunk

    if current_chunk.duration_seconds > duration_minimun:
        chunks_merged.append(current_chunk)
        merging_chunk = None
    else:
        merging_chunk = current_chunk

# Audio left over after the loop never reached the minimum duration. Attach it
# to the last merged segment (or keep it alone if there is none) so the end of
# the recording is not silently dropped.
if merging_chunk is not None:
    if chunks_merged:
        chunks_merged[-1] = chunks_merged[-1] + merging_chunk
    else:
        chunks_merged.append(merging_chunk)

# saving the segments after merging as wav file
os.makedirs("chunks", exist_ok=True)  # export fails if the folder is missing
durations = []
for i, chunk in enumerate(chunks_merged):
    durations.append(chunk.duration_seconds)
    chunk.export("chunks/chunk{:02d}.wav".format(i), format="wav")


print("total number of chunks before merging is:", len(chunks))
print("total number of chunks after merging is:", len(chunks_merged))

print("durations of segments before merging in seconds")
for i, d in enumerate(durations_b_m[0:5]):
    print("segment %d: %.2f "%(i,d))

print("duration of segments after merging in seconds")
for i, d in enumerate(durations[0:10]):
    print("segment %d: %.2f "%(i,d))


In [None]:
# Render an audio player for the exported segment file chunk00.wav.
Audio("chunks/chunk00.wav")


In [None]:
# Render an audio player for the exported segment file chunk01.wav.
Audio("chunks/chunk01.wav")

In [None]:
# Render an audio player for the exported segment file chunk03.wav.
Audio("chunks/chunk03.wav")

### Transcribe the segments using the SpeechRecognition Python library

In [None]:
# Transcribe every exported WAV segment and save one recognized line per
# segment to recognized_segments.txt.
files = os.listdir("chunks")
# Select the WAV segments explicitly. Dropping the first directory entry
# unconditionally would discard chunk00.wav whenever the folder contains no
# hidden/system file.
sorted_files = sorted(name for name in files if name.lower().endswith(".wav"))
#print("files:", sorted_files)

r = sr.Recognizer()
text_segments = []
for file in sorted_files:
    # Read the whole segment into memory, then release the file handle before
    # the recognition request is made.
    with sr.AudioFile(os.path.join("chunks", file)) as source:
        audio_file = r.record(source)
    try:
        result = r.recognize_google(audio_file, language='de-DE')
    except sr.UnknownValueError:
        # Raised when no speech could be understood in this segment. Record
        # an empty line so output lines stay aligned with the segment files
        # instead of aborting the whole run.
        result = ""
    text_segments.append(result)
    print(result)

# The language is German, so write UTF-8 explicitly to keep umlauts intact
# regardless of the platform default encoding.
with open('recognized_segments.txt', mode='w', encoding='utf-8') as out_file:
    for item in text_segments:
        out_file.write("%s\n" % item)



### Evaluation

In [None]:
# Compare the reference transcript with the recognized text and report the
# word error rate (WER) and match error rate (MER) as percentages.
# Read both files as UTF-8 so non-ASCII characters are decoded the same way on
# every platform; a mis-decoded character would count as a word error.
with open("transcribed.txt", "r", encoding="utf-8") as test:
    refs = test.readlines()
with open("recognized.txt", "r", encoding="utf-8") as pred:
    preds = pred.readlines()

# NOTE(review): the reference is taken from line 0 but the prediction from
# line 1 -- presumably recognized.txt starts with a header line; confirm.
reference = refs[0]
predicted = preds[1]

# NOTE(review): newer jiwer releases appear to replace compute_measures with
# process_words -- confirm against the installed version.
measures = jiwer.compute_measures(reference, predicted)
wer = measures['wer']
mer = measures['mer']
print("Evaluation")
print("word error rate is: ", wer*100)
print("match error rate is: ", mer*100)