In [26]:
from os import path
import sounddevice as sd
import scipy.io.wavfile as wav
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from lang_trans.arabic import buckwalter
from nltk import edit_distance
from tqdm import tqdm
import pyquran as q

In [27]:
def record(seconds=5, fs=16000):
    """Record a mono clip from the default audio input and return it.

    Args:
        seconds: Length of the recording in seconds (default 5).
        fs: Sample rate in Hz (default 16000). The prediction step in this
            notebook feeds the model 16 kHz audio, so keep the default when
            the result is passed on to ``predict``.

    Returns:
        A ``(fs, samples)`` tuple, where ``samples`` is the first (and only)
        channel of the recording as a 1-D array.
    """
    print("Recording...")
    # One channel is requested, so the result has a single column.
    myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
    sd.wait()  # Block until the recording has finished
    print("Finished recording.")
    return fs, myrecording[:, 0]

In [28]:
def load_Quran_fine_tuned_elgeish_xlsr_53_model_and_processor():
    """Load the "Nuwaisir/Quran_speech_recognizer" checkpoint into the globals.

    Sets the module-level ``loaded_model`` (switched to eval mode) and
    ``loaded_processor``, replacing whatever was loaded before.
    """
    global loaded_model, loaded_processor
    checkpoint = "Nuwaisir/Quran_speech_recognizer"
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    loaded_model = model.eval()
    loaded_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [29]:
def load_elgeish_xlsr_53_model_and_processor():
    """Load the "elgeish/wav2vec2-large-xlsr-53-arabic" checkpoint into the globals.

    Sets the module-level ``loaded_model`` (switched to eval mode) and
    ``loaded_processor``, replacing whatever was loaded before.
    """
    global loaded_model, loaded_processor
    checkpoint = "elgeish/wav2vec2-large-xlsr-53-arabic"
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    loaded_model = model.eval()
    loaded_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [30]:
def predict(single):
    """Transcribe one recording with the currently loaded model and processor.

    Requires ``loaded_model`` and ``loaded_processor`` to have been set by one
    of the ``load_*`` functions.

    Args:
        single: Dict with the audio samples under ``"speech"`` and, optionally,
            their sample rate in Hz under ``"sampling_rate"``. When the rate
            is absent, 16000 is assumed.

    Returns:
        The same dict, with the transcript added under ``"predicted"``. The
        decoded model output is passed through ``buckwalter.untrans`` before
        it is stored.
    """
    # Use the rate that came with the audio instead of a hard-coded value, so
    # the processor is told the truth about the input.
    # NOTE(review): the processor is expected to reject a rate that differs
    # from the one the model was trained on - confirm against the
    # transformers documentation.
    sampling_rate = single.get("sampling_rate", 16000)
    inputs = loaded_processor(single["speech"], sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only
        predicted = torch.argmax(loaded_model(inputs.input_values).logits, dim=-1)
    predicted[predicted == -100] = loaded_processor.tokenizer.pad_token_id  # see fine-tuning script
    # A single recording was passed in, so take the first decoded string.
    pred_1 = loaded_processor.tokenizer.batch_decode(predicted)[0]
    single["predicted"] = buckwalter.untrans(pred_1)
    return single

In [31]:
def last_para_str(taskeel=False):
    """Return suras 78 through 114 as one space-separated string.

    The items returned by ``get_sura`` for each sura (presumably its verses)
    are joined with single spaces, and every sura is followed by one space,
    so the result ends with a trailing space.

    Args:
        taskeel: Forwarded to ``get_sura`` as ``with_tashkeel``.
    """
    sura_texts = [
        ' '.join(q.quran.get_sura(sura_no, with_tashkeel=taskeel, basmalah=False)) + ' '
        for sura_no in range(78, 115)
    ]
    return ''.join(sura_texts)

def find_match_2(q_str, s, spaces, threshhold = 10):
    """Find where in ``q_str`` the text ``s`` matches best by edit distance.

    For each index ``i`` in ``spaces``, a window of ``q_str`` starting at
    ``i + 1`` and spanning ``len(s) + len(s) // 3`` characters is compared
    against ``s``. The window is cut short at the end of ``q_str``.

    Args:
        q_str: The text to search in.
        s: The text to look for.
        spaces: Indices of the separators in ``q_str``; each candidate window
            starts one character after such an index (so ``-1`` yields a
            window starting at position 0).
        threshhold: Currently unused; kept so existing callers keep working.

    Returns:
        ``(min_dist, positions)``: the smallest edit distance found and the
        list of window start positions that reach it. When no window could be
        scored the result is ``(1000000000, [])``.
    """
    len_q = len(q_str)
    window = len(s) + len(s) // 3
    min_dist = 1000000000
    min_dist_pos = []
    for i in tqdm(spaces):
        j = i + 1
        if j >= len_q:
            # Nothing follows this separator (e.g. a trailing space).
            continue
        # Clamp the window instead of stopping the scan: stopping here meant
        # the word starts near the end of the text were never scored, so a
        # recitation of the final words could not be matched.
        k = min(j + window, len_q)
        dist = edit_distance(q_str[j:k], s)
        if dist < min_dist:
            min_dist = dist
            min_dist_pos = [j]
        elif dist == min_dist:
            min_dist_pos.append(j)
    return min_dist, min_dist_pos

def find_all_index(s, ch):
    """Return the indices of every element of ``s`` that equals ``ch``."""
    positions = []
    for idx, item in enumerate(s):
        if item == ch:
            positions.append(idx)
    return positions

In [32]:
# Reference text to search in, built with taskeel=True.
last_para = last_para_str(taskeel=True)
# Candidate separators: every space, plus a leading -1 so that position 0
# (-1 + 1) is also treated as the start of a word.
last_para_spaces = [-1] + find_all_index(last_para, ' ')

In [33]:
def pipeline():
    """Record a clip, transcribe it, and print the closest passages.

    Prints the transcript, the best edit distance found in ``last_para``, and
    up to 200 characters of ``last_para`` from each best-matching position.
    """
    sample_rate, audio = record()
    result = predict({"speech": audio, "sampling_rate": sample_rate})
    transcript = result["predicted"]
    print(transcript)
    best_distance, start_positions = find_match_2(last_para, transcript, spaces=last_para_spaces)
    print("distance:", best_distance)
    for start in start_positions:
        print(last_para[start:start + 200], '\n')


### Load the elgeish_xlsr_53 model

In [None]:
# Both loaders write to the same globals (loaded_model / loaded_processor),
# so whichever loader ran last decides which model predict() uses.
load_elgeish_xlsr_53_model_and_processor()

### Load Quran fine-tuned elgeish_xlsr_53 model

In [34]:
# Replaces any previously loaded model/processor with the
# "Nuwaisir/Quran_speech_recognizer" checkpoint.
load_Quran_fine_tuned_elgeish_xlsr_53_model_and_processor()

In [39]:
# Run this cell, then start reciting right away: the first 5 seconds of audio
# are recorded. The transcript, the best edit distance, and the matching
# passage(s) are printed below. One of the load cells above must have been
# run first so that a model and processor are available.
pipeline()

Recording...
Finished recording.
وَالشَّمْسِ وَضُوحَاهَا


100%|█████████▉| 2305/2309 [00:02<00:00, 1113.52it/s]

distance: 10
وَالشَّمْسِ وَضُحَىهَا وَالْقَمَرِ إِذَا تَلَىهَا وَالنَّهَارِ إِذَا جَلَّىهَا وَالَّيْلِ إِذَا يَغْشَىهَا وَالسَّمَاءِ وَمَا بَنَىهَا وَالْأَرْضِ وَمَا طَحَىهَا وَنَفْسٍ وَمَا سَوَّىهَا فَأَلْهَمَهَا 




