### Setup

In [1]:
## path
path = 'drive/MyDrive/Colab Notebooks/'

In [2]:
%%capture
!pip install datasets # to use
!pip install git+https://github.com/huggingface/transformers # to user huggingface transformer
!pip install jiwer # for wer metric

!pip install -U pip
!pip install -U dill
!pip install -U nltk==3.4

In [3]:
## load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import os.path

import torch

import torch.nn as nn
import torch.nn.functional as F

from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

from datasets import load_metric

import librosa as lb

from sklearn.model_selection import train_test_split

from nltk.util import pad_sequence
from nltk.util import ngrams, bigrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [4]:
# seeding
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed_all(10)

In [5]:
## mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data

In [6]:
## read into memory (small)
df = pd.read_feather('drive/MyDrive/Colab Notebooks/data/ASR_train_audio6683.ft')

In [7]:
## train valid split
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=1234)

### XLSR Model

In [14]:
##
# Load the fine-tuned XLSR checkpoint only if this kernel session has not
# loaded it yet (re-running the cell then skips the expensive load).
if 'XLSRmodel' not in globals():
  print('Load model')
  XLSRmodel = Wav2Vec2ForCTC.from_pretrained(
      './drive/MyDrive/Colab Notebooks/model/wav2vec2-large-xlsr-french-11Apr/checkpoint-1200/'
  ).to("cuda")

# Build the processor (feature extractor + tokenizer) under the same guard.
# NOTE: `tokenizer` and `feature_extractor` are only (re)created inside this
# branch, so they exist only when the processor was built by this cell.
if 'processor' not in globals():
  print('Load processor')
  tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
  feature_extractor = Wav2Vec2FeatureExtractor(
      feature_size=1,
      sampling_rate=16000,
      padding_value=0.0,
      do_normalize=True,
      return_attention_mask=True,
  )
  processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

# prepare dataset
def prepare_dataset(batch):
    """Run one audio signal through the global ``processor``.

    Args:
        batch: the raw audio samples of a single recording (this function is
            applied element-wise to the ``audio_signal`` column).

    Returns:
        Whatever ``processor`` returns when asked for PyTorch tensors at a
        16 kHz sampling rate.
    """
    target_rate = 16*1e3  # 16 kHz, kept as the float the notebook has always passed
    return processor(batch, sampling_rate=target_rate, return_tensors="pt")

# word error rate
wer_metric = load_metric("wer")
wer_ = []

# run every validation recording through the processor up front
input_dict = df_valid['audio_signal'].apply(prepare_dataset)

## WER over everything (one long string)
# All predictions / references are concatenated (space separated) and the WER
# is computed once over the two long strings instead of averaging per sentence.
label_str = ''
pred_str = ''

# Inference only: without no_grad() every forward pass records an autograd
# graph, which wastes GPU memory and time for no benefit.
with torch.no_grad():
  for idx in range(len(df_valid)):
    logits = XLSRmodel(input_dict.values[idx].input_values.to("cuda")).logits
    # greedy decoding: most likely token per frame, first (only) batch entry
    pred_ids = torch.argmax(logits, dim=-1)[0]

    pred_str += processor.decode(pred_ids) + ' '
    label_str += df_valid["transcription"].values[idx].lower() + ' '

wer_.append(wer_metric.compute(predictions=[pred_str], references=[label_str]))

print(np.mean(wer_))

----------------
ban car mooy dem cheikh ahmadou ban mackétaboune<unk> dakar
ban car mooy dem cheikh ahmadou bando mackétaboune<unk> dakar
----------------
rue séne
rue bafaséne
----------------
croisement taly diallo
croisement telry diallo
----------------
place de l’indépendance
place de l<unk>indépendance
----------------
agence axa assurance immo ngor
agen axta asurance immo ngor
----------------
ecole sherif youssou thiaw laye malika
ecole sherif oussénou thiaw laye malika
----------------
croisement camberéne
croisement camberénee
----------------
mbédou
mbédoufass
----------------
depot la layousse faouzy
depot la yousse faouzy
----------------
ndeureuhlou
deureuhlou
----------------
ban bus mooy jaar pont danguou
ban bus mooy jaar pont ndanguo
----------------
rond point les grands moulins de dakar
rond point les grands mouzains de dakar
----------------
lat - urbam
kat - urbam
----------------
rond point terrain dialoré
rond point terrain diadoré
----------------
ministère de

KeyboardInterrupt: ignored

In [11]:
from nltk.probability import FreqDist
from wordcloud import WordCloud, ImageColorGenerator

# Collect every lower-cased, whitespace-separated token of all transcriptions.
words = df['transcription']
allwords = [token for transcription in words for token in transcription.lower().split()]

# histogram: (word, count) pairs for the (up to) 10000 most frequent words,
# unzipped into parallel tuples of words (xv) and counts (yv)
mostcommon_small = FreqDist(allwords).most_common(10000)
xv, yv = zip(*mostcommon_small)

In [227]:
wer_metric.compute(predictions=['hann bel-air'], references=['hann bel - air'])

0.75

In [None]:
0.07964882287993731

### Language Model

In [None]:
# to be considered
# model produces output of the form (CTC)
# <pad> <pad> <pad> <pad> <pad> <pad> r <pad> <pad> o u u <pad> <pad> t <pad> <pad> e <pad>
# => do we perform beam search on this sequence or first clean up the <pad> tokens?

#### Beam Search

In [9]:
# Beam Search
# https://towardsdatascience.com/boosting-your-sequence-generation-performance-with-beam-search-language-model-decoding-74ee64de435a

import math

def beam_search_decoder(predictions, top_k = 3):
    """Return the ``top_k`` best index sequences for a matrix of probabilities.

    At every time step each surviving sequence is extended by every possible
    token index, re-scored by its summed log-probability, and only the
    ``top_k`` highest-scoring candidates are kept.

    Args:
        predictions: iterable of time steps; each step is a sequence of
            probabilities, one per token index.
        top_k: number of candidate sequences (beams) kept after each step.

    Returns:
        A list of ``(token_indices, log_score)`` tuples sorted by decreasing
        score. When ``predictions`` is empty the single empty hypothesis
        ``[([], 0)]`` is returned.

    Note:
        A probability of exactly 0 (e.g. a softmax output that underflowed)
        scores ``-inf`` instead of making ``math.log`` raise ``ValueError``.
    """
    neg_inf = float('-inf')

    # start with an empty sequence with zero score
    output_sequences = [([], 0)]

    # looping through all the predictions
    for token_probs in predictions:
        # The log of a token's probability does not depend on the beam, so it
        # is computed once per time step rather than once per (beam, token).
        token_log_probs = [math.log(p) if p > 0 else neg_inf for p in token_probs]

        # append every token to every old sequence and re-score
        new_sequences = [
            (old_seq + [char_index], old_score + log_prob)
            for old_seq, old_score in output_sequences
            for char_index, log_prob in enumerate(token_log_probs)
        ]

        # best (highest log-likelihood) first; the sort is stable, so ties keep
        # their generation order
        new_sequences.sort(key=lambda val: val[1], reverse=True)

        # keep only the top-k candidates
        output_sequences = new_sequences[:top_k]

    return output_sequences

In [11]:

# test beam search
idx = 10
nbeams = 10
softmax = nn.Softmax(dim=2)

# Feature-extract only the selected training recording. A dedicated name is
# used so the inputs prepared for other cells are not overwritten, and the
# loop below no longer reuses `idx` (which used to clobber the sample index).
sample_inputs = df_train['audio_signal'][idx:idx+1].apply(prepare_dataset)

# inference only -> no autograd bookkeeping needed
with torch.no_grad():
  for features in sample_inputs.values:
    logits = XLSRmodel(features.input_values.to("cuda")).logits
    # sum_j(output_ij) = 1 where i is column and j is row
    output = softmax(logits) # logits -> probabilities

    # output[0]: the single batch entry, one row of token probabilities per frame
    beams_int = beam_search_decoder(output[0].tolist(), top_k = nbeams) # beams

    # Iterate over the beams actually returned (may be fewer than nbeams).
    # NOTE(review): the recorded output shows identical text for every beam,
    # presumably because different frame-level paths decode to the same string.
    for pred_ids, pred_prob in beams_int:
      print(processor.decode(pred_ids))

marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem
marché yeumbeul laa bëgg dem


#### Language Model

In [12]:
# need to find optimal n of n-gram

## Language Model (n-gram vs KenLM)
# https://surfertas.github.io/deeplearning/pytorch/2017/08/20/n-gram.html # pytorch code (NN parametrization of LM)
# https://web.stanford.edu/~jurafsky/slp3/slides/LM_4.pdf
# https://www.kaggle.com/alvations/n-gram-language-model-with-nltk # code taken from here
# https://web.stanford.edu/~jurafsky/slp3/old_oct19/3.pdf -> improvements to LM's

df_lm = df_train[:5]

from nltk.util import ngrams, bigrams
from nltk.lm.preprocessing import padded_everygram_pipeline

from nltk.lm import MLE

# padding
from nltk.lm.preprocessing import pad_both_ends
#print(list(pad_both_ends(df_lm['transcription'].values[0], n=2)))
#print(list(bigrams(pad_both_ends(df_lm['transcription'].values[0], n=2))))

'''
# materialize
for ngramlize_sent in train_data:
    print(list(ngramlize_sent))
    print()
print('#############')
list(padded_sents)
'''

"\n# materialize\nfor ngramlize_sent in train_data:\n    print(list(ngramlize_sent))\n    print()\nprint('#############')\nlist(padded_sents)\n"

In [124]:
def train_ngram(LMmodel, data):
  '''
  Fit a language model on the individual words of a list of sentences.

  input: model, list of sentences
  output: trained model (the same object that was passed in)

  Every lower-cased word is handed to the n-gram pipeline as its own sequence,
  so the model is trained on what is inside a word (order 2).
  '''
  # all sentences -> one flat list of space-separated tokens
  tokens = ' '.join(data).split(' ')
  lowered_tokens = [token.lower() for token in tokens]

  # padded n-gram stream + vocabulary stream expected by LMmodel.fit
  ngram_stream, vocab_stream = padded_everygram_pipeline(2, lowered_tokens)
  LMmodel.fit(ngram_stream, vocab_stream)

  return LMmodel

In [209]:
# WORD LEVEL LANGUAGE MODEL ################################################
def train_ngram(LMmodel, data):
  '''
  Fit a word-level language model on a list of sentences.

  input: model, list of sentences
  output: trained model (the same object that was passed in)

  Each sentence is split on single spaces and lower-cased; the resulting word
  lists are fed through the padded everygram pipeline with order 3.
  '''
  tokenized_sentences = []
  for sentence in data:
    tokenized_sentences.append([word.lower() for word in sentence.split(' ')])

  # padded n-gram stream + vocabulary stream expected by LMmodel.fit
  ngram_stream, vocab_stream = padded_everygram_pipeline(3, tokenized_sentences)
  LMmodel.fit(ngram_stream, vocab_stream)

  return LMmodel

In [210]:
## language model
# IMPORTANT: there seems to be a mismatch in the vocabulary (44 vs 49 chars)
# -> could lead to the language model not knowing a character.
# NOTE(review): the numbers above look like they come from a character-level
# run; the recorded output below this cell reports a 783-item vocabulary,
# while the tokenizer has 49 entries -- confirm which model this refers to.
LMmodel = MLE(3) # maximum-likelihood n-gram model of order 3
LMmodel = train_ngram(LMmodel, df_train['transcription'].values)
print(LMmodel.vocab)
print(len(tokenizer.get_vocab()))

'''
print(LMmodel.counts['c'])
print(LMmodel.counts[['c']]['o'])  # P('o'|'c')
print(LMmodel.score('o', ['c']))
print(LMmodel.vocab.lookup([char for char in test_lower[5]]))
'''

<Vocabulary with cutoff=1 unk_label='<UNK>' and 783 items>
49


"\nprint(LMmodel.counts['c'])\nprint(LMmodel.counts[['c']]['o'])  # P('o'|'c')\nprint(LMmodel.score('o', ['c']))\nprint(LMmodel.vocab.lookup([char for char in test_lower[5]]))\n"

In [160]:
# vocabulary of language model (extracted from data)
print([ch for ch in LMmodel.vocab])
print("VERY STRANGE THAT THERE IS A c-cedi IN THE VOCABULARY EXTRACTED FROM THE TRAIN DATASET")

['<s>', 'rufsac', '</s>', 'pharmacie', 'talibou', 'dabo', 'avenue', 'faidherbe', 'cité', 'mére', 'thérésa', 'gare', 'de', 'thiaroye', 'rue', 'baffa', 'séne', 'double', 'less', 'grande', 'mosquée', 'derkle', 'thokho', 'tournalou', 'yeumbeul', 'marché', 'laa', 'bëgg', 'dem', 'sonadis', 'rufisque', 'sococim', 'depot', 'layousse', 'faouzy', 'grand', 'dakar', 'fann', 'hock', 'canada', 'taly', 'bu', 'makk', 'pont', 'colobane', 'garage', 'camion', 'vidange', 'hopital', 'jean', 'la', 'fontaine', 'mariste', 'lamine', 'gueye', 'croisement', 'keur', 'massar', 'essence', 'touré', 'comico', 'darou', 'salam', 'parc', 'forestier', 'hann', 'massalikoul', 'jinan', 'ecobank', 'des', 'far', 'dama', 'mame', 'sira', 'ban', 'oto', 'mooy', 'jaar', 'yoff', 'yarakh', 'malicka', 'champ', 'course', 'pikine', 'seydina', 'limamoulaye', 'avenue,', 'edk', 'oil', 'ali', 'baba', 'rond', 'point', 'mbao', 'diaxay', 'lycée', 'thierno', 'seydou', 'nourou', 'tall', 'petit', 'extension', 'bountou', 'ecole', 'les', 'pédagogu

In [99]:
# vocabulary used by tokenizer (french alphabet)
# NOTE: despite its name, `vocab_dict` is a *set*: iterating the mapping
# returned by get_vocab() yields its keys (the token strings), and the braces
# form a set comprehension. The order of the printed list is therefore arbitrary.
vocab_dict = {v for k, v in enumerate(tokenizer.get_vocab())}
vocab_tokenizer = [v.lower() for v in vocab_dict]
print(vocab_tokenizer)

['ê', 'j', 'z', 'd', 'f', 's', 'ÿ', 'u', "'", 'à', 'h', 'û', '<pad>', 'b', 'a', 'â', 'y', 'î', 'i', 'r', '</s>', 'p', 'è', '-', 'w', 'é', '<unk>', 'g', 'ù', 'æ', 'q', 'o', 'ç', '<s>', '|', 'x', 'm', 'ô', 'n', 'c', 'k', 'l', 'œ', 'ü', 'ë', 'e', 't', 'ï', 'v']


#### Pipeline

In [126]:
# perplexity to account for longer sequences
def ngram_logprobability(sentence):
    """Length-normalised bigram likelihood of one sentence.

    Despite the name, the value returned is NOT a log-probability: it is the
    geometric mean of the (smoothed) bigram probabilities, i.e. the inverse
    perplexity, so sentences of different length can be compared.

    Args:
        sentence: list of words, each word being a list of bigrams
            ``(context, token)``.

    Returns:
        ``exp(mean(log(P(token | context) + 1e-8)))`` using the global
        ``LMmodel``; ``0.0`` when the sentence contains no bigram at all.
    """
    log_prob = 0.0
    count = 0
    for words in sentence:
        for ngram in words:
            # the small constant avoids log(0) for bigrams the model never saw
            log_prob += np.log(LMmodel.score(ngram[1], [ngram[0]]) + 1e-8)
            count += 1

    if count == 0:
        # nothing to score (previously a ZeroDivisionError)
        return 0.0

    # Normalise in log space. exp(log_prob) ** (1 / count) is mathematically
    # the same, but exp(log_prob) underflows to 0 for long or unlikely
    # sentences, which made all such candidates indistinguishable.
    return np.exp(log_prob / count)

In [214]:
# WORD LEVEL LANGUAGE MODEL ################################################
def ngram_logprobability(sentence):
    """Length-normalised n-gram likelihood of one sentence (word level).

    Despite the name, the value returned is NOT a log-probability: it is the
    geometric mean of the (smoothed) n-gram probabilities, i.e. the inverse
    perplexity, so sentences of different length can be compared.

    Args:
        sentence: list of n-grams (tuples of words). The last element of each
            tuple is the word being scored, everything before it is its
            context, so any n-gram order works (trigrams in this notebook).

    Returns:
        ``exp(mean(log(P(word | context) + 1e-8)))`` using the global
        ``LMmodel``; ``0.0`` when ``sentence`` is empty.
    """
    log_prob = 0.0
    count = 0
    for ngram in sentence:
        # the small constant avoids log(0) for n-grams the model never saw
        log_prob += np.log(LMmodel.score(ngram[-1], list(ngram[:-1])) + 1e-8)
        count += 1

    if count == 0:
        # nothing to score (previously a ZeroDivisionError)
        return 0.0

    # Normalise in log space. exp(log_prob) ** (1 / count) is mathematically
    # the same, but exp(log_prob) underflows to 0 for long or unlikely
    # sentences, which made all such candidates indistinguishable.
    return np.exp(log_prob / count)

In [200]:
def logprob_sentences(sentences):
  '''
  Score candidate sentences with the character-level bigram language model.

  input: list of sentences (strings)
  output: list with one score per sentence, in input order. Each score is the
          value returned by ngram_logprobability for that sentence (higher
          means more likely); an empty input gives an empty list.
  '''
  scores = []
  for sentence in sentences:
    # split on single spaces and lower-case every word
    words = [word.lower() for word in sentence.split(' ')]

    # word_list(ngrams): padded character bigrams of every word
    word_ngrams = [list(ngrams(pad_both_ends(word, n=2), n=2)) for word in words]

    # The result list is built directly; the old placeholder list relied on
    # `np.infty`, an alias that no longer exists in NumPy 2.x.
    scores.append(ngram_logprobability(word_ngrams))

  return scores

In [215]:
# WORD LEVEL LANGUAGE MODEL ################################################
def logprob_sentences(sentences):
  '''
  Score candidate sentences with the word-level trigram language model.

  input: list of sentences (strings)
  output: list with one score per sentence, in input order. Each score is the
          value returned by ngram_logprobability for that sentence (higher
          means more likely); an empty input gives an empty list.
  '''
  scores = []
  for sentence in sentences:
    # split on single spaces and lower-case every word
    words = [word.lower() for word in sentence.split(' ')]

    # padded word trigrams of the sentence
    sentence_ngrams = list(ngrams(pad_both_ends(words, n=3), n=3))

    # The result list is built directly; the old placeholder list relied on
    # `np.infty`, an alias that no longer exists in NumPy 2.x.
    scores.append(ngram_logprobability(sentence_ngrams))

  return scores

In [171]:
## Unit Tests

In [164]:
df_lm['transcription'].values

array(['Rufsac', 'Pharmacie Talibou Dabo', 'Avenue Faidherbe',
       'Cité mére Thérésa', 'Gare de Thiaroye'], dtype=object)

In [166]:
LMmodel.score('e', ['l', 'y', 'c'])

0.21875

In [191]:
print(logprob_sentences(['lycée', 'lycee', 'lycèe']))
print(logprob_sentences(['lycée camp', 'lycee camp', 'lycèe camp']))

[9.371477854353765e-07, 6.7002595641296955e-06, 9.999999999999982e-09]
[0.0002041969179860819, 7.657042980709192e-07, 9.999999999999994e-09]


In [192]:
print(logprob_sentences(['keur', 'keur', 'keurr']))
print(logprob_sentences(['keur massar', 'keur masar', 'keurr massar']))

[1.3677197907879315e-06, 1.3677197907879315e-06, 9.999999999999982e-09]
[0.05430633274969011, 2.654593871608607e-07, 4.641588849084741e-06]


In [120]:
#
input_dict = df_valid['audio_signal'].apply(prepare_dataset)

In [219]:
# ATTENTION: XLSR model seems to be overconfident -> places all probability mass on one logit
## Predictions
nbeams = 50

# word error rate
wer_ = []

# softmax over the last (token) axis of the (batch, frame, token) logits
softmax = nn.Softmax(dim=2)

## WER over everything (one long string)
label_str = ''
pred_str = ''

# Inference only: without no_grad() every forward pass records an autograd
# graph, which wastes GPU memory and time for no benefit.
with torch.no_grad():
  for idx in range(len(df_valid)):
    logits = XLSRmodel(input_dict.values[idx].input_values.to("cuda")).logits
    # sum_j(output_ij) = 1 where i is column and j is row
    output = softmax(logits) # logits -> probabilities

    # beam search over the single batch entry; indexing with [0] (instead of
    # squeeze) keeps the frame axis even if a clip yields just one frame
    beams_int = beam_search_decoder(output[0].tolist(), top_k = nbeams) # beams

    # decode the beams actually returned (may be fewer than nbeams)
    beams_str = [processor.decode(pred_ids) for pred_ids, pred_prob in beams_int]

    # Rescoring: the candidate with the best language-model score wins.
    # NOTE: the acoustic beam score is not used here, i.e. this is not (yet)
    # P(final) = Alpha * P(model) + Beta * P(L.M.).
    pred_str += beams_str[np.argmax(logprob_sentences(beams_str))] + ' '

    label_str += df_valid["transcription"].values[idx].lower() + ' '

wer_.append(wer_metric.compute(predictions=[pred_str], references=[label_str]))

print(np.mean(wer_))

0.05319896883056011


In [None]:
## spell checking
# 1) train LM as n-gram. prediction with model. run prediction through Levenshtein distance, use n-gram to vote
# 2) use pyspellchecker -> only in French so far
# https://pypi.org/project/pyspellchecker/
# 3) textBlob 
# https://stackabuse.com/spelling-correction-in-python-with-textblob/
# https://github.com/sloria/TextBlob

In [70]:
# Transcribe one validation recording, selected by its ID.
# (`sample_id` instead of `id`, which would shadow the Python builtin for the
# rest of the session.)
sample_id = 'a8d8c6221854d1b721162a4ecfbdf87554e0b39c782ccd1914aeaddf3491a92df99ac7cee4264d5b031b0e779e1e64d7206deca98ea39009e579fb7cab164ffe'

# a dedicated name keeps the inputs prepared for other cells intact
sample_inputs = df_valid[df_valid['ID']==sample_id]['audio_signal'].apply(prepare_dataset)

# inference only -> no autograd bookkeeping needed
with torch.no_grad():
  logits = XLSRmodel(sample_inputs.values[0].input_values.to("cuda")).logits

# greedy decoding: most likely token per frame, first (only) batch entry
pred_ids = torch.argmax(logits, dim=-1)[0]

print(processor.decode(pred_ids))

4390    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: audio_signal, dtype: object
[{'input_values': tensor([[-7.9619e-05, -7.9619e-05, -7.9619e-05,  ..., -7.9619e-05,
         -7.9619e-05, -7.9619e-05]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}]
sheikh anta diop avenue<unk> dakar


In [None]:
# with word 3-gram
N=3 -> 0.07475978439184439
N=5 -> 0.07030700726505741
N=10 -> 0.06304194984766816
N=50 -> 0.05319896883056011 => 0.118 = 11.8% on the test set

# with word 2-gram
N=3 -> 0.07499414108272791
N=5 -> 0.07077572064682447
N=10 -> 0.06397937661120225
N=20 -> 0.059057886102648234

# with word 1-gram
N=3 -> 0.07710335130067963
N=5 -> 0.07499414108272791
N=10 -> 0.07382235762831028

In [None]:
# with char 5-gram
N=3 -> 0.07780642137333021
N=5 -> 0.07569721115537849 down
N=10 ->0.07405671431919382 down

# with char 4-gram
N=3 -> 0.07921256151863136
N=5 -> 0.0787438481368643 down
N=10 ->0.07850949144598078 down

# with char 3-gram
N=1 -> 0.0824935551910007
N=2 -> 0.08202484180923365 down
N=3 -> 0.08366533864541832 up
N=5 -> 0.08577454886337005 up
N=80 ->0.14811342863838764 up

# with char 2-gram
N=3 -> 0.08600890555425357
N=5 -> 0.09210217951722521 up

# w/o
    -> 0.0824935551910007

### Prediction

In [220]:
## prediction
# load data (dataframe) -> empty entries
df_test = pd.read_feather('drive/MyDrive/Colab Notebooks/data/ASR_test_audio1564.ft')
df_test = df_test[['ID', 'audio_signal']]

df_test.head()

def prepare_dataset(batch):
    """Run one audio signal through the global ``processor``.

    Args:
        batch: the raw audio samples of a single recording (this function is
            applied element-wise to the ``audio_signal`` column).

    Returns:
        Whatever ``processor`` returns when asked for PyTorch tensors at a
        16 kHz sampling rate.
    """
    processor_kwargs = {"sampling_rate": 16*1e3, "return_tensors": "pt"}
    return processor(batch, **processor_kwargs)

In [221]:
#
input_dict = df_test['audio_signal'].apply(prepare_dataset)

In [222]:
# run through processor
import difflib
#input_dict = df_test['audio_signal'].apply(prepare_dataset)
preds = []

nbeams = 50

# softmax over the last (token) axis of the (batch, frame, token) logits
softmax = nn.Softmax(dim=2)

# run through model and decoder
# Inference only: without no_grad() every forward pass records an autograd
# graph, which wastes GPU memory and time for no benefit.
with torch.no_grad():
  for i in range(len(df_test)):
    logits = XLSRmodel(input_dict.values[i].input_values.to('cuda')).logits
    output = softmax(logits) # logits -> probabilities

    # beam search over the single batch entry; indexing with [0] (instead of
    # squeeze) keeps the frame axis even if a clip yields just one frame
    beams_int = beam_search_decoder(output[0].tolist(), top_k = nbeams) # beams

    # decode the beams actually returned (may be fewer than nbeams)
    beams_str = [processor.decode(pred_ids) for pred_ids, pred_prob in beams_int]

    # Rescoring: the candidate with the best language-model score wins.
    # NOTE: the acoustic beam score is not used here, i.e. this is not (yet)
    # P(final) = Alpha * P(model) + Beta * P(L.M.).
    logprob = logprob_sentences(beams_str)
    pred_str = beams_str[np.argmax(logprob)]

    preds.append(pred_str)

    # coarse progress indicator
    if i % 200 == 0:
      print('Sentence '+ str(i))

# save as csv: one row per test recording, in the order of df_test
dfpred = pd.DataFrame({'ID': df_test['ID'].values, 'transcription': preds})
dfpred.to_csv('./drive/MyDrive/Colab Notebooks/predictionsLM_18AprLM.csv', index=False)

In [None]:
from nltk.probability import FreqDist
from wordcloud import WordCloud, ImageColorGenerator

# Flatten all transcriptions into one list of lower-cased tokens
# (split on any whitespace).
words = df['transcription']
allwords = [token for transcription in words for token in transcription.lower().split()]

# histogram: the (up to) 10000 most frequent words as (word, count) pairs,
# unzipped into parallel tuples of words (xv) and counts (yv)
mostcommon_small = FreqDist(allwords).most_common(10000)
xv, yv = zip(*mostcommon_small)