### Setup

In [1]:
## path
path = 'drive/MyDrive/Colab Notebooks/'

In [2]:
%%capture
!pip install datasets # to use
!pip install git+https://github.com/huggingface/transformers # to user huggingface transformer
!pip install jiwer # for wer metric

!pip install -U pip
!pip install -U dill
!pip install -U nltk==3.4

In [3]:
## load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import os.path

import torch

import torch.nn as nn
import torch.nn.functional as F

from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

from datasets import load_metric

import librosa as lb

from sklearn.model_selection import train_test_split

from nltk.util import pad_sequence
from nltk.util import ngrams, bigrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [4]:
# seeding
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed_all(10)

In [5]:
## mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data

In [6]:
## read into memory (small)
df = pd.read_feather('drive/MyDrive/Colab Notebooks/data/ASR_train_audio6683.ft')

In [7]:
## train valid split
df_train, df_valid = train_test_split(df, test_size=0.2)

### XLSR Model

In [8]:
##
# load XLSR model
if not 'XLSRmodel' in globals():
  print('Load model')
  XLSRmodel = Wav2Vec2ForCTC.from_pretrained('./drive/MyDrive/Colab Notebooks/model/wav2vec2-large-xlsr-french-test/checkpoint-750/').to("cuda")

# load processor
if not 'processor' in globals():
  print('Load processor')
  tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
  feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
  processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# prepare dataset
def prepare_dataset(batch):
    return processor(batch, return_tensors="pt", sampling_rate=16*1e3)

# word error rate
wer_metric = load_metric("wer")
wer_ = []

#
input_dict = df_valid['audio_signal'].apply(prepare_dataset)

for idx in range(len(df_valid)):
  #print('-----------------')
  logits = XLSRmodel(input_dict.values[idx].input_values.to("cuda")).logits
  pred_ids = torch.argmax(logits, dim=-1)[0]

  #print("Prediction:")
  pred_str = processor.decode(pred_ids)
  #print(pred_str)

  #print("\nReference:")
  label_str = df_valid["transcription"].values[idx].lower()
  #print(label_str)

  # need same length for wer_metric
  label_str = label_str.ljust(len(pred_str))
  pred_str = pred_str.ljust(len(label_str))
  wer_.append(wer_metric.compute(predictions=[pred_str], references=[label_str]))

print(np.mean(wer_))

Load model
Load processor


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=460.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=378.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=85.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1764.0, style=ProgressStyle(description…


0.10110470016977122


### Language Model

In [9]:
# to be considered
# model produces output of the form (CTC)
# <pad> <pad> <pad> <pad> <pad> <pad> r <pad> <pad> o u u <pad> <pad> t <pad> <pad> e <pad>
# => do we perform beam search on this sequence or first clean up the <pad> tokens?

#### Beam Search

In [10]:
# Beam Search
# https://towardsdatascience.com/boosting-your-sequence-generation-performance-with-beam-search-language-model-decoding-74ee64de435a

import math

def beam_search_decoder(predictions, top_k = 3):
    #start with an empty sequence with zero score
    output_sequences = [([], 0)]
    
    #looping through all the predictions
    for token_probs in predictions:
        new_sequences = []
        
        #append new tokens to old sequences and re-score
        for old_seq, old_score in output_sequences:
            for char_index in range(len(token_probs)):
                new_seq = old_seq + [char_index]
                #considering log-likelihood for scoring
                new_score = old_score + math.log(token_probs[char_index])
                new_sequences.append((new_seq, new_score))
                
        #sort all new sequences in the de-creasing order of their score
        output_sequences = sorted(new_sequences, key = lambda val: val[1], reverse = True)
        
        #select top-k based on score 
        # *Note- best sequence is with the highest score
        output_sequences = output_sequences[:top_k]
        
    return output_sequences

#### Language Model

In [11]:
# need to find optimal n of n-gram

## Language Model (n-gram vs KenLM)
# https://surfertas.github.io/deeplearning/pytorch/2017/08/20/n-gram.html # pytorch code (NN parametrization of LM)
# https://web.stanford.edu/~jurafsky/slp3/slides/LM_4.pdf
# https://www.kaggle.com/alvations/n-gram-language-model-with-nltk # code taken from here
# https://web.stanford.edu/~jurafsky/slp3/old_oct19/3.pdf -> improvements to LM's

df_lm = df_train[:5]

from nltk.util import ngrams, bigrams
from nltk.lm.preprocessing import padded_everygram_pipeline

from nltk.lm import MLE

# padding
from nltk.lm.preprocessing import pad_both_ends
#print(list(pad_both_ends(df_lm['transcription'].values[0], n=2)))
#print(list(bigrams(pad_both_ends(df_lm['transcription'].values[0], n=2))))

'''
# materialize
for ngramlize_sent in train_data:
    print(list(ngramlize_sent))
    print()
print('#############')
list(padded_sents)
'''

"\n# materialize\nfor ngramlize_sent in train_data:\n    print(list(ngramlize_sent))\n    print()\nprint('#############')\nlist(padded_sents)\n"

In [12]:
def train_ngram(LMmodel, data):
  '''
  input: model, list of sentences
  output: trained model
  '''

  # one long string of words
  word_string = ' '.join(data)

  # one long list of words
  word_list = word_string.split(' ')

  # lower casing
  word_list_lower = [list(map(str.lower, [word]))[0]
                     for word in word_list]

  # preprocess for language model
  train_data, padded_words = padded_everygram_pipeline(3, word_list_lower)
  
  # fit model
  LMmodel.fit(train_data, padded_words)

  return LMmodel

In [13]:
## language model
# IMPORTANT there seems to be a missmatch in the vocabulary (44 vs 49 chars)
LMmodel = MLE(3) # Lets train a 2-gram model
LMmodel = train_ngram(LMmodel, df_train['transcription'].values)
print(LMmodel.vocab)
print(len(tokenizer.get_vocab()))

'''
print(LMmodel.counts['c'])
print(LMmodel.counts[['c']]['o'])  # P('o'|'c')
print(LMmodel.score('o', ['c']))
print(LMmodel.vocab.lookup([char for char in test_lower[5]]))
'''

<Vocabulary with cutoff=1 unk_label='<UNK>' and 44 items>
49


"\nprint(LMmodel.counts['c'])\nprint(LMmodel.counts[['c']]['o'])  # P('o'|'c')\nprint(LMmodel.score('o', ['c']))\nprint(LMmodel.vocab.lookup([char for char in test_lower[5]]))\n"

In [14]:
# voabulary of language model (extracted from data)
print([ch for ch in LMmodel.vocab])

['<s>', 's', 'o', 'n', 'a', 't', 'e', 'l', '</s>', 'd', 'h', 'm', 'i', 'c', 'r', 'u', 'f', 'q', 'b', 'k', 'p', 'y', 'é', 'g', 'è', 'v', 'x', "'", 'j', 'w', 'z', '-', '’', 'ë', ',', '(', ')', 'ô', 'î', 'â', 'œ', 'ç', '"', '<UNK>']


In [15]:
# vocabulary used by tokenizer (french alphabet)
vocab_dict = {v: k for k, v in enumerate(tokenizer.get_vocab())}
print(vocab_dict.keys())

dict_keys(['<pad>', '<s>', '</s>', '<unk>', '|', 'E', 'S', 'A', 'I', 'T', 'N', 'R', 'U', 'L', 'O', 'D', 'M', 'C', 'P', 'É', 'V', "'", 'Q', 'F', 'G', 'B', 'H', 'J', 'À', 'X', 'È', 'Y', '-', 'Ê', 'Z', 'Â', 'Ç', 'Î', 'Ô', 'Û', 'Ù', 'K', 'Œ', 'Ï', 'W', 'Ë', 'Ü', 'Æ', 'Ÿ'])


#### Pipeline

In [16]:
# TODO: how to solve the decreasing probability of longer words?
def ngram_logprobability(sentence):
    log_prob = 0
    count = 0
    for words in sentence:
      for ngram in words:
        log_prob += max([np.log(LMmodel.score(ngram[2], [ngram[0], ngram[1]])), -1e3]) # to avoid log(0) for unknown chars
        count += 1
    return np.power(np.exp(log_prob), 1/count) # (inverse) perplexity to account for different word/ sentence length

In [17]:
def logprob_sentences(sentences):
  '''
  input: list of sentences
  output: log probability for sentences
  '''
  # list with log probabilities
  log_probs = len(sentences)* [-np.infty]

  #
  list_sentences = [sentence.split(' ') for sentence in sentences]

  # lower casing
  for k in range(len(list_sentences)):
    list_sentences[k] = [list(map(str.lower, [sent]))[0]
                        for sent in list_sentences[k]]

  # list(sentence_list(word_list(ngrams)))
  list_ngrams = [[list(ngrams(pad_both_ends(word, n=3), n=3)) for word in sentence] for sentence in list_sentences]

  for k, sentence in enumerate(list_ngrams):
    log_probs[k] = ngram_logprobability(sentence)

  return log_probs

In [18]:
df_lm['transcription'].values

array(['Sonatel des HLM', 'Thiossane', 'Ecole Castor Rufisque',
       'Terrain Basket', 'Pharmacie Baye Niasse'], dtype=object)

In [19]:
LMmodel.score('n', ['s', 'o'])

0.2956521739130435

In [20]:
logprob_sentences(['terrrain', 'terrain', 'terain'])

  import sys


[0.0, 0.2517159068513733, 0.0]

In [21]:
#
input_dict = df_valid['audio_signal'].apply(prepare_dataset)

In [22]:
# ATTENTION: XLSR model seems to be overconfident -> places all probability mass on one logit
## Predictions
nbeams = 10

# word error rate
wer_ = []

#
softmax = nn.Softmax(dim=2)

for idx in range(len(df_valid)):
  #print('-----------------')
  logits = XLSRmodel(input_dict.values[idx].input_values.to("cuda")).logits
  output = softmax(logits) # logits -> probabilities

  # beam search
  beams_int = beam_search_decoder(torch.squeeze(output).tolist(), top_k = nbeams) # beams
  beams_str = nbeams*['']

  for k in range(nbeams):
    pred_ids, pred_prob = beams_int[k]
    beams_str[k] = processor.decode(pred_ids)

  # prediction P(final) = Alpha * P(model) + Beta * P(L.M.)
  logprob = logprob_sentences(beams_str)
  pred_str = beams_str[np.argmax(logprob)]

  #print("\nReference:")
  label_str = df_valid["transcription"].values[idx].lower()

  # need same length for wer_metric
  label_str = label_str.ljust(len(pred_str))
  pred_str = pred_str.ljust(len(label_str))

  wer_.append(wer_metric.compute(predictions=[pred_str], references=[label_str]))

print(np.mean(wer_))

  import sys


0.10848054754187887


### Prediction

In [23]:
## prediction
# load data (dataframe) -> empty entries
df_test = pd.read_feather('drive/MyDrive/Colab Notebooks/data/ASR_test_audio1564.ft')
df_test = df_test[['ID', 'audio_signal']]

df_test.head()

def prepare_dataset(batch):
    return processor(batch, return_tensors="pt", sampling_rate=16*1e3)

In [38]:
#
input_dict = df_test['audio_signal'].apply(prepare_dataset)

In [1]:
# run through processor
import difflib
#input_dict = df_test['audio_signal'].apply(prepare_dataset)
preds = []


nbeams = 10

# word error rate
wer_ = []

#
softmax = nn.Softmax(dim=2)

# run through model and decoder
for i in range(len(df_test)):
  logits = XLSRmodel(input_dict.values[i].input_values.to('cuda')).logits
  output = softmax(logits) # logits -> probabilities
  
  # beam search
  beams_int = beam_search_decoder(torch.squeeze(output).tolist(), top_k = nbeams) # beams
  beams_str = nbeams*['']

  print(beams_str)
  if i == 10:
    break

  for k in range(nbeams):
    pred_ids, pred_prob = beams_int[k]
    beams_str[k] = processor.decode(pred_ids)

  # prediction P(final) = Alpha * P(model) + Beta * P(L.M.)
  logprob = logprob_sentences(beams_str)
  pred_str = beams_str[np.argmax(logprob)]

  preds.append(pred_str)
  '''
  print('------')
  print(pred_str)

  print([difflib.get_close_matches(word, xv, n=3) for word in processor.decode(pred_ids).split(' ')])

  if i == 10:
    break
  '''

  '''
  pred = []
  for word in processor.decode(pred_ids[0]).split(' '):
    sim = difflib.get_close_matches(word, xv, n=3)
    if sim == []:
      sim = word
    else:
      sim = sim[0]
        
    pred.append(sim)
  
  preds.append(' '.join(pred))
  '''


# save as csv
dfpred = pd.DataFrame(list(zip(list(df_test['ID'].values), preds)), columns=['ID', 'transcription'])
dfpred.to_csv('./drive/MyDrive/Colab Notebooks/predictionsLM_10Apr.csv', index=False)

NameError: ignored

In [26]:
from nltk.probability import FreqDist
from wordcloud import WordCloud, ImageColorGenerator

#
words = df['transcription']
allwords = []

for wordlist in words:
  allwords += list(wordlist.lower().split())

# histogram
mostcommon_small = FreqDist(allwords).most_common(10000)
xv, yv = zip(*mostcommon_small)