### Advanced Topics in Machine Learning
#### Assignment 2
- Joona Kareinen

#### Data loading

In [1]:
# IPython shell escape: runs app.py with whichever `python` is on the shell PATH.
# NOTE(review): if app.py starts a long-running process this cell will not return — confirm.
!python app.py

In [None]:
import librosa

# Path of the input audio file, relative to the notebook's working directory.
# NOTE(review): `file_path` is reassigned by the data-loading cell further down.
file_path = 'static/audio/input_audio-id-1.wav'
# Request the waveform at a 16 kHz sampling rate; `sr` receives the rate reported by librosa.
audio, sr = librosa.load(file_path, sr=16000)
# Bare expression: shows the loaded samples as the cell output.
audio

In [None]:
import json
import numpy as np

# Open the data file (the layout read below is data -> paragraphs -> qas -> answers)
file_path = './data/train-v2.0.json'
with open(file_path, 'rb') as f:
    # Load the data
    data_dict = json.load(f)

# unique_contexts: one entry per passage, in file order (not de-duplicated).
# pairs:           one [question, answer] entry per answer of every question.
# contexts:        the passage each entry of `pairs` belongs to (same index).
unique_contexts = []
contexts = []
pairs = []
for category in data_dict["data"]:
    for passage in category["paragraphs"]:
        context = passage["context"]
        unique_contexts.append(context)
        for qa in passage["qas"]:
            question = qa["question"]
            for answer in qa["answers"]:
                pairs.append([question, answer])
                contexts.append(context)

# `contexts` must stay index-aligned with `pairs`.
assert len(contexts) == len(pairs), "contexts and pairs are out of sync"

# Print some data.
# Categories are the top-level entries of the file; passages are counted
# separately (the category count used to report the number of passages).
num_titles = len(data_dict["data"])
num_passages = len(unique_contexts)
print(
    f"In the dataset there are {num_titles} different categories and "
    f"{num_passages} passages with total of {len(pairs)} question/answer pairs."
)

# Test that the data was loaded correctly
print(np.array(pairs[10:15]))


#### Normalize the data, and create pairs and sentences arrays for training and word2vec

In [None]:
import re
from pandas.core.common import flatten
import unicodedata

def unicodeToAscii(s):
    """Return `s` with combining marks removed.

    The string is decomposed with Unicode NFD normalisation and every
    character in category 'Mn' (non-spacing mark) is dropped, so accented
    letters collapse to their base letter. Characters that have no such
    decomposition are returned unchanged, even when they are not ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Lowercase, strip accents, drop e-mail addresses and split into word tokens
def normalizeString(s, is_answer):
    """Normalise a raw string and return it as a list of tokens.

    Steps, in order: lowercase, remove combining marks via `unicodeToAscii`,
    delete anything that looks like an e-mail address, replace every
    non-word character with a space, lowercase again and split on whitespace.

    Args:
        s: The raw text.
        is_answer: Accepted for the callers' benefit but not used here.

    Returns:
        list[str]: The tokens; empty when nothing but separators remains.
    """
    # Lowercase
    s = s.lower()
    s = unicodeToAscii(s)
    # Do some pruning to the data.
    # Raw strings: '\.' and '\W' are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    s = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', s)
    s = re.sub(r'\W', ' ', s).lower().split()

    return s

# tokenized_pairs holds [question_tokens, answer_tokens] entries;
# tokenized_sentences holds the same token lists flattened one after another.
tokenized_pairs = []
tokenized_sentences = []
for question_text, answer in pairs:
    question_tokens = normalizeString(question_text, 0)
    answer_tokens = normalizeString(answer["text"], 1)
    # Keep a pair only when both sides have at least two tokens
    if len(question_tokens) <= 1 or len(answer_tokens) <= 1:
        continue
    tokenized_pairs.append([question_tokens, answer_tokens])
    tokenized_sentences.extend((question_tokens, answer_tokens))

# Show the first few entries of each list as a sanity check
for tokens in tokenized_sentences[:10]:
    print(tokens)

for token_pair in tokenized_pairs[:10]:
    print(token_pair)

#### Count words and plot the sentence lengths

In [None]:
import matplotlib.pyplot as plt

from collections import Counter

# Length (in tokens) of every tokenised sentence
sen_len = [len(sentence) for sentence in tokenized_sentences]

# Occurrences of every token over all sentences (plain dict, first-seen order)
word2count = dict(Counter(word for sentence in tokenized_sentences for word in sentence))

# Same counts ordered from the most to the least frequent word
sorted_word2vec = dict(sorted(word2count.items(), key=lambda item: item[1], reverse=True))

fig, (ax_len, ax_freq) = plt.subplots(1, 2, figsize=(12, 6))

ax_len.hist(sen_len)
ax_len.set_title("Histogram of the sentence lengths")
ax_len.set_xlabel("Sentence length")
ax_len.set_ylabel("Total amount")

# Distribution of per-word occurrence counts; log scale on the y axis because
# of log=True. The plot is a histogram of counts, not an average.
ax_freq.hist(list(sorted_word2vec.values()), bins=50, log=True)
ax_freq.set_title("Histogram of the word counts")
ax_freq.set_xlabel("word count")
ax_freq.set_ylabel("Total amount")

plt.show()

#### Train word2vec

In [None]:
from gensim.models.word2vec import Word2Vec


# Train 30-dimensional word vectors on the tokenised sentences; words seen
# fewer than 5 times are ignored and the context window is 10 tokens.
word2vec_model = Word2Vec(tokenized_sentences, vector_size=30, min_count=5, window=10)
# `model` is kept for the cells that still refer to it, but a later cell
# rebinds `model` to the T5 network; `word2vec_model` keeps the embeddings reachable.
model = word2vec_model

In [None]:
# Number of entries in the trained model's keyed vectors.
# NOTE(review): a later cell rebinds `model` to a T5 model, so this presumably
# fails when re-run after that cell — confirm.
len(model.wv)


In [None]:
import json 


# Report how many passages were collected into `unique_contexts`
print(f"Retrieved {len(unique_contexts)} passages")

In [None]:
# Show the second collected passage (index 1) as the cell output
unique_contexts[1]

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder

# Sentence-embedding model: used below to encode the passages and, inside
# qa_pipeline, the question.
semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross-encoder: used inside qa_pipeline to score (question, passage) pairs
# when re-ranking the retrieved candidates.
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
import os
import pickle

# Path of the pickled passage-embeddings cache
embeddings_cache_path = './qa_embeddings_cache.pkl'

corpus_embeddings = None

# Load cache if available
if os.path.exists(embeddings_cache_path):
    print('Loading embeddings cache')
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cache file that this notebook wrote itself.
    with open(embeddings_cache_path, 'rb') as f:
        corpus_embeddings = pickle.load(f)
    # A cache built from a different set of passages would silently map
    # search hits to the wrong texts, so discard it when the sizes differ.
    if len(corpus_embeddings) != len(unique_contexts):
        print('Embeddings cache does not match the current passages, recomputing')
        corpus_embeddings = None

# Else compute embeddings
if corpus_embeddings is None:
    print('Computing embeddings')
    corpus_embeddings = semb_model.encode(unique_contexts, convert_to_tensor=True, show_progress_bar=True)
    # Save the embeddings to a file for future loading
    print(f'Saving embeddings cache to: \'{embeddings_cache_path}\'')
    with open(embeddings_cache_path, 'wb') as f:
        # Store a CPU copy so the cache can also be loaded on a machine
        # without the device the embeddings were computed on.
        pickle.dump(corpus_embeddings.cpu(), f)

In [None]:
import os
import hnswlib
import time
start = time.time()
# Create empty index (cosine space, one dimension per embedding component)
index = hnswlib.Index(space='cosine', dim=corpus_embeddings.size(1))

# Define hnswlib index path
index_path = './qa_hnswlib_100.index'

# Load index if available
if os.path.exists(index_path):
    print('Loading index...')
    index.load_index(index_path)
# Else index data collection
else:
    # Initialise the index
    print('Start creating HNSWLIB index')
    index.init_index(max_elements=corpus_embeddings.size(0), ef_construction=100, M=64) # see https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md for parameter description
    # Compute the HNSWLIB index (it may take a while); item ids are the row
    # positions of the embeddings, i.e. indices into unique_contexts
    index.add_items(corpus_embeddings.cpu(), list(range(len(corpus_embeddings))))
    # Save the index to a file for future loading
    print(f'Saving index to: {index_path}')
    index.save_index(index_path)

end = time.time()
# Zero-pad the seconds so that e.g. 65 s reads "1:05" rather than "1:5"
minutes, seconds = divmod(int(end - start), 60)
print(f"Execution time: {minutes}:{seconds:02d} min:sec")

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Half precision is only requested when a GPU is present; on CPU the model is
# loaded in float32, because float16 inference on CPU is unsupported or very
# slow depending on the PyTorch version.
model_dtype = torch.float16 if device.type == 'cuda' else torch.float32

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
# NOTE: this rebinds `model`, which an earlier cell uses for the Word2Vec model.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=model_dtype)

In [None]:
# Quick check that the generative model and its tokenizer work end to end
input_text = 'Translate the following sentence from Italian to English: "Amo la pizza"'

input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

output_ids = model.generate(input_ids, max_new_tokens=32)
# Drop special tokens so only the generated text is printed, matching how
# qa_pipeline decodes its output.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)

In [None]:
def qa_pipeline(
    question,
    similarity_model=semb_model,
    embeddings_index=index,
    re_ranking_model=xenc_model,
    generative_model=model,
    device=device,
    top_k=64,
    max_new_tokens=512,
    verbose=True,
):
    """Answer `question` by retrieving a passage, re-ranking and generating.

    The question is embedded, the `top_k` nearest passages are fetched from
    the index, the candidates are re-scored against the question and the
    best-scoring passage is put into a prompt for the generative model.
    Passage texts are read from the module-level `unique_contexts` and text is
    tokenised/decoded with the module-level `tokenizer`.

    Args:
        question: The question text; a '?' is appended when it is missing.
        similarity_model: Model whose `encode` embeds the question.
        embeddings_index: Index whose `knn_query` returns passage ids.
        re_ranking_model: Model whose `predict` scores (question, passage) pairs.
        generative_model: Model whose `generate` produces the answer ids.
        device: Device the prompt ids are moved to before generation.
        top_k: Number of candidate passages fetched from the index.
        max_new_tokens: Upper bound on the generated answer length.
        verbose: When true, print the prompt sent to the generative model.

    Returns:
        str: The decoded answer with special tokens removed.
    """
    # Trim first so trailing whitespace cannot hide an existing '?'
    question = question.strip()
    if not question.endswith('?'):
        question = question + '?'
    # Embed question
    question_embedding = similarity_model.encode(question, convert_to_tensor=True)
    # Search documents similar to question in index
    corpus_ids, _ = embeddings_index.knn_query(question_embedding.cpu(), k=top_k)
    candidate_ids = corpus_ids[0]
    # Re-rank results
    xenc_model_inputs = [(question, unique_contexts[idx]) for idx in candidate_ids]
    cross_scores = re_ranking_model.predict(xenc_model_inputs)
    # Get best matching passage (only the top score is needed, so no full sort)
    passage_idx = int(np.argmax(cross_scores))
    passage = unique_contexts[candidate_ids[passage_idx]]
    # Encode input
    input_text = f"Given the following passage, answer the related question.\n\nPassage:\n\n{passage}\n\nQ: {question}"
    if verbose:
        print('INPUT TEXT:', input_text, "\n")
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    # Generate output
    output_ids = generative_model.generate(input_ids, max_new_tokens=max_new_tokens)
    # Decode output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Return result
    return output_text

In [None]:
# NOTE(review): input() waits for keyboard input, so this cell pauses an
# unattended "Run All" until a question is typed.
question = input("Ask a question >>> ")  # e.g., "How many fingers in a hand?", "What is the oldest newspaper in Chile?", ...
print()

# Run the retrieval + generation pipeline and print the generated answer
print(qa_pipeline(question))