In [1]:
import json
import os
import pickle

import pandas as pd

In [2]:
# pandarallel provides the `parallel_apply` method used further down to clean
# the article text across several worker processes.
from pandarallel import pandarallel
# Must run before any `parallel_apply` call; progress_bar=True requests
# progress bars while the parallel job runs.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [None]:
def normalize_text(text):
    """Strip HTML markup and wiki-dump artifacts from one article body.

    Steps, in order: remove HTML tags (text fragments are joined with a
    space), replace every ``BULLET::::-`` marker and every run of two or more
    ``=`` with a space, collapse runs of spaces into one, collapse runs of
    newlines into one, then drop newlines at the very start and end.

    Args:
        text: Raw article text. Anything that is not a non-empty ``str``
            (``None``, a ``NaN`` from a missing field, ``''``) is treated as
            an empty document.

    Returns:
        The cleaned text, or ``''`` for empty or missing input.
    """
    # Missing values are not valid parser input; a single bad row would
    # otherwise abort the whole parallel job, so short-circuit them here.
    if not isinstance(text, str) or not text:
        return ''

    # Kept function-local: presumably so pandarallel worker processes can
    # resolve these names (see the troubleshooting link printed when
    # pandarallel initializes) -- confirm before hoisting to module level.
    from bs4 import BeautifulSoup
    import re

    # Drop HTML tags, separating the remaining text fragments with a space.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove 'BULLET::::-' list markers.
    text = re.sub(r'BULLET::::-', ' ', text)
    # Remove runs of two or more '=' (a single '=' is kept).
    text = re.sub(r'={2,}', ' ', text)
    # Remove duplicate spaces.
    text = re.sub(r'  +', ' ', text)
    # Collapse two or more consecutive newlines into one.
    text = re.sub(r'\n{2,}', '\n', text)
    # Remove newlines at the end of the text.
    text = re.sub(r'\n+$', '', text)
    # Remove newlines at the beginning of the text.
    text = re.sub(r'^\n+', '', text)
    return text

In [7]:
# Location of the cleaned Wikipedia dump (JSON Lines, one article per line).
# Set the WIKI_JSONL_PATH environment variable to point elsewhere; the default
# is the original hard-coded path, so existing runs behave exactly as before.
WIKI_JSONL_PATH = os.environ.get(
    'WIKI_JSONL_PATH',
    'C:/Users/ngoph/Desktop/zalo_ai/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl',
)
df = pd.read_json(WIKI_JSONL_PATH, lines=True)

In [None]:
# df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Preview the first five rows of the loaded dump.
df.head(5)

In [None]:
# Clean every article body with normalize_text via pandarallel.
# This overwrites df['text'], so the raw text is no longer available in `df`
# once this cell has run.
df['text'] = df['text'].parallel_apply(normalize_text)

In [None]:
# Persist the cleaned frame as JSON Lines (one record per line); the
# "Title matching" section reloads this same file.
# NOTE(review): assumes the data_gen/ directory already exists -- confirm.
df.to_json('data_gen/wikipedia_20220620_cleaned_parsed.jsonl', orient='records', lines=True)

# Title matching

In [3]:
import transformers
# Set the transformers logger to the error level to keep cell output short.
transformers.logging.set_verbosity_error()

In [4]:
from sentence_transformers import SentenceTransformer, util
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a machine without CUDA instead of failing at model load.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Sentence-embedding model used for both title matching and document retrieval.
model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base', device=DEVICE)

No sentence-transformers model found with name C:\Users\ngoph/.cache\torch\sentence_transformers\VoVanPhuc_sup-SimCSE-VietNamese-phobert-base. Creating a new one with MEAN pooling.


In [5]:
# Reload the cleaned articles written by the preprocessing section, so this
# section can start from the saved file. Rebinds `df`.
df = pd.read_json('data_gen/wikipedia_20220620_cleaned_parsed.jsonl', orient='records', lines=True)

In [6]:
# Article titles as a plain list in DataFrame row order; the title search
# below indexes this list with the `corpus_id` of each hit.
titles = df['title'].tolist()
# Show a small sample rather than the whole list.
titles[:5]

['Trang Chính', 'Internet Society', 'Tiếng Việt', 'Ohio', 'California']

In [None]:
# pickle.dump(titles, open('data_gen/titles.pkl', 'wb'))

In [7]:
# Embed every article title. No device is passed explicitly: encode() then
# runs on the device the model was loaded on, instead of hard-coding 'cuda'
# a second time (which fails if the model lives on the CPU).
title_embeddings = model.encode(titles, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/39796 [00:00<?, ?it/s]

In [None]:
# pickle.dump(title_embeddings, open('data_gen/title_embeddings.pkl', 'wb'))

In [None]:
# title_embeddings = pickle.load(open('data_gen/title_embeddings.pkl', 'rb'))

In [None]:
# Ad-hoc check of title matching: embed one query string and list the 20
# best-scoring titles together with their scores.
q_raw = 'huyện Tuy An'
query_embedding = model.encode(q_raw, convert_to_tensor=True)
print(q_raw)
# Index [0] selects the hit list for the single query.
hits = util.semantic_search(query_embedding, title_embeddings, top_k=20)[0]

answers = []
for hit in hits:
    corpus_id, doc_score = hit['corpus_id'], hit['score']
    # corpus_id is a position into `titles`.
    title = titles[corpus_id]
    print(f'{title} - {doc_score}')

# Document Retriever

In [None]:
# Article bodies as a plain list in DataFrame row order; retrieval hits are
# later mapped back to rows with df.iloc[corpus_id].
sentences = df['text'].tolist()

In [None]:
# Number of documents that will be embedded.
len(sentences)

In [None]:
# Cache the document list on disk. The context manager closes (and flushes)
# the file deterministically instead of leaving the handle open.
with open('data_gen/sentences.pkl', 'wb') as f:
    pickle.dump(sentences, f)

In [None]:
# sentences = pickle.load(open('data_gen/sentences.pkl','rb'))

In [None]:
# Embed every article body. No device is passed explicitly: encode() then
# runs on the device the model was loaded on, instead of hard-coding 'cuda'
# a second time (which fails if the model lives on the CPU).
corpus_embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)

In [None]:
# Save corpus_embeddings so the expensive encoding step can be skipped later.
# The context manager closes (and flushes) the file deterministically instead
# of leaving the handle open.
# NOTE(review): this writes 'corpus_embeddings_2.pkl', while the commented-out
# loader in the next cell reads 'corpus_embeddings.pkl' -- confirm which file
# is meant to be used.
with open('data_gen/corpus_embeddings_2.pkl', 'wb') as f:
    pickle.dump(corpus_embeddings, f)

In [None]:
# corpus_embeddings = pickle.load(open('data_gen/corpus_embeddings.pkl','rb'))

In [None]:
# Sanity-check the embedding tensor's shape
# (presumably (n_documents, embedding_dim) -- confirm).
corpus_embeddings.shape

In [None]:
# Read the training file, then flatten the records stored under its top-level
# 'data' key into a DataFrame. The file is only needed for the JSON parse, so
# the flattening happens after it has been closed.
with open('data/zac2022_train_merged_final.json', encoding='utf-8') as f:
    data = json.load(f)
ddf = pd.json_normalize(data, record_path='data')

In [None]:
# Keep only rows that have a non-empty title and are fully annotated.
has_title = ddf['title'] != ''
fully_annotated = ddf['category'] == 'FULL_ANNOTATION'
ddf = ddf[has_title & fully_annotated]

In [None]:
# Ad-hoc check of document retrieval: pick one random training question, embed
# it, and list the titles of the 100 best-scoring articles with their scores.
example = ddf.sample(n=1).iloc[0]
q_raw = example['question']
query_embedding = model.encode(q_raw, convert_to_tensor=True)
# Index [0] selects the hit list for the single query.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=100)[0]
print(q_raw)
for hit in hits:
    corpus_id, doc_score = hit['corpus_id'], hit['score']
    # corpus_id is a row position in `df`.
    title = df['title'].iloc[corpus_id]
    print(f'{title} - {doc_score}')