# **Lab5**

In [None]:
import re
import os
from tqdm import tqdm

In [None]:
def split_into_fragments(text, frag_length=100):
    """Split ``text`` into consecutive, non-overlapping fixed-size fragments.

    Only complete fragments are kept: a trailing remainder shorter than
    ``frag_length`` is dropped.

    Args:
        text: The string to cut up.
        frag_length: Number of characters per fragment (expected positive).

    Returns:
        A list of substrings of ``text``, each exactly ``frag_length``
        characters long, in their original order.
    """
    # The last valid start index is the one that still leaves room for a
    # full fragment; stopping there discards the incomplete tail.
    last_start = len(text) - frag_length
    return [
        text[start:start + frag_length]
        for start in range(0, last_start + 1, frag_length)
    ]

In [None]:
# Sanity check: fragment a single sample post and print the result.
filepath = '/kaggle/input/nlp-lab/20news-bydate/20news-bydate/20news-bydate-train/alt.atheism/51227'
with open(filepath, mode='r', encoding='latin1') as sample_file:
    text = sample_file.read()
print(split_into_fragments(text))

In [None]:
def find_author(text):
    """Extract the value that follows the first ``From:`` in ``text``.

    The capture runs from just after ``From:`` up to (not including) the next
    ``word:`` token, or to the end of the text when there is none. Newlines
    are allowed inside the capture.

    Returns:
        The captured text with surrounding whitespace stripped, or ``None``
        when ``text`` contains no ``From:``.
    """
    found = re.search(r'From:(.*?)(?=\w+:|$)', text, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()

In [None]:

# Sanity check: extract the author from the sample post at `filepath`.
with open(filepath, mode='r', encoding='latin1') as sample_file:
    text = sample_file.read()
print(find_author(text))

In [None]:
def processing_dataset(dataset_path, len=100):
    """Cut every document under ``dataset_path`` into fixed-size fragments.

    The dataset is expected to be laid out as
    ``<dataset_path>/<class directory>/<document file>``. Each document is
    read as latin-1, its whitespace runs are collapsed to single spaces, and
    the result is split with ``split_into_fragments``.

    Args:
        dataset_path: Root directory containing one sub-directory per class.
        len: Fragment length in characters, forwarded to
            ``split_into_fragments``. (The name shadows the builtin but is
            kept for backward compatibility with keyword callers.)

    Returns:
        A ``(all_fragments, all_metadata)`` tuple of equally long lists.
        ``all_metadata[i]`` describes ``all_fragments[i]`` with the keys
        ``'class'`` (class directory name), ``'doc_fragment'``
        (``"<file name>-<fragment index>"``) and ``'author'`` (the result of
        ``find_author`` on the raw document, possibly ``None``).
    """
    # Local alias so the builtin-shadowing parameter name is used only once.
    frag_length = len
    all_fragments = []
    all_metadata = []
    for catalog in tqdm(os.listdir(dataset_path)):
        path_catalog = os.path.join(dataset_path, catalog)
        # Skip stray files at the top level; only directories are classes.
        if not os.path.isdir(path_catalog):
            continue
        for file in os.listdir(path_catalog):
            file_path = os.path.join(path_catalog, file)
            with open(file_path, 'r', encoding='latin1') as document:
                sample_content = document.read()
            # \s+ already covers tabs and newlines, so one pass is enough.
            cleaned_text = re.sub(r'\s+', ' ', sample_content)
            # The author is looked up in the raw text, where the header
            # lines are still separated by newlines.
            author = find_author(sample_content)
            fragments = split_into_fragments(cleaned_text, frag_length)
            for index, fragment in enumerate(fragments):
                all_fragments.append(fragment)
                all_metadata.append({
                    'class': catalog,
                    'doc_fragment': f'{file}-{index}',
                    'author': author,
                })
    return all_fragments, all_metadata

In [None]:
# Build the fragment list and the parallel metadata list from the training split.
data_path = '/kaggle/input/nlp-lab/20news-bydate/20news-bydate/20news-bydate-train'
all_fragments, all_metadata = processing_dataset(data_path)

In [None]:
# Bare expression: the notebook displays the collected per-fragment metadata.
all_metadata

In [None]:
from sentence_transformers import SentenceTransformer

class Embedder():
    """Callable that encodes text with a SentenceTransformer model.

    Calling an instance returns the model's encoding converted to plain
    Python lists via ``.tolist()``.
    """

    def __init__(self):
        model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
        self.model = SentenceTransformer(model_name)

    def __call__(self, input):
        # NOTE(review): the parameter name ``input`` shadows the builtin; it is
        # presumably dictated by Chroma's embedding-function interface, since
        # this object is passed as ``embedding_function`` -- confirm before
        # renaming.
        encoded = self.model.encode(input)
        return encoded.tolist()
    
# Module-level embedder instance.
# NOTE(review): ChromaDB.__init__ constructs its own Embedder(), so the model
# is presumably loaded a second time there -- confirm this instance is needed.
embedder = Embedder()

In [None]:
import chromadb



class ChromaDB():
    """Wrapper around one persistent Chroma collection with explicit embeddings.

    Embeddings are computed by an ``Embedder`` instance and passed to Chroma
    explicitly, both when adding documents and when querying.
    """

    def __init__(self):
        self.embedding_model = Embedder()
        self.client = chromadb.PersistentClient(path='/kaggle/working/ChromaDB')
        self.collection = self.client.get_or_create_collection(
            name="colls",
            embedding_function=self.embedding_model,
        )

    def add_collection(self, all_fragments, all_metadata, batch_size=5000):
        """Embed and store all fragments, ``batch_size`` items per ``add`` call.

        Ids are the positional indices ``"0" .. "N-1"`` as strings.

        Args:
            all_fragments: Sequence of document strings.
            all_metadata: Sequence of metadata dicts, parallel to
                ``all_fragments``.
            batch_size: Maximum number of items sent per ``collection.add``
                call (presumably to stay under Chroma's per-call limit).

        Raises:
            ValueError: If the two sequences differ in length or
                ``batch_size`` is not positive.
        """
        total = len(all_fragments)
        if total != len(all_metadata):
            raise ValueError(
                "all_fragments and all_metadata must have the same length"
            )
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        ids = [str(i) for i in range(total)]
        # range() yields no start offsets past the end, so an empty trailing
        # batch is never sent (the case where total is 0 or an exact multiple
        # of batch_size).
        for start in range(0, total, batch_size):
            stop = start + batch_size
            batch = all_fragments[start:stop]
            self.collection.add(
                documents=batch,
                embeddings=self.embedding_model(batch),
                metadatas=all_metadata[start:stop],
                ids=ids[start:stop],
            )
        print("loaded")

    def search(self, text, count=1):
        """Return the ``count`` stored fragments closest to ``text``.

        Args:
            text: Query text; it is embedded with the same model as the
                stored fragments.
            count: Number of results to request.

        Returns:
            The raw result of ``collection.query``, requested with distances,
            embeddings, documents and metadatas included.
        """
        vector = self.embedding_model(text)
        return self.collection.query(
            query_embeddings=vector,
            n_results=count,
            include=['distances', 'embeddings', 'documents', 'metadatas'],
        )
        

In [None]:
# Open (or create) the persistent "colls" collection; this also constructs an Embedder.
cdb = ChromaDB()

In [None]:
# Embed every fragment and store it with its metadata.
# NOTE(review): ids are the positional indices as strings, so re-running this
# cell against the same persistent store reuses the same ids -- confirm how
# Chroma treats the duplicates.
cdb.add_collection(all_fragments, all_metadata)

In [None]:
# Demo queries as [question text, number of results to retrieve].
questions = [
    ['What ary you thinking about Christ?', 2],
    ['What is your favorite color?', 1],
    ['Where is your favorite car?', 1],
    ['What do you think about religion?', 3],
    ['What is your major?', 4],
    ['where should I go for the weekend?', 1]
]
# Run each query against the collection and print the retrieved fragments.
for query_text, n_results in questions:
    result = cdb.search(query_text, n_results)
    print(f"Вопрос: {query_text}")
    print(f"Ответы: {result['documents']}")
    print()

# **Lab6**

In [None]:
from ctransformers import AutoModelForCausalLM
# Load the Q4_K_M-quantised Mistral-7B-OpenOrca GGUF model through ctransformers.
# gpu_layers is passed straight through (number of layers to offload to the
# GPU, per the ctransformers docs).
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-OpenOrca-GGUF",
    model_file="mistral-7b-openorca.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
)

In [None]:
from evaluate import load
# Load the "bertscore" metric; echo() uses it to score the retrieved fragment.
bertscore = load("bertscore")

In [None]:
def echo(message, history):
    """Answer ``message`` with the LLM, using the closest stored fragment as context.

    Args:
        message: The user's question.
        history: Chat history supplied by the chat UI; unused here.

    Returns:
        The LLM answer followed by BERTScore precision and recall. The scores
        compare the retrieved fragment (as prediction) with ``message`` (as
        reference), i.e. they rate the retrieval, not the generated answer.
        If nothing was retrieved, a short notice is returned instead.
    """
    result = cdb.search(message, 1)
    # ``documents`` holds one list of hits per query; guard against an empty
    # collection instead of failing with IndexError.
    retrieved = result['documents'][0] if result['documents'] else []
    if not retrieved:
        return "No matching fragment was found in the collection."

    # Keep the context and the question on separate lines so the fragment's
    # trailing period does not run straight into "Question:".
    prompt = f"Context: {retrieved[0]}.\nQuestion: {message}"
    answer = llm(prompt)
    metric = bertscore.compute(
        predictions=retrieved,
        references=[message],
        model_type="distilbert-base-uncased",
    )
    return (
        f"{answer}"
        f"\n\nPrecision: {metric['precision'][0]}"
        f"\nRecall: {metric['recall'][0]}"
    )

In [None]:
import gradio as gr

# Example prompts offered in the chat UI.
ex = [
    'What ary you thinking about Christ?',
    'What is your favorite color?',
    'What is your favorite car?',
    'where should I go for the weekend?',
]
demo = gr.ChatInterface(fn=echo, examples=ex, title="Echo Bot")
# share=True asks Gradio for a publicly shareable link in addition to the local one.
demo.launch(share=True)