In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message="`resume_download` is deprecated")

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Load the dataset (semicolon-separated CSV)
data_path = 'renset_fil_1.csv'
data = pd.read_csv(data_path, sep=';')

# Preprocess the dataset.
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the in-place form is a chained assignment that raises a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['Spørsmål_uten_stopwords'] = data['Spørsmål_uten_stopwords'].fillna('manglende')
# Join the 'Svar', 'Les mer her…' and 'Link' columns into one space-separated string per row
data['combined_info'] = data[['Svar', 'Les mer her…', 'Link']].apply(
    lambda x: f"{x['Svar']} {x['Les mer her…']} {x['Link']}", axis=1)

# Set up the pretrained checkpoint: model weights first, then the matching tokenizer
model1_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model1 = AutoModel.from_pretrained(model1_name)
tokenizer = AutoTokenizer.from_pretrained(model1_name)

# Function for generating embeddings
def encode(texts):
    """Embed `texts` by mean-pooling token states over the non-padding positions.

    Uses the module-level `tokenizer` and `model1`.

    Args:
        texts: List of strings to embed.

    Returns:
        A tensor with one row per input text, holding the average of the
        model's `last_hidden_state` over positions whose attention mask is 1.
    """
    encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        token_states = model1(**encoded_input).last_hidden_state
    # Padding positions must not contribute to the mean; otherwise the embedding
    # of a text changes with the amount of padding its batch happens to need
    # (e.g. a query encoded alone vs. the same text encoded inside a large batch).
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    # Guard against division by zero for a row with no unmasked tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


# Precompute one embedding per stored question so each lookup only has to embed the query
question_embeddings = encode(
    data['Spørsmål_uten_stopwords'].tolist()
)

def find_most_similar_question(user_question, k=1):
    """Return the `combined_info` text of the stored question closest to the query.

    Args:
        user_question: The question text to look up.
        k: Number of top matches requested from `torch.topk`; only the best
            match is returned.

    Returns:
        The `combined_info` value of the row whose question embedding has the
        highest cosine similarity to the embedded `user_question`.
    """
    query_vec = encode([user_question])
    similarities = torch.nn.functional.cosine_similarity(query_vec, question_embeddings)
    _, ranked = torch.topk(similarities, k)

    # .item() turns the 0-d index tensor into a plain Python int for positional lookup
    top_row = ranked[0].item()

    return data['combined_info'].iloc[top_row]

# Persist the model and the tokenizer to local directories
model1_path, tokenizer_path = './saved_model_L12', './saved_tokenizer_L12'
model1.save_pretrained(model1_path)
# Kept as the cell's final expression so its return value is displayed as the cell output
tokenizer.save_pretrained(tokenizer_path)


('./saved_tokenizer_L12\\tokenizer_config.json',
 './saved_tokenizer_L12\\special_tokens_map.json',
 './saved_tokenizer_L12\\unigram.json',
 './saved_tokenizer_L12\\added_tokens.json',
 './saved_tokenizer_L12\\tokenizer.json')

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the dataset (semicolon-separated CSV)
data_path = 'renset_fil_1.csv'
data = pd.read_csv(data_path, sep=';')

# Preprocess the dataset.
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the in-place form is a chained assignment that raises a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['Spørsmål_uten_stopwords'] = data['Spørsmål_uten_stopwords'].fillna('manglende')
# Join the 'Svar', 'Les mer her…' and 'Link' columns into one space-separated string per row
data['combined_info'] = data[['Svar', 'Les mer her…', 'Link']].apply(
    lambda x: f"{x['Svar']} {x['Les mer her…']} {x['Link']}", axis=1)

# Set up the pretrained checkpoint and place the model on the selected device
model2_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
tokenizer = AutoTokenizer.from_pretrained(model2_name)
model2 = AutoModel.from_pretrained(model2_name)
model2 = model2.to(device)

# Function for generating embeddings with batch processing
def batch_encode(texts, batch_size=32):
    """Embed `texts` in batches, mean-pooling token states over non-padding positions.

    Uses the module-level `tokenizer`, `model2` and `device`.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts tokenized and run through the model at once.

    Returns:
        A CPU tensor with one row per input text, in input order.
    """
    model2.eval()
    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encoded_input = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            token_states = model2(**encoded_input).last_hidden_state
        # Average only over real tokens. Including padding positions would make
        # an embedding depend on the other texts in its batch and on batch_size,
        # and would pool the single-text query differently from the padded corpus.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_states.dtype)
        # clamp guards against division by zero for a row with no unmasked tokens
        batch_embeddings = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        # Move each batch to the CPU so results from all batches can be concatenated there
        all_embeddings.append(batch_embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)

# Precompute embeddings for every stored question
question_embeddings = batch_encode(
    data['Spørsmål_uten_stopwords'].tolist()
)

# Find the most similar question
def find_most_similar_question(user_question, k=1):
    """Return the `combined_info` text of the stored question closest to the query.

    Args:
        user_question: The question text to look up.
        k: Number of top matches requested from `torch.topk`; only the best
            match is returned.

    Returns:
        The `combined_info` value of the row whose question embedding has the
        highest cosine similarity to the embedded `user_question`.
    """
    query_vec = batch_encode([user_question])
    similarities = torch.nn.functional.cosine_similarity(query_vec, question_embeddings)
    _, ranked = torch.topk(similarities, k)

    # .item() turns the 0-d index tensor into a plain Python int for positional lookup
    top_row = ranked[0].item()
    return data['combined_info'].iloc[top_row]

# Persist the model and the tokenizer to local directories for later use
model2_path, tokenizer_path = (
    './saved_model_paraphrase_xlm_r_multilingual_v1',
    './saved_tokenizer_paraphrase_xlm_r_multilingual_v1',
)
model2.save_pretrained(model2_path)
# Kept as the cell's final expression so its return value is displayed as the cell output
tokenizer.save_pretrained(tokenizer_path)

('./saved_tokenizer_paraphrase_xlm_r_multilingual_v1\\tokenizer_config.json',
 './saved_tokenizer_paraphrase_xlm_r_multilingual_v1\\special_tokens_map.json',
 './saved_tokenizer_paraphrase_xlm_r_multilingual_v1\\sentencepiece.bpe.model',
 './saved_tokenizer_paraphrase_xlm_r_multilingual_v1\\added_tokens.json',
 './saved_tokenizer_paraphrase_xlm_r_multilingual_v1\\tokenizer.json')

In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Load the dataset and build the combined answer text, as in the earlier cells
data_path = 'renset_fil_1.csv'
data = pd.read_csv(data_path, sep=';')
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the in-place form is a chained assignment that raises a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['Spørsmål_uten_stopwords'] = data['Spørsmål_uten_stopwords'].fillna('manglende')
# Join the 'Svar', 'Les mer her…' and 'Link' columns into one space-separated string per row
data['combined_info'] = data[['Svar', 'Les mer her…', 'Link']].apply(
    lambda x: f"{x['Svar']} {x['Les mer her…']} {x['Link']}", axis=1)

# Reload both models and their tokenizers from the local save directories
model1_path = './saved_model_L12'
tokenizer1_path = './saved_tokenizer_L12'
model2_path = './saved_model_paraphrase_xlm_r_multilingual_v1'
tokenizer2_path = './saved_tokenizer_paraphrase_xlm_r_multilingual_v1'

tokenizer1 = AutoTokenizer.from_pretrained(tokenizer1_path)
model1 = AutoModel.from_pretrained(model1_path)

tokenizer2 = AutoTokenizer.from_pretrained(tokenizer2_path)
model2 = AutoModel.from_pretrained(model2_path)

# Embedding generation for an explicitly supplied model/tokenizer pair
def encode(model, tokenizer, texts):
    """Embed `texts` with `model`, mean-pooling token states over non-padding positions.

    Args:
        model: Model whose output exposes `last_hidden_state`; it is switched
            to eval mode and its `device` attribute decides where inputs go.
        tokenizer: Tokenizer matching `model`.
        texts: List of strings to embed.

    Returns:
        A tensor with one row per input text, holding the average of the
        model's `last_hidden_state` over positions whose attention mask is 1.
    """
    model.eval()
    device = model.device  # Ensures the inputs are placed on the same device as the model
    with torch.no_grad():
        encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
        token_states = model(**encoded_input).last_hidden_state
        # Padding positions must not contribute to the mean; otherwise a query
        # encoded alone is pooled differently from the padded question batch.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # Guard against division by zero for a row with no unmasked tokens
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

# Embed the stored questions once per model; each model gets its own embedding matrix
question_embeddings1 = encode(
    model1, tokenizer1, data['Spørsmål_uten_stopwords'].tolist()
)
question_embeddings2 = encode(
    model2, tokenizer2, data['Spørsmål_uten_stopwords'].tolist()
)

# Model-agnostic variant of find_most_similar_question
def find_most_similar_question(model, tokenizer, embeddings, user_question, k=1):
    """Return the `combined_info` text of the stored question closest to the query.

    Args:
        model: Model passed through to `encode` for embedding the query.
        tokenizer: Tokenizer passed through to `encode`.
        embeddings: Precomputed question embeddings to compare against.
        user_question: The question text to look up.
        k: Number of top matches requested from `torch.topk`; only the best
            match is returned.

    Returns:
        The `combined_info` value of the row whose embedding has the highest
        cosine similarity to the embedded `user_question`.
    """
    query_vec = encode(model, tokenizer, [user_question])
    similarities = torch.nn.functional.cosine_similarity(query_vec, embeddings)
    _, ranked = torch.topk(similarities, k)
    # .item() turns the 0-d index tensor into a plain Python int for positional lookup
    top_row = ranked[0].item()
    return data['combined_info'].iloc[top_row]

def compare_answers(user_question):
    """Answer `user_question` with both models and let the user arbitrate on disagreement.

    If both models select the same answer it is returned directly. Otherwise
    the first model's answer is printed and the user is asked whether it is
    what they need; replying 'nei' returns the second model's answer, any
    other reply returns a closing message.

    Args:
        user_question: The question text to look up.

    Returns:
        The agreed answer, the second model's answer, or the closing message.
    """
    answer1 = find_most_similar_question(model1, tokenizer1, question_embeddings1, user_question)
    answer2 = find_most_similar_question(model2, tokenizer2, question_embeddings2, user_question)

    if answer1 == answer2:
        return answer1

    print(answer1)
    user_response = input("Er dette svaret du trenger (ja/nei)? ")
    # Strip surrounding whitespace before comparing: without it a reply such as
    # "nei " is treated as acceptance and the alternative answer is never shown.
    if user_response.strip().lower() == 'nei':
        return answer2
    return "Så bra, ha en fin dag!"


In [15]:
# Usage example: ask one question and print whichever answer was selected
user_question = "Hvor mange egenmeldingsdager kan jeg bruke? "
final_answer = compare_answers(user_question)
print(f"Selected answer: {final_answer}")


Selected answer: Du kan bruke egenmelding:-Når du har arbeidet i Husbanken i minst to måneder.-Når du er borte fra jobben i minst én hel dag.-I opp til 8 kalenderdager i løpet av 16 kalenderdager.-I til sammen 24 kalenderdager i løpet av en 12 månedersperiode. Les mer her… https://husnettet-d/HR-Portalen/For-ansatte/Sider/Sykdom
