In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM
from nltk.translate.meteor_score import meteor_score
import nltk
from rouge_score import rouge_scorer
import rouge_score
import sqlite3
from contextlib import contextmanager
from DataMigration.database_functions import create_tables, log_to_sqlite


# Ensure the NLTK resources used by this notebook are available locally.
# Every call passes quiet=True; presumably this suppresses the downloader's
# progress output — confirm against the NLTK documentation.
# NOTE(review): these look like the WordNet / tokenizer resources required by
# nltk's meteor_score (imported above) — confirm which ones are actually needed.
# The last call is the cell's trailing expression, so its return value is what
# the notebook displays as the cell output.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)



True

Qwen2.5-0.5B-Instruct

In [None]:
# --- Load the Qwen2.5-0.5B-Instruct checkpoint and its tokenizer ---
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
Qwen_25_tokenizer = AutoTokenizer.from_pretrained(model_name)
Qwen_25 = AutoModelForCausalLM.from_pretrained(model_name)

# --- Select the compute device: CUDA when torch reports it, otherwise CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# --- Place the model on the selected device ---
Qwen_25 = Qwen_25.to(device)
# Keep the model in train mode so dropout layers stay active while generating.
# This stays the last expression of the cell, so the notebook displays the model.
Qwen_25.train()


In [None]:
def process_inputs(text, dropout_prob=0, num_samples=1):
    """Sample responses from the global ``Qwen_25`` model and log per-sample statistics.

    Relies on the notebook-level globals ``Qwen_25``, ``Qwen_25_tokenizer``,
    ``device`` and ``log_to_sqlite``.

    Args:
        text: Prompt string handed to ``Qwen_25_tokenizer``.
        dropout_prob: Value written to ``p`` on every ``torch.nn.Dropout``
            module found in ``Qwen_25`` before sampling.
        num_samples: Number of independent ``generate`` calls to run.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        ``original_text``, ``generated_response``, ``embeddings`` (input
        embeddings of the prompt, the same array on every row),
        ``token_entropies``, ``avg_token_entropy``, ``top_10_logits``,
        ``top_10_tokens`` and ``top_10_probabilities``. The three ``top_10_*``
        columns describe the last generated step only and may be empty when no
        logit passes the ``>= 1e-3`` filter. The frame is passed to
        ``log_to_sqlite("process_input_table", ...)`` before it is returned.
    """
    # Write the requested dropout rate onto every Dropout module of the model.
    # NOTE(review): the Qwen2 module tree printed elsewhere in this notebook shows
    # no nn.Dropout submodules, so this loop may be a no-op — confirm.
    for module in Qwen_25.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = dropout_prob

    def calculate_entropy(probabilities):
        """Entropy over the last axis; the 1e-12 offset keeps log() finite at p == 0."""
        return -torch.sum(probabilities * torch.log(probabilities + 1e-12), dim=-1)

    inputs = Qwen_25_tokenizer(text, return_tensors="pt").to(device)
    # Number of prompt tokens; used to cut the prompt off the generated sequence.
    prompt_length = inputs['input_ids'].shape[-1]

    with torch.no_grad():
        embeddings = Qwen_25.get_input_embeddings()(inputs['input_ids'])
    # Loop-invariant: converted once and shared by every row of the result.
    prompt_embeddings = embeddings.cpu().numpy()

    responses = []
    token_entropies_list = []
    avg_token_entropies_list = []
    top_logits_list = []
    top_tokens_list = []
    top_probabilities_list = []

    with torch.no_grad():
        for _ in range(num_samples):
            output = Qwen_25.generate(
                **inputs,
                return_dict_in_generate=True,
                output_scores=True,
                do_sample=True,
                max_new_tokens=700,
            )

            # Decode only the tokens after the prompt. Removing the prompt with
            # str.replace() on the decoded text also deleted any repetition of
            # the prompt inside the answer, and removed nothing when decoding
            # did not reproduce the prompt text exactly.
            new_token_ids = output.sequences[0][prompt_length:]
            generated_text = Qwen_25_tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            responses.append(generated_text)

            # One score tensor per generated step, stacked along a new leading axis.
            # NOTE(review): with do_sample=True these are presumably the scores
            # after the generation-config logits processors — confirm.
            logits = torch.stack(output.scores, dim=0)
            probabilities = torch.softmax(logits, dim=-1)

            # Per-step entropy and its mean over the whole generation.
            token_entropies = calculate_entropy(probabilities)
            avg_token_entropy = token_entropies.mean().item()

            # Top candidates for the last generated step (at most 10, fewer if
            # the score vector is shorter than 10 entries).
            k = min(10, logits.shape[-1])
            top_logits, top_indices = torch.topk(logits[-1], k=k, dim=-1)
            mask = top_logits >= 1e-3  # drop low logits
            filtered_logits = top_logits[mask]
            filtered_indices = top_indices[mask]

            # Always keep a tensor here: the previous fallback to a Python list
            # crashed on the later .cpu() call when nothing passed the filter.
            if filtered_logits.numel() > 0:
                filtered_probabilities = torch.softmax(filtered_logits, dim=-1)
            else:
                filtered_probabilities = filtered_logits.new_empty(0)

            top_tokens = [Qwen_25_tokenizer.decode(idx) for idx in filtered_indices.tolist()]

            # Collect this sample's statistics.
            token_entropies_list.append(token_entropies.cpu().numpy())
            avg_token_entropies_list.append(avg_token_entropy)
            top_logits_list.append(filtered_logits.cpu().numpy())
            top_tokens_list.append(top_tokens)
            top_probabilities_list.append(filtered_probabilities.cpu().numpy())

    # Assemble one row per sample for this prompt.
    results_df = pd.DataFrame({
        'original_text': [text] * num_samples,
        'generated_response': responses,
        'embeddings': [prompt_embeddings] * num_samples,
        'token_entropies': token_entropies_list,
        'avg_token_entropy': avg_token_entropies_list,
        'top_10_logits': top_logits_list,
        'top_10_tokens': top_tokens_list,
        'top_10_probabilities': top_probabilities_list
    })
    log_to_sqlite("process_input_table", results_df)

    return results_df

Qwen2.5-1.5B-Instruct

In [2]:
# --- Load the Qwen2.5-1.5B-Instruct checkpoint and its tokenizer ---
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
Qwen_25_tokenizer = AutoTokenizer.from_pretrained(model_name)
Qwen_25 = AutoModelForCausalLM.from_pretrained(model_name)

# --- Select the compute device: CUDA when torch reports it, otherwise CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# --- Place the model on the selected device ---
Qwen_25 = Qwen_25.to(device)
# Keep the model in train mode so dropout layers stay active while generating.
# This stays the last expression of the cell, so the notebook displays the model.
Qwen_25.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using device: cpu


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear

In [3]:
def process_inputs(text, dropout_prob=0, num_samples=1):
    """Sample responses from the global ``Qwen_25`` model and log per-sample statistics.

    Relies on the notebook-level globals ``Qwen_25``, ``Qwen_25_tokenizer``,
    ``device`` and ``log_to_sqlite``.

    Args:
        text: Prompt string handed to ``Qwen_25_tokenizer``.
        dropout_prob: Value written to ``p`` on every ``torch.nn.Dropout``
            module found in ``Qwen_25`` before sampling.
        num_samples: Number of independent ``generate`` calls to run.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        ``original_text``, ``generated_response``, ``embeddings`` (input
        embeddings of the prompt, the same array on every row),
        ``token_entropies``, ``avg_token_entropy``, ``top_10_logits``,
        ``top_10_tokens`` and ``top_10_probabilities``. The three ``top_10_*``
        columns describe the last generated step only and may be empty when no
        logit passes the ``>= 1e-3`` filter. The frame is passed to
        ``log_to_sqlite("process_input_table", ...)`` before it is returned.
    """
    # Write the requested dropout rate onto every Dropout module of the model.
    # NOTE(review): the Qwen2 module tree printed elsewhere in this notebook shows
    # no nn.Dropout submodules, so this loop may be a no-op — confirm.
    for module in Qwen_25.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = dropout_prob

    def calculate_entropy(probabilities):
        """Entropy over the last axis; the 1e-12 offset keeps log() finite at p == 0."""
        return -torch.sum(probabilities * torch.log(probabilities + 1e-12), dim=-1)

    inputs = Qwen_25_tokenizer(text, return_tensors="pt").to(device)
    # Number of prompt tokens; used to cut the prompt off the generated sequence.
    prompt_length = inputs['input_ids'].shape[-1]

    with torch.no_grad():
        embeddings = Qwen_25.get_input_embeddings()(inputs['input_ids'])
    # Loop-invariant: converted once and shared by every row of the result.
    prompt_embeddings = embeddings.cpu().numpy()

    responses = []
    token_entropies_list = []
    avg_token_entropies_list = []
    top_logits_list = []
    top_tokens_list = []
    top_probabilities_list = []

    with torch.no_grad():
        for _ in range(num_samples):
            output = Qwen_25.generate(
                **inputs,
                return_dict_in_generate=True,
                output_scores=True,
                do_sample=True,
                max_new_tokens=700,
            )

            # Decode only the tokens after the prompt. Removing the prompt with
            # str.replace() on the decoded text also deleted any repetition of
            # the prompt inside the answer, and removed nothing when decoding
            # did not reproduce the prompt text exactly.
            new_token_ids = output.sequences[0][prompt_length:]
            generated_text = Qwen_25_tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            responses.append(generated_text)

            # One score tensor per generated step, stacked along a new leading axis.
            # NOTE(review): with do_sample=True these are presumably the scores
            # after the generation-config logits processors — confirm.
            logits = torch.stack(output.scores, dim=0)
            probabilities = torch.softmax(logits, dim=-1)

            # Per-step entropy and its mean over the whole generation.
            token_entropies = calculate_entropy(probabilities)
            avg_token_entropy = token_entropies.mean().item()

            # Top candidates for the last generated step (at most 10, fewer if
            # the score vector is shorter than 10 entries).
            k = min(10, logits.shape[-1])
            top_logits, top_indices = torch.topk(logits[-1], k=k, dim=-1)
            mask = top_logits >= 1e-3  # drop low logits
            filtered_logits = top_logits[mask]
            filtered_indices = top_indices[mask]

            # Always keep a tensor here: the previous fallback to a Python list
            # crashed on the later .cpu() call when nothing passed the filter.
            if filtered_logits.numel() > 0:
                filtered_probabilities = torch.softmax(filtered_logits, dim=-1)
            else:
                filtered_probabilities = filtered_logits.new_empty(0)

            top_tokens = [Qwen_25_tokenizer.decode(idx) for idx in filtered_indices.tolist()]

            # Collect this sample's statistics.
            token_entropies_list.append(token_entropies.cpu().numpy())
            avg_token_entropies_list.append(avg_token_entropy)
            top_logits_list.append(filtered_logits.cpu().numpy())
            top_tokens_list.append(top_tokens)
            top_probabilities_list.append(filtered_probabilities.cpu().numpy())

    # Assemble one row per sample for this prompt.
    results_df = pd.DataFrame({
        'original_text': [text] * num_samples,
        'generated_response': responses,
        'embeddings': [prompt_embeddings] * num_samples,
        'token_entropies': token_entropies_list,
        'avg_token_entropy': avg_token_entropies_list,
        'top_10_logits': top_logits_list,
        'top_10_tokens': top_tokens_list,
        'top_10_probabilities': top_probabilities_list
    })
    log_to_sqlite("process_input_table", results_df)

    return results_df

In [None]:
# Smoke test: run one sample for a single prompt and show the generated text.
# NOTE(review): `input` and `output` are kept as-is because other cells may read
# them, but `input` shadows the Python builtin of the same name.
input = "c'est quoi le méchanisme de fast attention"
output = process_inputs(input)
print(output.loc[0, "generated_response"])

Qwen2.5-7B-Instruct

In [None]:
# --- Load the Qwen2.5-7B-Instruct checkpoint and its tokenizer ---
model_name = "Qwen/Qwen2.5-7B-Instruct"
Qwen_25_tokenizer = AutoTokenizer.from_pretrained(model_name)
Qwen_25 = AutoModelForCausalLM.from_pretrained(model_name)

# --- Select the compute device: CUDA when torch reports it, otherwise CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# --- Place the model on the selected device ---
Qwen_25 = Qwen_25.to(device)
# Keep the model in train mode so dropout layers stay active while generating.
# This stays the last expression of the cell, so the notebook displays the model.
Qwen_25.train()


In [None]:
def process_inputs(text, dropout_prob=0, num_samples=1):
    """Sample responses from the global ``Qwen_25`` model and log per-sample statistics.

    Relies on the notebook-level globals ``Qwen_25``, ``Qwen_25_tokenizer``,
    ``device`` and ``log_to_sqlite``.

    Args:
        text: Prompt string handed to ``Qwen_25_tokenizer``.
        dropout_prob: Value written to ``p`` on every ``torch.nn.Dropout``
            module found in ``Qwen_25`` before sampling.
        num_samples: Number of independent ``generate`` calls to run.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        ``original_text``, ``generated_response``, ``embeddings`` (input
        embeddings of the prompt, the same array on every row),
        ``token_entropies``, ``avg_token_entropy``, ``top_10_logits``,
        ``top_10_tokens`` and ``top_10_probabilities``. The three ``top_10_*``
        columns describe the last generated step only and may be empty when no
        logit passes the ``>= 1e-3`` filter. The frame is passed to
        ``log_to_sqlite("process_input_table", ...)`` before it is returned.
    """
    # Write the requested dropout rate onto every Dropout module of the model.
    # NOTE(review): the Qwen2 module tree printed elsewhere in this notebook shows
    # no nn.Dropout submodules, so this loop may be a no-op — confirm.
    for module in Qwen_25.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = dropout_prob

    def calculate_entropy(probabilities):
        """Entropy over the last axis; the 1e-12 offset keeps log() finite at p == 0."""
        return -torch.sum(probabilities * torch.log(probabilities + 1e-12), dim=-1)

    inputs = Qwen_25_tokenizer(text, return_tensors="pt").to(device)
    # Number of prompt tokens; used to cut the prompt off the generated sequence.
    prompt_length = inputs['input_ids'].shape[-1]

    with torch.no_grad():
        embeddings = Qwen_25.get_input_embeddings()(inputs['input_ids'])
    # Loop-invariant: converted once and shared by every row of the result.
    prompt_embeddings = embeddings.cpu().numpy()

    responses = []
    token_entropies_list = []
    avg_token_entropies_list = []
    top_logits_list = []
    top_tokens_list = []
    top_probabilities_list = []

    with torch.no_grad():
        for _ in range(num_samples):
            output = Qwen_25.generate(
                **inputs,
                return_dict_in_generate=True,
                output_scores=True,
                do_sample=True,
                max_new_tokens=700,
            )

            # Decode only the tokens after the prompt. Removing the prompt with
            # str.replace() on the decoded text also deleted any repetition of
            # the prompt inside the answer, and removed nothing when decoding
            # did not reproduce the prompt text exactly.
            new_token_ids = output.sequences[0][prompt_length:]
            generated_text = Qwen_25_tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            responses.append(generated_text)

            # One score tensor per generated step, stacked along a new leading axis.
            # NOTE(review): with do_sample=True these are presumably the scores
            # after the generation-config logits processors — confirm.
            logits = torch.stack(output.scores, dim=0)
            probabilities = torch.softmax(logits, dim=-1)

            # Per-step entropy and its mean over the whole generation.
            token_entropies = calculate_entropy(probabilities)
            avg_token_entropy = token_entropies.mean().item()

            # Top candidates for the last generated step (at most 10, fewer if
            # the score vector is shorter than 10 entries).
            k = min(10, logits.shape[-1])
            top_logits, top_indices = torch.topk(logits[-1], k=k, dim=-1)
            mask = top_logits >= 1e-3  # drop low logits
            filtered_logits = top_logits[mask]
            filtered_indices = top_indices[mask]

            # Always keep a tensor here: the previous fallback to a Python list
            # crashed on the later .cpu() call when nothing passed the filter.
            if filtered_logits.numel() > 0:
                filtered_probabilities = torch.softmax(filtered_logits, dim=-1)
            else:
                filtered_probabilities = filtered_logits.new_empty(0)

            top_tokens = [Qwen_25_tokenizer.decode(idx) for idx in filtered_indices.tolist()]

            # Collect this sample's statistics.
            token_entropies_list.append(token_entropies.cpu().numpy())
            avg_token_entropies_list.append(avg_token_entropy)
            top_logits_list.append(filtered_logits.cpu().numpy())
            top_tokens_list.append(top_tokens)
            top_probabilities_list.append(filtered_probabilities.cpu().numpy())

    # Assemble one row per sample for this prompt.
    results_df = pd.DataFrame({
        'original_text': [text] * num_samples,
        'generated_response': responses,
        'embeddings': [prompt_embeddings] * num_samples,
        'token_entropies': token_entropies_list,
        'avg_token_entropy': avg_token_entropies_list,
        'top_10_logits': top_logits_list,
        'top_10_tokens': top_tokens_list,
        'top_10_probabilities': top_probabilities_list
    })
    log_to_sqlite("process_input_table", results_df)

    return results_df

In [None]:
# Smoke test: run one sample for a single prompt and show the generated text.
# NOTE(review): `input` and `output` are kept as-is because other cells may read
# them, but `input` shadows the Python builtin of the same name.
input = "c'est quoi le méchanisme de fast attention"
output = process_inputs(input)
print(output.loc[0, "generated_response"])