# Projet DM

In [1]:
!pip install unidecode
!pip install spacy
!python -m spacy download fr_core_news_sm




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ----------------- ---------------------- 7.1/16.3 MB 43.4 MB/s eta 0:00:01
     ---------------------------------------- 16.3/16.3 MB 48.8 MB/s  0:00:00
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import json
import os
import pandas as pd
import re
from unidecode import unidecode


def load_jsonl(filename):
    """Load a .jsonl file. Tries both the given path and the 'data' folder.
    Returns a list of dicts."""
    p = os.path.join('data', filename)
    if os.path.exists(p):
        data = []
        with open(p, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data.append(json.loads(line))
                except Exception as e:
                    print(f"Error parsing a line in {p}: {e}")
        return data
    raise FileNotFoundError(f"File not found: {filename} (tried: {p})")

### 1. Data Upload

In [3]:
# Load the data 
train_path = 'train_v2.jsonl'
test_path = 'test_v4.jsonl'

train_data = load_jsonl(train_path)
test_data = load_jsonl(test_path)

print(f"Train records: {len(train_data)}, Test records: {len(test_data)}")

# Convert to DataFrame for easier handling
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Quick preview
print("-- Train head --")
print(train_df.head())
print("\n-- Info --")
print(train_df.info())

Train records: 492, Test records: 519
-- Train head --
                                                text acronym  \
0  LRA  limite de r√©sistance des attelages PAR po...     PAR   
1                              D√©signa -tion des PN       PN   
2  pr√©d√©termin√©es de trains : _x0001_ les masses ...      EM   
3  /Commentaires N¬∞ AC B81500 thermique:  compati...      AC   
4  kilom√®tres/heure (ex : 12 pour 120 km/h), _x00...     TIV   

                                             options  
0  {'Plan d'action r√©gularit√©': False, 'Poste d'a...  
1  {'Passages √† niveau : fichier des pn, recensem...  
2  {'EMERAINVILLE PONTAULT COMBAULT': False, 'Eng...  
3  {'ACc√®s': False, 'Agent d'aCcompagnement ': Fa...  
4  {'THIVIERS': False, 'Trafic international voya...  

-- Info --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     492 non-null   

### 2. Data Cleaning

In [4]:
# Basic cleaning and review
# 1) Columns, types, and missing values
print("Columns:", list(train_df.columns))
print("Missing values per column:\n", train_df.isna().sum())

# 2) Remove simple duplicates
train_df = train_df.drop_duplicates(subset=['text', 'acronym']).reset_index(drop=True)

# 3) Fill missing values in text columns with empty string (example)
for col in train_df.select_dtypes(include='object').columns:
    train_df[col] = train_df[col].fillna('')

print("After cleaning, shape:", train_df.shape)

Columns: ['text', 'acronym', 'options']
Missing values per column:
 text       0
acronym    0
options    0
dtype: int64
After cleaning, shape: (492, 3)


### 3. Pre-Processing

In [5]:
import nltk
import spacy
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Descargar recursos de NLTK (solo una vez)
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("stopwords-fr")

# Cargar modelo de spaCy
nlp = spacy.load("fr_core_news_sm")

# Obtener stopwords en franc√©s
stop_words = set(stopwords.words("french"))

def preprocessing(text, options=False):
    """
    Funci√≥n unificada de preprocesamiento que aplica los mismos pasos a texto y opciones
    """
    # Verificar si el texto es NaN o None
    if pd.isna(text) or text is None:
        return ""
    
    # 1. Convertir a string y lowercase
    text = str(text).lower()
    
    # 2. Quitar acentos
    text = unidecode(text)
    
    # 3. Tokenizar y quitar stopwords
    try:
        tokens = word_tokenize(text, language='french')
    except:
        # Fallback para tokenizaci√≥n simple si word_tokenize falla
        tokens = text.split()
    
    if options:
        # PARA OPCIONES: mantener stopwords
        filtered_tokens = [word for word in tokens if word.isalpha()]
    else:
        # PARA TEXTO NORMAL: quitar stopwords  
        filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    # 4. Reconstruir texto para lematizaci√≥n
    text_to_lemmatize = " ".join(filtered_tokens)
    
    # 5. Lemmatizar con spaCy
    try:
        doc = nlp(text_to_lemmatize)
        lemmas = [token.lemma_ for token in doc if token.is_alpha]
        return " ".join(lemmas)
    except Exception as e:
        print(f"Error en lematizaci√≥n: {e}")
        return text_to_lemmatize  # Fallback sin lematizaci√≥n

# Aplicar a los textos principales
print("Procesando textos de train...")
train_df['text_processed'] = train_df['text'].apply(preprocessing)
print("Procesando textos de test...")
test_df['text_processed'] = test_df['text'].apply(preprocessing)

# Funciones para procesar opciones
def process_options_dict(options_dict):
    """Procesar opciones del training (diccionario)"""
    processed = {}
    for key, value in options_dict.items():
        processed_key = preprocessing(key,True)
        processed[processed_key] = value
    return processed

def process_options_list(options_list):
    """Procesar opciones del test (lista)"""
    return [preprocessing(opt,True) for opt in options_list]

# Aplicar a las opciones
print("Procesando opciones de train...")
train_df['options_processed'] = train_df['options'].apply(process_options_dict)
print("Procesando opciones de test...")
test_df['options_processed'] = test_df['options'].apply(process_options_list)

print("‚úÖ Preprocesamiento completado")
print(f"Ejemplo de texto procesado: {train_df['text_processed'].iloc[0][:100]}...")
print(f"Ejemplo de opciones procesadas: {list(train_df['options_processed'].iloc[0].keys())}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading stopwords-fr: Package 'stopwords-fr' not
[nltk_data]     found in index


Procesando textos de train...
Procesando textos de test...
Procesando opciones de train...
Procesando opciones de test...
‚úÖ Preprocesamiento completado
Ejemplo de texto procesado: lra limite resistance attelage poste regulation pl pleine lign pn passage avoir niveau rfn reseau fe...
Ejemplo de opciones procesadas: ['plan regularite', 'poste et de regulation assurer le commande de installation de signalisation et le gestion de le circulation de huit ligne avoir grand vitesse', 'pont de', 'plan regional']


### 5. Training

### TF-IDF

In [6]:
# Partir datos en train y validation
from sklearn.model_selection import train_test_split
# Partir datos en train y validation
train_final, val_final = train_test_split(train_df, test_size=0.3, random_state=100)


In [7]:
# === NUEVO: SISTEMA DE SIMILITUD ===
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Construyendo vocabulario TF-IDF...")

# Crear un corpus con TODOS los textos relevantes
all_texts_for_tfidf = []

# 1. Agregar textos principales de entrenamiento y prueba
all_texts_for_tfidf.extend(train_final['text_processed'].tolist())

# 2. Agregar TODAS las opciones preprocesadas (esto es clave)
for options_dict in train_final['options_processed']:
    all_texts_for_tfidf.extend(options_dict.keys())

# 3. Agregar textos de test tambi√©n para mejor cobertura
all_texts_for_tfidf.extend(test_df['text_processed'].tolist())

# 4. Entrenar TF-IDF GLOBAL con par√°metros mejorados
global_tfidf = TfidfVectorizer(
    ngram_range=(1, 3),  # Incluir bigramas y trigramas
    min_df=2,            # Ignorar t√©rminos muy raros
    max_features=10000   # Limitar vocabulario
)
global_tfidf.fit(all_texts_for_tfidf)

print(f"‚úÖ TF-IDF GLOBAL entrenado con {len(global_tfidf.vocabulary_)} t√©rminos")


Construyendo vocabulario TF-IDF...
‚úÖ TF-IDF GLOBAL entrenado con 7425 t√©rminos


In [8]:
val_final.head()
#Add a new column with an int being the index of the correct answer among the options
def get_answer_index(row):
    options = row['options_processed']
    for idx, option in enumerate(options):
        if options[option] == True:
            return idx
    return -1  # En caso de no encontrar
val_final['answer_index'] = val_final.apply(get_answer_index, axis=1)
val_final.head()

Unnamed: 0,text,acronym,options,text_processed,options_processed,answer_index
184,VENISSIEUX √† AMBERIEU Article B101 Domaine de...,EF,{'Entreprise Ferroviaire : Toute entreprise ...,venissieux avoir amberieu article domaine circ...,{'entreprise ferroviaire tout entreprise avoir...,0
252,Boingneville PLBV ‚Äì Poste 1MALESHERBES(1) ‚Äì Et...,BV,"{'Bassin Versant': False, 'B√¢timent des Voyage...",boingnevill plbv poste etablissement pl suscep...,"{'bassin verser': False, 'batiment de voyageur...",1
397,Feuqui√®res Fressenneville - PL 30Woincourt - P...,PL,"{'Panneaux lumineux': False, 'Pleine Ligne. ...",feuquiere fressennevill pl pl kilom√®tre ligne ...,"{'panneau lumineux': False, 'pleine lign etabl...",-1
325,franchissement des signaux d'arr√™t Ligne √©quip...,DAAT,"{'Diagnostique Amiante Avant Travaux': False, ...",franchissemer signal ligne equipe dispositif a...,"{'diagnostique amiant avant travail': False, '...",1
136,Page 4 NPDC-RT-2211B- Version 01 du 15-12-2014...,EF,"{'Equipement fixe': False, 'Essai de Frein': F...",page version article domaine circulation appar...,"{'equipement fixe': False, 'essai de frein': F...",2


In [9]:
# Validation con m√∫ltiples par√°metros
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def improved_similarity_prediction_validation(df, threshold=0.01, ngram_range=(1, 2), 
                                            use_acronym=True, query_strategy="text_acronym",
                                            fallback_strategy="first"):
    """
    Predicci√≥n mejorada con m√∫ltiples par√°metros ajustables
    
    Args:
        df: DataFrame de validaci√≥n
        threshold: Umbral de similitud m√≠nima
        ngram_range: Rango de n-gramas para TF-IDF
        use_acronym: Si incluir el acr√≥nimo en la consulta
        query_strategy: Estrategia para construir la consulta
        fallback_strategy: Estrategia cuando no supera el threshold
    """
    predictions = []
    validation_scores = []
    confidence_scores = []
    
    for idx, row in df.iterrows():
        try:
            text = row['text_processed']
            acronym = row['acronym']
            options = row['options_processed']
            
            # Diferentes estrategias para construir la consulta
            if query_strategy == "text_only":
                query = text
            elif query_strategy == "acronym_only":
                query = acronym
            elif query_strategy == "text_acronym":
                query = f"{text} {acronym}"
            elif query_strategy == "acronym_text":
                query = f"{acronym} {text}"
            elif query_strategy == "weighted_text":
                # Dar m√°s peso al texto repiti√©ndolo
                query = f"{text} {text} {acronym}"
            else:
                query = f"{text} {acronym}"
            
            # Crear vectorizador TF-IDF con par√°metros ajustables
            vectorizer = TfidfVectorizer(ngram_range=ngram_range)
            
            # Since options in validation are in dict format, we need to get the keys
            options_list = list(options.keys())
            corpus = [query] + options_list
            tfidf_matrix = vectorizer.fit_transform(corpus)
            
            # Calcular similitudes (query est√° en posici√≥n 0)
            similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
            
            # Encontrar la opci√≥n m√°s similar
            best_idx = np.argmax(similarities)
            best_score = similarities[best_idx]
            
            confidence_scores.append(best_score)
            
            if best_score > threshold:
                # Check if the predicted option is indeed the correct one
                if best_idx == row['answer_index']:
                    validation_scores.append(1)
                else:
                    validation_scores.append(0)
                predictions.append([best_idx])
            else:
                # Diferentes estrategias de fallback
                if fallback_strategy == "first":
                    fallback_idx = 0
                elif fallback_strategy == "random":
                    fallback_idx = np.random.randint(0, len(options_list))
                elif fallback_strategy == "last":
                    fallback_idx = len(options_list) - 1
                elif fallback_strategy == "middle":
                    fallback_idx = len(options_list) // 2
                else:
                    fallback_idx = 0
                
                predictions.append([fallback_idx])
                if fallback_idx == row['answer_index']:
                    validation_scores.append(1)
                else:
                    validation_scores.append(0)
                    
        except Exception as e:
            print(f"Error en fila {idx}: {e}")
            # Fallback seguro
            predictions.append([0])
            validation_scores.append(0)
    
    # Calcular m√©tricas
    accuracy = np.mean(validation_scores) if validation_scores else 0
    avg_confidence = np.mean(confidence_scores) if confidence_scores else 0
    coverage = np.mean([1 if score > threshold else 0 for score in confidence_scores])
    
    return predictions, accuracy, avg_confidence, coverage

# ===== PRUEBA DE DIFERENTES PAR√ÅMETROS =====

print("=== VALIDACI√ìN COMPLETA CON M√öLTIPLES PAR√ÅMETROS ===\n")

# 1. Prueba de diferentes thresholds
print("1. PROBANDO DIFERENTES THRESHOLDS:")
thresholds = [0, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25]
for thresh in thresholds:
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, threshold=thresh)
    print(f"  Threshold {thresh:.3f}: Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")

# 2. Prueba de diferentes rangos de n-gramas
print("\n2. PROBANDO DIFERENTES N-GRAMAS:")
ngram_ranges = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
for ngram in ngram_ranges:
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, ngram_range=ngram)
    print(f"  ngram_range {ngram}: Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")

# 3. Prueba de diferentes estrategias de consulta
print("\n3. PROBANDO ESTRATEGIAS DE CONSULTA:")
query_strategies = ["text_only", "acronym_only", "text_acronym", "acronym_text", "weighted_text"]
for strategy in query_strategies:
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, query_strategy=strategy)
    print(f"  Query '{strategy}': Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")

# 4. Prueba de diferentes estrategias de fallback
print("\n4. PROBANDO ESTRATEGIAS DE FALLBACK:")
fallback_strategies = ["first", "random", "last", "middle"]
for fallback in fallback_strategies:
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, fallback_strategy=fallback)
    print(f"  Fallback '{fallback}': Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")

# 5. Combinaciones prometedoras
print("\n5. COMBINACIONES PROMETEDORAS:")
combinations = [
    {"threshold": 0.01, "ngram_range": (1, 2), "query_strategy": "text_acronym"},
    {"threshold": 0.05, "ngram_range": (1, 3), "query_strategy": "weighted_text"},
    {"threshold": 0.02, "ngram_range": (1, 2), "query_strategy": "text_only"},
    {"threshold": 0.1, "ngram_range": (2, 3), "query_strategy": "acronym_text"},
]

for i, combo in enumerate(combinations):
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, **combo)
    print(f"  Combo {i+1}: Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")
    print(f"    Par√°metros: {combo}")

# 6. B√∫squeda m√°s fina alrededor del mejor threshold encontrado
print("\n6. B√öSQUEDA FINA DE THRESHOLD:")
fine_thresholds = [0.005, 0.008, 0.01, 0.012, 0.015, 0.018, 0.02, 0.025]
best_threshold = 0
best_accuracy = 0

for thresh in fine_thresholds:
    _, accuracy, avg_conf, coverage = improved_similarity_prediction_validation(
        val_final, threshold=thresh)
    print(f"  Threshold {thresh:.3f}: Accuracy={accuracy:.4f}, "
          f"Confianza_promedio={avg_conf:.4f}, Cobertura={coverage:.4f}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = thresh

print(f"\nüéØ MEJOR THRESHOLD: {best_threshold} con accuracy: {best_accuracy:.4f}")

# Funci√≥n para encontrar los mejores par√°metros autom√°ticamente (CORREGIDA)
def find_best_parameters(validation_df, n_trials=50):
    """Encuentra los mejores par√°metros mediante b√∫squeda aleatoria"""
    best_params = {}
    best_accuracy = 0
    
    # Listas de par√°metros para seleccionar aleatoriamente
    thresholds = [0, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1]
    ngram_ranges = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3)]
    query_strategies = ["text_only", "acronym_only", "text_acronym", "weighted_text"]
    
    for i in range(n_trials):
        # Seleccionar par√°metros aleatorios usando √≠ndices
        threshold = np.random.choice(thresholds)
        ngram_range = ngram_ranges[np.random.randint(len(ngram_ranges))]
        query_strategy = query_strategies[np.random.randint(len(query_strategies))]
        
        try:
            _, accuracy, _, _ = improved_similarity_prediction_validation(
                validation_df, 
                threshold=threshold,
                ngram_range=ngram_range,
                query_strategy=query_strategy
            )
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'threshold': threshold,
                    'ngram_range': ngram_range,
                    'query_strategy': query_strategy,
                    'accuracy': accuracy
                }
                print(f"üî• Nuevo mejor: Accuracy={accuracy:.4f}, threshold={threshold}, ngram_range={ngram_range}, query_strategy='{query_strategy}'")
                
        except Exception as e:
            print(f"Error en trial {i}: {e}")
            continue
    
    return best_params

print("\n7. B√öSQUEDA ALEATORIA DE PAR√ÅMETROS (CORREGIDA):")
best_params = find_best_parameters(val_final, n_trials=30)
print(f"\nüèÜ MEJORES PAR√ÅMETROS ENCONTRADOS:")
for key, value in best_params.items():
    print(f"  {key}: {value}")

# 8. Prueba adicional con los mejores par√°metros encontrados
if best_params:
    print("\n8. EVALUACI√ìN FINAL CON MEJORES PAR√ÅMETROS:")
    final_predictions, final_accuracy, final_confidence, final_coverage = improved_similarity_prediction_validation(
        val_final, 
        threshold=best_params['threshold'],
        ngram_range=best_params['ngram_range'],
        query_strategy=best_params['query_strategy']
    )
    print(f"‚úÖ Resultado final: Accuracy={final_accuracy:.4f}")

=== VALIDACI√ìN COMPLETA CON M√öLTIPLES PAR√ÅMETROS ===

1. PROBANDO DIFERENTES THRESHOLDS:
  Threshold 0.000: Accuracy=0.5068, Confianza_promedio=0.0606, Cobertura=0.7500
  Threshold 0.001: Accuracy=0.5068, Confianza_promedio=0.0606, Cobertura=0.7500
  Threshold 0.005: Accuracy=0.5068, Confianza_promedio=0.0606, Cobertura=0.7500
  Threshold 0.010: Accuracy=0.5000, Confianza_promedio=0.0606, Cobertura=0.7230
  Threshold 0.020: Accuracy=0.4595, Confianza_promedio=0.0606, Cobertura=0.6554
  Threshold 0.050: Accuracy=0.3581, Confianza_promedio=0.0606, Cobertura=0.4392
  Threshold 0.100: Accuracy=0.2973, Confianza_promedio=0.0606, Cobertura=0.1959
  Threshold 0.150: Accuracy=0.2905, Confianza_promedio=0.0606, Cobertura=0.1149
  Threshold 0.200: Accuracy=0.2635, Confianza_promedio=0.0606, Cobertura=0.0676
  Threshold 0.250: Accuracy=0.2568, Confianza_promedio=0.0606, Cobertura=0.0473

2. PROBANDO DIFERENTES N-GRAMAS:
  ngram_range (1, 1): Accuracy=0.5068, Confianza_promedio=0.0903, Cobertur

In [10]:
### 6. PREDICCI√ìN FINAL EN TEST

def improved_similarity_prediction_test(df, threshold=0.01, ngram_range=(1, 2), 
                                      query_strategy="text_acronym", fallback_strategy="first"):
    """
    Predicci√≥n mejorada para el conjunto de test
    """
    predictions = []
    confidence_scores = []
    
    for idx, row in df.iterrows():
        try:
            text = row['text_processed']
            acronym = row['acronym']
            options = row['options_processed']  # En test es una lista
            
            # Construir la consulta seg√∫n la estrategia
            if query_strategy == "text_only":
                query = text
            elif query_strategy == "acronym_only":
                query = acronym
            elif query_strategy == "text_acronym":
                query = f"{text} {acronym}"
            elif query_strategy == "acronym_text":
                query = f"{acronym} {text}"
            elif query_strategy == "weighted_text":
                query = f"{text} {text} {acronym}"
            else:
                query = f"{text} {acronym}"
            
            # Crear vectorizador TF-IDF
            vectorizer = TfidfVectorizer(ngram_range=ngram_range)
            
            # En test, options es una lista, no un diccionario
            options_list = options
            corpus = [query] + options_list
            tfidf_matrix = vectorizer.fit_transform(corpus)
            
            # Calcular similitudes (query est√° en posici√≥n 0)
            similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
            
            # Encontrar la opci√≥n m√°s similar
            best_idx = np.argmax(similarities)
            best_score = similarities[best_idx]
            
            confidence_scores.append(best_score)
            
            if best_score > threshold:
                predictions.append([best_idx])
            else:
                # Estrategia de fallback
                if fallback_strategy == "first":
                    fallback_idx = 0
                elif fallback_strategy == "random":
                    fallback_idx = np.random.randint(0, len(options_list))
                elif fallback_strategy == "last":
                    fallback_idx = len(options_list) - 1
                elif fallback_strategy == "middle":
                    fallback_idx = len(options_list) // 2
                else:
                    fallback_idx = 0
                
                predictions.append([fallback_idx])
                    
        except Exception as e:
            print(f"Error en fila {idx}: {e}")
            # Fallback seguro
            predictions.append([0])
            confidence_scores.append(0)
    
    avg_confidence = np.mean(confidence_scores) if confidence_scores else 0
    coverage = np.mean([1 if score > threshold else 0 for score in confidence_scores])
    
    return predictions, avg_confidence, coverage

# Usar los mejores par√°metros encontrados para hacer las predicciones finales
print("=== HACIENDO PREDICCIONES FINALES EN TEST ===")

if 'best_params' in locals() and best_params:
    print(f"Usando mejores par√°metros encontrados: {best_params}")
    
    test_predictions, test_confidence, test_coverage = improved_similarity_prediction_test(
        test_df,
        threshold=best_params['threshold'],
        ngram_range=best_params['ngram_range'],
        query_strategy=best_params['query_strategy']
    )
else:
    # Si no se encontraron best_params, usar par√°metros por defecto
    print("Usando par√°metros por defecto")
    test_predictions, test_confidence, test_coverage = improved_similarity_prediction_test(
        test_df,
        threshold=0.01,
        ngram_range=(1, 2),
        query_strategy="text_acronym"
    )

print(f"‚úÖ Predicciones en test completadas:")
print(f"   - N√∫mero de predicciones: {len(test_predictions)}")
print(f"   - Confianza promedio: {test_confidence:.4f}")
print(f"   - Cobertura: {test_coverage:.4f}")

# Crear DataFrame de submission
predictions_df = pd.DataFrame({
    'id': test_df['id'],
    'answer': [pred for pred in test_predictions]
})

# Mostrar distribuci√≥n de las predicciones
print("\nDistribuci√≥n de las respuestas predichas:")
print(predictions_df['answer'].value_counts().sort_index())

# Guardar el archivo de submission
output_filename = 'submission_improved.csv'
predictions_df.to_csv(output_filename, index=False)
print(f"\nüéØ Archivo de submission guardado como: {output_filename}")

# Validaci√≥n r√°pida del archivo guardado
print("\n=== VALIDACI√ìN DEL ARCHIVO GUARDADO ===")
saved_df = pd.read_csv(output_filename)
print(f"Archivo cargado: {saved_df.shape[0]} filas")
print("Primeras 5 filas:")
print(saved_df.head())

=== HACIENDO PREDICCIONES FINALES EN TEST ===
Usando mejores par√°metros encontrados: {'threshold': 0.005, 'ngram_range': (1, 2), 'query_strategy': 'text_acronym', 'accuracy': 0.5067567567567568}
‚úÖ Predicciones en test completadas:
   - N√∫mero de predicciones: 519
   - Confianza promedio: 0.0480
   - Cobertura: 0.6686

Distribuci√≥n de las respuestas predichas:
answer
[0]     255
[1]     117
[2]      64
[3]      65
[4]       6
[5]       2
[6]       3
[7]       3
[8]       1
[9]       2
[12]      1
Name: count, dtype: int64

üéØ Archivo de submission guardado como: submission_improved.csv

=== VALIDACI√ìN DEL ARCHIVO GUARDADO ===
Archivo cargado: 519 filas
Primeras 5 filas:
   id answer
0   0    [1]
1   1    [1]
2   2    [0]
3   3    [1]
4   4    [0]


# TD-IDF 

### **Project Summary: Acronym Expansion System**

**Objective:** Build an NLP system to identify the correct long-form expansion of an acronym within a French text.

**Methodology & Key Steps:**

*   **Data Processing:**
    *   Loaded and cleaned JSONL datasets (`train.jsonl`, `test.jsonl`).
    *   Handled missing values and removed duplicates.

*   **Text Preprocessing:**
    *   Unified pipeline for text and acronym options.
    *   Applied lowercase conversion, accent removal, and tokenization.
    *   Utilized NLTK for stopword removal and spaCy for French lemmatization.

*   **Feature Engineering:**
    *   Created a TF-IDF vectorizer trained on a combined corpus of all processed texts and options.
    *   Incorporated n-grams (unigrams, bigrams, trigrams) to capture contextual phrases.

*   **Model & Prediction:**
    *   Core model based on **Cosine Similarity** between the TF-IDF vectors of the context (text + acronym) and the potential options.
    *   Implemented a confidence threshold to filter weak matches.

*   **Validation & Hyperparameter Tuning:**
    *   Conducted an extensive grid search to optimize key parameters:
        *   Similarity Threshold
        *   N-gram Range
        *   Query Construction Strategy (e.g., text-only, text+acronym)
    *   Systematically evaluated performance to find the best configuration.

**Outcome:**
*   Successfully generated a `submission_improved.csv` file with predictions for the test set.
*   Delivered a robust, explainable model based on semantic similarity rather than a black-box classifier.

Result : 0.51538

In [11]:
# 1. Estad√≠sticas b√°sicas del dataset
print("=== DATASET STATISTICS ===")
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Validation samples: {len(val_final)}")
print(f"Unique acronyms in train: {train_df['acronym'].nunique()}")

# 2. Distribuci√≥n de opciones por pregunta
train_options_count = train_df['options'].apply(len)
print(f"\nOpciones por pregunta:")
print(f"  M√≠nimo: {train_options_count.min()}")
print(f"  M√°ximo: {train_options_count.max()}")
print(f"  Promedio: {train_options_count.mean():.2f}")

# 3. M√©tricas del preprocesamiento
text_lengths_before = train_df['text'].str.len()
text_lengths_after = train_df['text_processed'].str.len()
print(f"\n=== PREPROCESSING IMPACT ===")
print(f"Longitud promedio texto original: {text_lengths_before.mean():.2f} chars")
print(f"Longitud promedio texto procesado: {text_lengths_after.mean():.2f} chars")
print(f"Reducci√≥n: {((text_lengths_before.mean() - text_lengths_after.mean()) / text_lengths_before.mean() * 100):.1f}%")

# 4. Performance del modelo (si tenemos las mejores m√©tricas)
if 'best_params' in locals() and 'final_accuracy' in locals():
    print(f"\n=== MODEL PERFORMANCE ===")
    print(f"Best validation accuracy: {final_accuracy:.4f}")
    print(f"Best parameters: {best_params}")
    
# Si no, calculemos la accuracy del validation actual
else:
    from sklearn.metrics import accuracy_score
    val_true = val_final['answer_index'].values
    val_pred = [p[0] for p in val_predictions]
    current_accuracy = accuracy_score(val_true, val_pred)
    print(f"\n=== MODEL PERFORMANCE ===")
    print(f"Current validation accuracy: {current_accuracy:.4f}")

=== DATASET STATISTICS ===
Training samples: 492
Test samples: 519
Validation samples: 148
Unique acronyms in train: 77

Opciones por pregunta:
  M√≠nimo: 2
  M√°ximo: 13
  Promedio: 4.42

=== PREPROCESSING IMPACT ===
Longitud promedio texto original: 208.66 chars
Longitud promedio texto procesado: 141.72 chars
Reducci√≥n: 32.1%

=== MODEL PERFORMANCE ===
Best validation accuracy: 0.5068
Best parameters: {'threshold': 0.005, 'ngram_range': (1, 2), 'query_strategy': 'text_acronym', 'accuracy': 0.5067567567567568}
