In [None]:
# IMPORTS (ESSENTIEL !)
from datasets import load_dataset
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import re
from datasets import load_dataset
import os
import shutil
import zipfile
from pathlib import Path
import joblib
import pickle

In [None]:
# Load the "fra" configuration of WebFAQ and turn its "default" split into a DataFrame.
webfaq_splits = load_dataset("PaDaS-Lab/webfaq", "fra")
raw_ds = webfaq_splits["default"]
df = pd.DataFrame(raw_ds)
print(f"Dataset fran√ßais charg√©: {len(df)} paires Q/R")
preview_columns = ['question', 'answer', 'topic']
print(df[preview_columns].head())

In [None]:
# Keep only the rows where both the question and the answer are present
# (not NaN) and not blank once surrounding whitespace is removed.
question_text = df['question'].astype(str).str.strip()
answer_text = df['answer'].astype(str).str.strip()
has_question = df['question'].notna() & question_text.ne('')
has_answer = df['answer'].notna() & answer_text.ne('')
df_non_vide = df[has_question & has_answer].copy()

# First 1000 usable rows, exported as a small CSV sample.
df_1000 = df_non_vide.iloc[:1000]
df_1000.to_csv('webfaq_fra_1000_lignes.csv', index=False)

print(f"‚úÖ Fichier 'webfaq_fra_1000_lignes.csv' cr√©√© avec {len(df_1000)} lignes")
print(f"üìä Shape: {df_1000.shape}")

In [None]:
## EXACT DUPLICATES + EMPTY / TOO-SHORT ROWS
# A row is an exact duplicate when its (question, answer) pair repeats an
# earlier row; the first occurrence is kept.
df_no_dup = df.drop_duplicates(subset=['question', 'answer'], keep='first')

# Keep rows with a non-null question longer than 10 characters and a non-null
# answer longer than 20 characters (lengths measured after stripping).
question_len = df_no_dup['question'].str.strip().str.len()
answer_len = df_no_dup['answer'].str.strip().str.len()
is_usable = (
    df_no_dup['question'].notna()
    & df_no_dup['answer'].notna()
    & (question_len > 10)
    & (answer_len > 20)
)
df_clean = df_no_dup[is_usable].copy()
print(f" Apr√®s filtrage basique: {len(df_clean)} lignes")

In [None]:
!pip install sentence-transformers datasets pandas scikit-learn numpy groq

In [None]:
## SEMANTIC DEDUPLICATION (BERT)
# Multilingual sentence encoder used to compare questions with each other.
dedup_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
SAMPLE_SIZE = 15000  # kept small for a quick test run; raise it afterwards
df_sample = df_clean.head(SAMPLE_SIZE).copy()

sample_questions = df_sample['question'].astype(str).tolist()
print(f"Calcul embeddings {len(df_sample)} questions...")
q_embeddings = dedup_model.encode(sample_questions, batch_size=128, show_progress_bar=True)

# Pairwise cosine similarity between every pair of sampled questions
# (square matrix, one row/column per sampled question).
similarity_matrix = cosine_similarity(q_embeddings)
print(f"Matrice similarit√©: {similarity_matrix.shape}")

In [None]:
## REMOVE NEAR-DUPLICATE QUESTIONS (COSINE SIMILARITY > 0.95)
SEUIL_SIMILARITE = 0.95

# Greedy "keep the first, drop its later look-alikes" pass.
# The previous version kept a row only when nothing else was similar to it,
# which discarded *every* member of a near-duplicate group instead of keeping
# one representative.
n_rows = len(df_sample)
is_duplicate = np.zeros(n_rows, dtype=bool)
keep_positions = []
for i in range(n_rows):
    if is_duplicate[i]:
        continue  # already covered by an earlier, kept question
    keep_positions.append(i)
    # Only rows after i can still be flagged; earlier rows are already decided.
    is_duplicate[i + 1:] |= similarity_matrix[i, i + 1:] > SEUIL_SIMILARITE

# Index labels of the kept rows (same name and meaning as before).
keep_indices = df_sample.index[keep_positions].tolist()

# Positional selection so that the result does not depend on index uniqueness.
df_dedup = df_sample.iloc[keep_positions].copy()
print(f" Apr√®s d√©duplication: {len(df_dedup)} lignes")
print(f"   ‚Üí {len(df_sample)-len(df_dedup)} supprim√©es")

In [None]:
## GENERIC-ANSWER ("SPAM") FILTER
# Regex fragments typical of boilerplate answers; matched against lowercased text.
GENERIC_PATTERNS = [
    r"contactez-nous", r"visitez notre site", r"cliquez ici",
    r"pour plus d'infos", r"merci de votre message", r"n'h√©sitez pas",
    r"disponible 24/7", r"√©quipe d√©di√©e", r"solution personnalis√©e"
]
def is_spam_response(text: str, min_hits: int = 2) -> bool:
    """Tell whether an answer looks like generic boilerplate.

    Args:
        text: Answer text. Anything that is not a string (None, NaN, ...) is
            treated as "not spam" instead of raising.
        min_hits: Number of distinct GENERIC_PATTERNS entries that must be
            found for the answer to be flagged (default 2).

    Returns:
        True when at least ``min_hits`` patterns occur in the lowercased text.
    """
    if not isinstance(text, str):
        return False
    text_lower = text.lower()
    spam_count = sum(1 for pattern in GENERIC_PATTERNS if re.search(pattern, text_lower))
    return spam_count >= min_hits

# Drop the rows whose answer is flagged as generic boilerplate.
answer_is_spam = df_dedup['answer'].apply(is_spam_response)
df_filtered = df_dedup[~answer_is_spam].copy()
print(f" Apr√®s anti-spam: {len(df_filtered)} lignes")

In [None]:
## FINAL CLEANING
def clean_text_advanced(text: str) -> str:
    """Normalise a question or answer string.

    Steps, in order: remove URLs (``http...`` / ``www...``), remove every
    character that is not a word character, whitespace or one of
    ``? ! . , ; : ' -``, collapse whitespace runs to a single space, then trim.
    Whitespace is normalised last so that the removals cannot leave trailing
    or doubled spaces behind.

    Args:
        text: Raw text. ``None`` and float NaN yield an empty string; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned text.
    """
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    t = str(text)
    t = re.sub(r"http\S+|www\S+", "", t)
    t = re.sub(r"[^\w\s\?\!\.,;:'-]", "", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()
# Add the cleaned text columns to df_filtered.
for raw_col in ('question', 'answer'):
    df_filtered[f'{raw_col}_clean'] = df_filtered[raw_col].apply(clean_text_advanced)

# Pairs that became identical after cleaning are collapsed to one row.
df_ready = df_filtered.drop_duplicates(subset=['question_clean', 'answer_clean'])

print(f"\n DATASET FINAL: {len(df_ready)} paires Q/R PROPRE !")
# Share of the full loaded dataset (df) that is not part of df_ready.
kept_fraction = len(df_ready) / len(df)
print(f" Gain: {100*(1-kept_fraction):.1f}%")

In [None]:
## PIVOT FORMAT (INTERNAL STANDARD)
# Turns df_ready into the pivot layout used by the following cells.

def build_pivot_from_webfaq_fr(df_ready, output_path="faq_pivot_propre.csv"):
    """Build the pivot table from the cleaned dataset.

    The result always has the five columns ``id_question``,
    ``texte_question``, ``texte_reponse``, ``topic_id`` and ``meta``, even
    when ``df_ready`` has no rows.

    Args:
        df_ready: DataFrame with ``question_clean`` and ``answer_clean``
            columns. ``origin``, ``url``, ``topic`` and ``question_type`` are
            copied into ``meta`` when present (``None`` otherwise).
        output_path: CSV file the pivot is written to. Pass ``None`` to skip
            writing. Defaults to ``"faq_pivot_propre.csv"``.

    Returns:
        The pivot DataFrame. ``topic_id`` is -1 for every row (placeholder to
        be replaced by clustering) and ``meta`` holds one dict per row.
    """
    pivot_columns = ["id_question", "texte_question", "texte_reponse", "topic_id", "meta"]
    pivot_rows = []

    print(" Construction format pivot...")
    for i, row in df_ready.iterrows():
        # Traceability metadata; `i` is the index label of the source row.
        meta = {
            "origin": row.get("origin"),
            "url": row.get("url"),
            "topic_source": row.get("topic"),
            "question_type": row.get("question_type"),
            "lang": "fr",
            "source_row": i
        }
        pivot_rows.append({
            "id_question": f"webfaq_fra_clean_{i}",
            "texte_question": row['question_clean'],
            "texte_reponse": row['answer_clean'],
            "topic_id": -1,           # filled in later by clustering
            "meta": meta
        })

    # Explicit columns keep the schema stable when there are no rows.
    pivot_df = pd.DataFrame(pivot_rows, columns=pivot_columns)

    # Intermediate save
    if output_path is not None:
        pivot_df.to_csv(output_path, index=False)

    print(f" FORMAT PIVOT CR√â√â: {len(pivot_df)} paires Q/R")
    print("\n Structure pivot:")
    print(pivot_df[['id_question', 'texte_question', 'texte_reponse', 'topic_id']].head())
    if not pivot_df.empty:
        print("\n Exemple meta:", pivot_df['meta'].iloc[0])

    return pivot_df

# Build the pivot table from the cleaned dataset.
pivot_df = build_pivot_from_webfaq_fr(df_ready)

In [None]:
## BERT THEMATIC CLUSTERING
# Reuse the sentence encoder loaded for deduplication (`dedup_model`) when it
# exists; load it only if neither `model` nor `dedup_model` is available.
# The previous check looked for `model` only, which no earlier cell defines,
# so the encoder was always loaded a second time.
if 'model' not in globals():
    if 'dedup_model' in globals():
        model = dedup_model
    else:
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

print(" Calcul embeddings questions pivot...")
question_embeddings = model.encode(
    pivot_df["texte_question"].tolist(),
    batch_size=64,
    show_progress_bar=True
)

# K-MEANS: group similar questions by theme.
# About 15 questions per theme, capped at 20 themes; at least 1 so that a
# pivot with fewer than 15 rows does not request 0 clusters.
n_clusters = max(1, min(20, len(pivot_df) // 15))
print(f" {n_clusters} th√®mes cr√©√©s...")

km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
topic_labels = km.fit_predict(question_embeddings)

# Store the cluster id of each question in the pivot.
pivot_df["topic_id"] = topic_labels

print(" CLUSTERING TERMIN√â!")
print(" R√©partition th√®mes:",
      pd.Series(topic_labels).value_counts().sort_index().head())

In [None]:
## THEME SIZE OVERVIEW
print("\n TOP 5 TH√àMES PAR VOLUME:")
# Per theme: number of pairs and the first question as an example,
# ordered by size, five largest themes only.
topic_stats = (
    pivot_df.groupby('topic_id')
    .agg(
        nb_paires=('id_question', 'count'),
        texte_question=('texte_question', lambda questions: questions.iloc[0]),
    )
    .sort_values('nb_paires', ascending=False)
    .head()
)

print(topic_stats)
print("\n Pivot pr√™t pour Deepseek !")

In [None]:
## THEME TITLE GENERATION (LLM CALLED THROUGH GROQ)
from groq import Groq
import os

# The key is read from the GROQ_API_KEY environment variable so that it never
# has to be written into the notebook; the placeholder is kept as fallback.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "VOTRE_CLE")
client = Groq(api_key=GROQ_API_KEY)

# NOTE: despite the "DEEPSEEK" naming used throughout, the model id configured
# here is a Llama one.
DEEPSEEK_MODEL = "llama-3.3-70b-versatile"  # or "llama-3.1-70b-versatile"

def call_deepseek(prompt, max_tokens=100):
    """Send a single-turn user prompt to DEEPSEEK_MODEL through the Groq client.

    Args:
        prompt: Text sent as the only user message.
        max_tokens: Upper bound on generated tokens (default 100).

    Returns:
        The content of the first choice, stripped of surrounding whitespace.
        An empty string is returned when the response carries no content.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=DEEPSEEK_MODEL,
        temperature=0.1,  # low temperature for near-deterministic output
        max_tokens=max_tokens
    )
    # NOTE(review): content is presumably Optional in the SDK response type -
    # guard so that .strip() cannot fail on None.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()

print(" G√©n√©ration titres th√®mes avec Deepseek...")

In [None]:
def generate_topic_title(topic_id, sample_questions, max_questions=6):
    """Ask the LLM for a short French title describing one theme.

    Args:
        topic_id: Identifier of the theme; used in the fallback label and in
            the error message.
        sample_questions: Questions belonging to the theme.
        max_questions: How many of the first questions are put in the prompt
            (each truncated to 80 characters).

    Returns:
        The cleaned title: first non-empty line of the reply, without leading
        numbering or surrounding quotes, at most 60 characters. The fallback
        label built from ``topic_id`` is returned when the call raises or when
        nothing is left after cleaning.
    """
    fallback_title = f"Th√®me {topic_id}"
    questions_sample = sample_questions[:max_questions]
    questions_text = "\n".join([f"‚Ä¢ {q[:80]}..." for q in questions_sample])

    prompt = f"""Expert FAQ fran√ßais. √Ä partir de ces questions, proposez UN SEUL titre de th√®me (3-10 mots maximum)
    et les th√®mes ne doivent pas avoir de doublons et nous sommes pas oblig√©s d'avoir 30 th√®mes, s'il y'a plus de th√®mes on arr√™te:

QUESTIONS DU TH√àME :
{questions_text}

TITRE DU TH√àME :"""

    try:
        raw_title = call_deepseek(prompt, max_tokens=30)
        # A title is a single line: keep the first non-empty one only.
        first_line = next((line for line in raw_title.splitlines() if line.strip()), "")
        # Remove leading numbering ("1. ") and surrounding quotes/spaces.
        title = re.sub(r'^\d+\.?\s*', '', first_line.strip()).strip(' "').strip()
        title = title[:60].strip()  # length limit
        # An empty label would be useless downstream; use the fallback instead.
        return title or fallback_title
    except Exception as e:
        # Best effort: one failing call must not abort the whole run.
        print(f" Erreur API topic {topic_id}: {e}")
        return fallback_title

# Generate one title per theme.
topic_titles = {}
unique_topics = sorted(pivot_df["topic_id"].unique())
# Titles already assigned. Each API call is independent, so two themes can
# receive the same title; labels must stay unique because later cells key the
# generated FAQs by title and count questions per topic_label.
used_titles = set()

print(f" {len(unique_topics)} appels API en cours...")
for i, topic_id in enumerate(unique_topics):
    if i % 5 == 0:
        print(f"   Progression: {i+1}/{len(unique_topics)}")

    # Questions of this theme
    topic_questions = pivot_df.loc[pivot_df["topic_id"] == topic_id, "texte_question"].tolist()
    title = generate_topic_title(topic_id, topic_questions)
    if title in used_titles:
        # Disambiguate a repeated title with the theme id.
        title = f"{title} ({topic_id})"
    used_titles.add(title)
    topic_titles[topic_id] = title
    print(f"   Topic {topic_id:2d}: '{title}'")

# Attach the label of each theme to the pivot.
pivot_df["topic_label"] = pivot_df["topic_id"].map(topic_titles)

print("\n TITRES G√âN√âR√âS !")
print("\n TOP 5 TH√àMES:")
# value_counts already carries the size of each theme.
for topic_id, count in pivot_df["topic_id"].value_counts().head(5).items():
    title = topic_titles[topic_id]
    print(f"   {topic_id:2d}: {title} ({count} Q/R)")

In [None]:
# FULL THEME RECAP
print("\n R√âCAPITULATIF COMPLET:")
# Number of Q/A pairs per (topic_id, topic_label), largest themes first.
theme_summary = (
    pivot_df.groupby(['topic_id', 'topic_label'])['id_question']
    .count()
    .rename('nb_paires')
    .reset_index()
    .sort_values('nb_paires', ascending=False)
)
print(theme_summary.head(10).to_string(index=False))

# Intermediate save of the pivot with its theme labels.
pivot_df.to_csv("pivot_avec_themes.csv", index=False)
print("\n Sauvegard√©: pivot_avec_themes.csv")

In [None]:
## FAQ SYNTHESIS PER THEME (LLM)
def synthesize_faq_per_topic(topic_df, topic_title, max_pairs=12):
    """Ask the LLM to write a Markdown FAQ for one theme.

    The prompt contains the ``max_pairs`` pairs with the longest questions
    (question cut to 150 characters, answer to 200) and asks for 3-5 questions.

    Args:
        topic_df: Rows of one theme, with ``texte_question`` and
            ``texte_reponse`` columns. It is not modified.
        topic_title: Title of the theme, used as the Markdown heading.
        max_pairs: Maximum number of source pairs sent in the prompt
            (default 12).

    Returns:
        The generated Markdown, stripped. When the call raises, the error is
        printed and a placeholder made of the heading and an error line is
        returned instead.
    """
    # assign() works on a copy, so the caller's frame gets no helper column.
    ranked = topic_df.assign(
        question_length=topic_df['texte_question'].astype(str).str.len()
    )
    top_pairs = ranked.nlargest(max_pairs, 'question_length')[['texte_question', 'texte_reponse']]

    prompt = f"""Expert FAQ fran√ßais. √Ä partir de ces paires Q/R d'un m√™me th√®me, cr√©ez une FAQ parfaite :

TH√àME : {topic_title}

PAIRES SOURCES :
"""
    for _, row in top_pairs.iterrows():
        prompt += f"Q: {row['texte_question'][:150]}...\nR: {row['texte_reponse'][:200]}...\n\n"

    prompt += """FAQ FINALE (format Markdown strict) :

## """ + topic_title + """

1. **Question claire et concise ?**
   R√©ponse compl√®te, pr√©cise et utile.

2. **Autre question importante ?**
   ...
**Instructions :**
- 3-5 questions maximum
- Questions reformul√©es (plus naturelles)
- R√©ponses synth√©tis√©es (meilleur de toutes les sources)
- Style professionnel, direct
- Fran√ßais impeccable
"""
    try:
        faq_markdown = call_deepseek(prompt, max_tokens=1200)
        return faq_markdown.strip()
    except Exception as e:
        # Best effort, but report the cause (as generate_topic_title does)
        # instead of hiding it behind the placeholder.
        print(f" Erreur API FAQ '{topic_title}': {e}")
        return f"## {topic_title}\n\n Erreur g√©n√©ration FAQ"

## GENERATE THE FAQS OF THE LARGEST THEMES (UP TO 8)
print(" G√©n√©ration 8 meilleures FAQ...")
faq_final = {}
top_topics = pivot_df["topic_id"].value_counts().head(8).index
# There can be fewer than 8 themes; report progress against the real total.
n_top = len(top_topics)
for i, topic_id in enumerate(top_topics):
    print(f"   {i+1}/{n_top}: Th√®me {topic_id}...")
    topic_data = pivot_df[pivot_df["topic_id"] == topic_id]
    title = topic_titles[topic_id]
    faq_text = synthesize_faq_per_topic(topic_data, title)
    faq_final[title] = faq_text

    # Short progress line with the size of the generated FAQ.
    print(f"  '{title}' \u2192 {len(faq_text)} chars")
print("\n SYNTH\u00c8SE TERMIN\u00c9E !")


In [None]:
# Tabular view of the generated FAQs: one row per theme title.
faq_rows = [(topic, content) for topic, content in faq_final.items()]
faq_df = pd.DataFrame(faq_rows, columns=['Topic', 'FAQ_Content'])
print(faq_df.head())

In [None]:
## MULTI-FORMAT EXPORT
# Master FAQ: every generated FAQ in one Markdown file, separated by a rule.
with open("FAQ_WEBFAQ_FRANCAIS.md", "w", encoding="utf-8") as f:
    f.write("# FAQ G√©n√©r√©e Automatiquement - WebFAQ Fran√ßais\n")
    f.write("*(BERT Clustering + Deepseek Synth√®se)*\n\n")

    separator = "\n\n" + "="*80 + "\n\n"
    for content in faq_final.values():
        f.write(content + separator)

print(" FAQ_WEBFAQ_FRANCAIS.md ‚Üê FAQ compl√®te (publication)")

# Final pivot (all rows with their themes).
pivot_df.to_csv("pivot_final_complet.csv", index=False)
print(" pivot_final_complet.csv ‚Üê Toutes donn√©es + th√®mes")

# Summary stats: number of source questions behind each generated FAQ.
# Count every label once instead of re-filtering the pivot per theme.
label_counts = pivot_df["topic_label"].value_counts()
stats = pd.DataFrame({
    'theme': list(faq_final.keys()),
    'nb_questions_original': [int(label_counts.get(t, 0)) for t in faq_final.keys()]
})
stats.to_csv("stats_themes.csv", index=False)

print(" stats_themes.csv ‚Üê M√©triques")

# Console preview. The heading and the slice share one constant so they
# cannot disagree (the heading announced 3 FAQs while the slice took 11).
N_PREVIEW = 3
print(f"\n APER√áU {N_PREVIEW} PREMI√àRES FAQ :")
for content in list(faq_final.values())[:N_PREVIEW]:
    print(f"\n{'='*60}")
    print(content[:4000] + "...")

In [None]:
# Bundle the models, embeddings, config and exported files into one zip and
# move it to Google Drive. Requires a Colab runtime (google.colab) and the
# variables created by the previous cells: model, km, question_embeddings,
# n_clusters, topic_titles, plus the three exported files.
from google.colab import drive
drive.mount('/content/drive')

# Local staging directory and name of the archive built from it.
output_dir = "./generated_artifacts"
zip_file_name = "webfaq_fr_artifacts.zip"

# Create the staging directory if it doesn't exist.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# The encoder saved here is `model`, i.e. the SentenceTransformer used by the
# clustering cell (not necessarily the `dedup_model` object).
model_path = Path(output_dir) / "sentence_transformer_model"
model.save(str(model_path))
print(f"SentenceTransformer model saved to {model_path}")

# Fitted KMeans model.
kmeans_model_path = Path(output_dir) / "kmeans_model.joblib"
joblib.dump(km, str(kmeans_model_path))
print(f"KMeans model saved to {kmeans_model_path}")

# Question embeddings as a .npy array.
embeddings_path = Path(output_dir) / "question_embeddings.npy"
np.save(str(embeddings_path), question_embeddings)
print(f"Question embeddings saved to {embeddings_path}")

# Configuration: number of clusters and the title of each theme.
# Keys are cast to plain Python ints (presumably numpy integers coming from
# the clustering labels - TODO confirm).
config = {
    "n_clusters": n_clusters,
    "topic_titles": {int(k): v for k, v in topic_titles.items()}
}

# The configuration is stored with pickle.
config_path = Path(output_dir) / "config.pkl"
with open(config_path, 'wb') as f:
    pickle.dump(config, f)
print(f"Configuration dictionary saved to {config_path}")

# Copy the exported pivot, FAQ and stats files into the staging directory.
shutil.copy("pivot_final_complet.csv", output_dir)
shutil.copy("FAQ_WEBFAQ_FRANCAIS.md", output_dir)
shutil.copy("stats_themes.csv", output_dir)
print("CSV and Markdown files copied to temporary directory.")

# Zip the staging directory; make_archive appends ".zip" to the base name.
shutil.make_archive(zip_file_name.replace('.zip', ''), 'zip', output_dir)
print(f"Temporary directory zipped to {zip_file_name}")

# Move the archive to the root of the mounted Drive.
drive_path = Path('/content/drive/MyDrive') / zip_file_name
shutil.move(zip_file_name, str(drive_path))
print(f"Zip file moved to Google Drive at {drive_path}")

# Remove the staging directory and all its contents.
shutil.rmtree(output_dir)
print(f"Temporary directory {output_dir} removed.")
print("All artifacts saved and cleaned up successfully!")