## 1. Installation des bibliothèques

In [7]:
!pip install -qU langchain sentence_transformers langchain_community langchain-huggingface faiss-cpu kagglehub tiktoken transformers sentencepiece langchain-google-genai

## 2. Téléchargement et chargement du dataset

In [2]:
import kagglehub

# Téléchargement du dataset
path = kagglehub.dataset_download("cryptexcode/mpst-movie-plot-synopses-with-tags")
print("Path to dataset files:", path)

import pandas as pd

# Chargement du CSV
try:
    df = pd.read_csv(path + '/mpst_full_data.csv')
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {path + '/mpst_full_data.csv'}. Check the path or filename.")
except Exception as e:
    print(f"An error occurred: {e}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/cryptexcode/mpst-movie-plot-synopses-with-tags?dataset_version_number=1...


100%|██████████| 28.8M/28.8M [00:00<00:00, 47.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/cryptexcode/mpst-movie-plot-synopses-with-tags/versions/1
     imdb_id                                          title  \
0  tt0057603                        I tre volti della paura   
1  tt1733125  Dungeons & Dragons: The Book of Vile Darkness   
2  tt0033045                     The Shop Around the Corner   
3  tt0113862                             Mr. Holland's Opus   
4  tt0086250                                       Scarface   

                                       plot_synopsis  \
0  Note: this synopsis is for the orginal Italian...   
1  Two thousand years ago, Nhagruul the Foul, a s...   
2  Matuschek's, a gift store in Budapest, is the ...   
3  Glenn Holland, not a morning person by anyone'...   
4  In May 1980, a Cuban man named Tony Montana (A...   

                                                tags  split synopsis_source  
0          cult, horror, gothic, murder, atmospheric  train            imdb  
1              

## 3. Nettoyage et préparation

In [3]:
print(df.columns)
if 'plot_synopsis' in df.columns:
    df.rename(columns={'plot_synopsis': 'text'}, inplace=True)

df.dropna(subset=['text'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Index(['imdb_id', 'title', 'plot_synopsis', 'tags', 'split',
       'synopsis_source'],
      dtype='object')


Unnamed: 0,imdb_id,title,text,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


# 4. Création d'embeddings et Vector Store

In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)

corpus = df['text'].tolist()
embeddings = embedding_model.encode(corpus, show_progress_bar=True)

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(embeddings, dtype=np.float32))
print("Nombre de vecteurs dans l'index:", index.ntotal)

Batches:   0%|          | 0/464 [00:00<?, ?it/s]

Nombre de vecteurs dans l'index: 14828


### 4.1 Utilisation de LangChain pour gérer le VectorStore

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

documents = []
for i, row in df.iterrows():
    doc = Document(
        page_content=row['text'],
        metadata={'title': row['movie_title'] if 'movie_title' in row else f'Movie_{i}'}
    )
    documents.append(doc)

embedding_fn = HuggingFaceEmbeddings(model_name=model_name)
vectorstore = FAISS.from_documents(documents, embedding_fn)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

## 5. Implémentation du Reranking (Cross-Encoder)

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

cross_encoder_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_name)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_name)

def rerank(query, documents):
    pairs = [(query, doc.page_content) for doc in documents]
    inputs = tokenizer(
        pairs,
        padding=True, truncation=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        scores = cross_encoder(**inputs).logits.squeeze()
    doc_scores = list(zip(documents, scores.tolist()))
    doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)
    return doc_scores

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

### 5.1 Test du Reranking

In [10]:
query_test = "A story about a group of friends who go on a road trip."
candidate_docs = retriever.get_relevant_documents(query_test)

print("---- Documents avant Reranking ----")
for i, doc in enumerate(candidate_docs):
    print(f"{i+1}. {doc.metadata} -> {doc.page_content[:100]}...")

reranked = rerank(query_test, candidate_docs)
print("\n---- Documents après Reranking ----")
for i, (doc, score) in enumerate(reranked):
    print(f"{i+1}. Score={score:.4f} | {doc.metadata} -> {doc.page_content[:100]}...")

  candidate_docs = retriever.get_relevant_documents(query_test)


---- Documents avant Reranking ----
1. {'title': 'Movie_3950'} -> A group of friends is on vacation. Michael (Joseph Cross) and Lyla (Briana Evigan) are a couple, as ...
2. {'title': 'Movie_4835'} -> It begins with a TV newsman talking about Flight 180 and its survivors and who got killed. Only one ...
3. {'title': 'Movie_8849'} -> Three Americans from New York arrive in Tangier in 1947. Port Moresby (John Malkovich) and his wife ...
4. {'title': 'Movie_2391'} -> A family is driving down a desert highway when they hit a deer. The dad gets out and comes back with...
5. {'title': 'Movie_14503'} -> A family is driving down a desert highway when they hit a deer. The dad gets out and comes back with...
6. {'title': 'Movie_4442'} -> Five college students leave New York City for a weekend in the country, and 48 hours later they vani...
7. {'title': 'Movie_13174'} -> Austin, Lawson, Michael, and Will are four college-aged Christians who have grown up in the bubble o...
8. {'title': 'Movie_1208

## 6. Chaîne RAG avec Reranking

In [None]:
import os
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or input("Veuillez saisir votre clé Google : ")

In [19]:
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import List, Callable
from langchain.schema import Document as LCDocument
from langchain.schema import BaseRetriever

# ---- 2) Instanciation du modèle Gemini ----
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.0
)

# ---- 3) Votre RerankingRetriever défini précédemment ----
class RerankingRetriever(BaseRetriever):
    """
    Un retriever qui encapsule un retriever de base + un rerank_fn
    """
    def __init__(
        self,
        base_retriever: BaseRetriever,
        rerank_fn: Callable,
        k: int = 5
    ):
        """
        On stocke les paramètres dans des attributs "privés" pour éviter
        le conflit avec Pydantic (champ inconnu).
        """
        super().__init__()
        self._base_retriever = base_retriever
        self._rerank_fn = rerank_fn
        self._k = k

    def _get_relevant_documents(self, query: str) -> List[LCDocument]:
        docs = self._base_retriever.get_relevant_documents(query)
        reranked = self._rerank_fn(query, docs)
        top_k_docs = [doc_score[0] for doc_score in reranked[: self._k]]
        return top_k_docs

    async def _aget_relevant_documents(self, query: str) -> List[LCDocument]:
        raise NotImplementedError

reranking_retriever = RerankingRetriever(
    base_retriever=retriever,  # votre retriever FAISS/LangChain existant
    rerank_fn=rerank,          # votre fonction Cross-Encoder (déjà définie)
    k=5
)

# ---- 4) Construction de la chaîne RAG avec le modèle Gemini ----
qa_chain_rerank = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=reranking_retriever
)

# ---- 5) Test de la chaîne ----
query_rerank = "what movies is with magician ?"
result_rerank = qa_chain_rerank.invoke(query_rerank)
print("Réponse du LLM (avec Reranking) :", result_rerank)

Réponse du LLM (avec Reranking) : {'query': 'what movies is with magician ?', 'result': 'The provided text contains information about the following movies with magicians:\n\n* **The Illusionist:**  A film about a magician who falls in love with a woman betrothed to a ruthless crown prince.\n* **The Prestige:** A story of two rival magicians in 19th century England, focusing on their rivalry and the lengths they go to for the perfect illusion.\n* **The Great Houdini:** A film about a magician who incorporates ventriloquism into his act and whose career takes off, but whose life takes a dark turn.\n* **Now You See Me:** A movie about a group of illusionists known as The Horsemen who pull off elaborate heists during their shows.\n* **Oz the Great and Powerful:** A film about an ambitious illusionist who is swept away to the land of Oz and becomes the Wizard.'}
