In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rotten-tomatoes-movies-and-critic-reviews-dataset/rotten_tomatoes_movies.csv
/kaggle/input/rotten-tomatoes-movies-and-critic-reviews-dataset/rotten_tomatoes_critic_reviews.csv


### 1. Importar librerías y preprocesamiento

In [32]:
import pandas as pd
import numpy as np
import re
import nltk
import unicodedata
from collections import Counter
from nltk.stem import porter, wordnet
from nltk.corpus import stopwords

# ----------------------------------------------------------------------
# 0. Inicialización de NLTK
# ----------------------------------------------------------------------
try:
    stemmer = porter.PorterStemmer()
    lemmatizer = wordnet.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
except LookupError:
    print("Recursos de NLTK no encontrados localmente")
    stemmer = porter.PorterStemmer()
    lemmatizer = None
    stop_words = set()

# ----------------------------------------------------------------------
# 1. Funciones de preprocesamiento 
# ----------------------------------------------------------------------
def clean_and_normalize(doc):
    doc = unicodedata.normalize('NFKD', doc).encode('ascii','ignore').decode('utf-8')
    doc = doc.lower()
    doc = re.sub(r'[^a-z\s]', ' ', doc)
    doc = re.sub(r'\s+', ' ', doc).strip()
    return doc

def tokenize_unique(doc):
    tokens = doc.split()
    return list(set(tokens))

def remove_stopwords(tokens):
    return [w for w in tokens if w not in stop_words]

def apply_stemming(tokens):
    return [stemmer.stem(w) for w in tokens]

def apply_lemmatization(tokens):
    if lemmatizer:
        return [lemmatizer.lemmatize(w) for w in tokens]
    else:
        return tokens

def filter_tokens_by_rules(tokens):
    valid_tokens = [
        tok for tok in tokens
        if 2 <= len(tok) <= 20 and
        not re.search(r'(.)\1{2,}', tok) and
        sum(c in 'aeiou' for c in tok) > 0 and
        not re.search(r'[^aeiou]{5,}', tok)
    ]
    return valid_tokens

def preprocess_document(doc, apply_stem=False, apply_lemma=False):
    cleaned = clean_and_normalize(doc)
    tokens = tokenize_unique(cleaned)
    tokens = filter_tokens_by_rules(tokens)
    tokens = remove_stopwords(tokens)
    if apply_stem and not apply_lemma:
        tokens = apply_stemming(tokens)
    elif apply_lemma and not apply_stem:
        tokens = apply_lemmatization(tokens)
    return ' '.join(tokens)

def build_vocabulary(docs_list):
    return sorted(set(word for doc in docs_list for word in doc.split()))


### 2. Carga y limpieza de datos

In [33]:
# Cargar datasets
movies = pd.read_csv("/kaggle/input/rotten-tomatoes-movies-and-critic-reviews-dataset/rotten_tomatoes_movies.csv")
reviews = pd.read_csv("/kaggle/input/rotten-tomatoes-movies-and-critic-reviews-dataset/rotten_tomatoes_critic_reviews.csv")

# Seleccionar columnas importantes
movies_small = movies[['rotten_tomatoes_link','movie_title','genres','movie_info','critics_consensus']]

# Reparar la columna review_content antes de agrupar
reviews["review_content"] = reviews["review_content"].astype(str).fillna("")

# Agrupar reviews por película
reviews_grouped = reviews.groupby("rotten_tomatoes_link")["review_content"].apply(lambda x: " ".join(x)).reset_index()

# Merge de películas y reviews
data = movies_small.merge(reviews_grouped, on="rotten_tomatoes_link", how="left")

# Crear un texto completo de cada película
data["raw_text"] = (
    data["movie_title"].astype(str) + " " +
    data["genres"].astype(str) + " " +
    data["movie_info"].astype(str) + " " +
    data["critics_consensus"].astype(str) + " " +
    data["review_content"].astype(str)
)

data.head()


Unnamed: 0,rotten_tomatoes_link,movie_title,genres,movie_info,critics_consensus,review_content,raw_text
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Action & Adventure, Comedy, Drama, Science Fic...","Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,A fantasy adventure that fuses Greek mythology...,Percy Jackson & the Olympians: The Lightning T...
1,m/0878835,Please Give,Comedy,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,"Like Holofcener's previous pictures, Please Gi...",Please Give Comedy Kate (Catherine Keener) and...
2,m/10,10,"Comedy, Romance","A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,10 (1979) is known for its numerical rating sy...,"10 Comedy, Romance A successful, middle-aged H..."
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),"Classics, Drama",Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,"A film with texture, humour and relevance at a...","12 Angry Men (Twelve Angry Men) Classics, Dram..."
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","Action & Adventure, Drama, Kids & Family","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",[The] embodiment of Disney at his best -- fami...,"20,000 Leagues Under The Sea Action & Adventur..."


### 3. Preprocesamiento de todo el texto

In [34]:
# Aplicar el pipeline de preprocesamiento
data["processed_text"] = data["raw_text"].apply(lambda x: preprocess_document(x, apply_stem=True))

# Construir vocabulario
vocab = build_vocabulary(data["processed_text"].tolist())
print(f"Vocabulario tamaño: {len(vocab)}")


Vocabulario tamaño: 87240


### 4. Construcción de índice TF-IDF 

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizador usando tu preprocesamiento
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data["processed_text"])

# Diccionario de palabras -> índice
feature_names = vectorizer.get_feature_names_out()


### 5. Función de búsqueda por consulta

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

def search_movies(query, top_n=5):
    # Preprocesar consulta
    query_proc = preprocess_document(query, apply_stem=True)
    
    # Transformar consulta a TF-IDF
    query_vec = vectorizer.transform([query_proc])
    
    # Calcular similitud coseno
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Obtener índices de las top N películas
    top_indices = sim_scores.argsort()[::-1][:top_n]
    
    # Mostrar resultados
    results = data.iloc[top_indices][["movie_title","genres","critics_consensus"]].copy()
    results["score"] = sim_scores[top_indices]
    return results


### 6. Simulación de consultas

In [38]:
print("Consulta: Películas sobre viajes espaciales")
display(search_movies("Películas sobre viajes espaciales", top_n=5))

print("Consulta: Películas para ver en familia")
display(search_movies("Películas para ver en familia", top_n=5))


Consulta: Películas sobre viajes espaciales


Unnamed: 0,movie_title,genres,critics_consensus,score
1041,A Little Princess,"Drama, Kids & Family",Alfonso Cuarón adapts Frances Hodgson Burnett'...,0.08789
15494,The Motorcycle Diaries,Drama,The Motorcycle Diaries is heartfelt and profou...,0.082245
6948,Get Real,"Comedy, Drama, Gay & Lesbian",An authentic portrayal of homosexuality in hig...,0.074415
10897,The Namesake,Drama,An ambitious exploration of the immigrant expe...,0.070396
16154,Thirteen,Drama,"An emotionally wrenching, not to mention terri...",0.068111


Consulta: Películas para ver en familia


Unnamed: 0,movie_title,genres,critics_consensus,score
14774,The Cup,"Art House & International, Comedy, Drama",,0.119695
12804,Sagrada: The Mystery of Creation,"Documentary, Special Interest",,0.08883
2009,50 First Dates,Comedy,Gross-out humor overwhelms the easy chemistry ...,0.073991
8687,Jimmy Neutron - Boy Genius,"Animation, Kids & Family, Science Fiction & Fa...",What Jimmy Neutron lacks in computer animation...,0.068779
16146,Things You Can Tell Just by Looking at Her,"Comedy, Drama",,0.065136


### 7. Análisis de resultados

* Los resultados son los esperados para las consultas realizadas (que devuelva los índices de las películas), aunque visualmente no son tan precisas, pero para garantizar la funcionalidad se debe profundizar más en cómo realizar las métricas.
* Éste sistema se lo ha diseñado para realizar una búsqueda simple, a partir de TF-IDF, pero para mejorar o complementar de mejor manera los resultados, se podría añadir métricas de Precisión@k, Recall, F1-Score y MAP.
  
Las cuales intentaré implementar a continuación:

In [39]:
def precision_at_k(query, k=5):
    results = search_movies(query, top_n=k)
    relevant_set = set(relevant_movies[query])
    retrieved_set = set(results["movie_title"].tolist())
    precision = len(retrieved_set & relevant_set) / k
    return precision


Como para Precisión@k, Recall@k, F1, etc., necesitamos un conjunto de relevancia conocido:

In [41]:
# Diccionario de películas relevantes por consulta
relevant_movies = {
    "Películas sobre viajes espaciales": [
        "Interstellar",
        "Gravity",
        "The Martian",
        "2001: A Space Odyssey",
        "Star Wars"
    ],
    "Películas para ver en familia": [
        "The Lion King",
        "Finding Nemo",
        "Toy Story",
        "Paddington",
        "Frozen"
    ]
}


In [42]:
precision_at_k("Películas sobre viajes espaciales", k=5)


0.0

In [43]:
def recall_at_k(query, k=5):
    results = search_movies(query, top_n=k)
    relevant_set = set(relevant_movies[query])
    retrieved_set = set(results["movie_title"].tolist())
    recall = len(retrieved_set & relevant_set) / len(relevant_set)
    return recall


In [47]:
recall_espaciales = recall_at_k("Películas sobre viajes espaciales", k=5)
print(f"Recall@5 (viajes espaciales): {recall_espaciales:.2f}")

Recall@5 (viajes espaciales): 0.00


In [44]:
def f1_score_at_k(query, k=5):
    p = precision_at_k(query, k)
    r = recall_at_k(query, k)
    if p + r == 0:
        return 0.0
    return 2 * (p * r) / (p + r)


In [48]:
f1_espaciales = f1_score_at_k("Películas sobre viajes espaciales", k=5)
print(f"F1-Score@5 (viajes espaciales): {f1_espaciales:.2f}")

F1-Score@5 (viajes espaciales): 0.00


In [45]:
def average_precision(query, k=5):
    results = search_movies(query, top_n=k)
    relevant_set = set(relevant_movies[query])
    hits = 0
    sum_precisions = 0.0
    for i, movie in enumerate(results["movie_title"].tolist(), start=1):
        if movie in relevant_set:
            hits += 1
            sum_precisions += hits / i
    if hits == 0:
        return 0.0
    return sum_precisions / len(relevant_set)

def mean_average_precision(queries, k=5):
    return sum(average_precision(q, k) for q in queries) / len(queries)


In [49]:
ap_espaciales = average_precision("Películas sobre viajes espaciales", k=5)
print(f"Average Precision@5 (viajes espaciales): {ap_espaciales:.2f}")

Average Precision@5 (viajes espaciales): 0.00


In [46]:
queries = ["Películas sobre viajes espaciales", "Películas para ver en familia"]
map_score = mean_average_precision(queries, k=5)
print(f"MAP@5: {map_score:.3f}")


MAP@5: 0.000


Ahora si, viendo los resultados de las métricas, se concluye que los resultados devueltos no son los esperados para un sistema de recuperación de la información.
Se tiene que analizar el preprocesamiento y volver a analizar la similitud coseno para mejorar los resultados del modelo