In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset/IMDB Dataset.csv
/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movies.csv
/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movie_reviews.csv


**Ejercicio 1: Introducción a Recuperación de Información**

**Objetivo de la práctica**
* Entender el problema de buscar información en colecciones de texto.
* Comprender por qué se necesita un índice invertido en recuperación de información.
* Programar una primera solución manual y luego optimizarla con un índice.
* Evaluar la mejora en tiempos de búsqueda cuando usamos estructuras adecuadas.

**Parte 1: Búsqueda lineal en documentos**
**Actividad**

1. Se te proporcionará un dataset con reviews de películas.

2. Escribe una función que:

* Lea todos los documentos.
* Busque una palabra ingresada por el usuario.
* Muestre en qué documentos aparece la palabra.

In [10]:
import pandas as pd

1.- leer la información del csv del dataset

In [13]:

df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


2.- Definir la función

In [14]:
def hasquery(query, string):
    #Esta función devuelve true si el string indicado como parámetro contiene la palabra como _query_
    return query in string

Busque una palabra ingresada por el ususario

In [15]:
query = 'bad'
df['review'].apply(lambda x: hasquery(query, x))

0        False
1        False
2        False
3        False
4        False
         ...  
49995    False
49996     True
49997     True
49998    False
49999    False
Name: review, Length: 50000, dtype: bool

Muestre en qué documentos está la palabra

In [16]:
df[df['review'].apply(lambda x: hasquery(query, x))]

Unnamed: 0,review,sentiment
7,"This show was an amazing, fresh & innovative i...",negative
12,So im not a big fan of Boll's work but then ag...,negative
14,This a fantastic movie of three prisoners who ...,positive
15,"Kind of drawn in by the erotic scenes, only to...",negative
16,Some films just simply should not be remade. T...,positive
...,...,...
49982,"To be hones, I used to like this show and watc...",negative
49991,"Les Visiteurs, the first movie about the medie...",negative
49994,This is your typical junk comedy.<br /><br />T...,negative
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative


**Parte 2: Construcción de un índice invertido**

**Actividad**

1. Escribe un programa que:

    * Recorra todos los documentos.

    * Construya un índice invertido, es decir, un diccionario donde:

      * Cada palabra clave apunta a una lista de documentos donde aparece.

2. Escribe una nueva función de búsqueda que:

    * Consulte directamente el índice para encontrar los documentos relevantes.

    * Sea mucho más rápida que la búsqueda lineal.

Definimos docs

In [18]:
docs = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')
docs

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


Definimos la función

In [19]:
def buscar(docs, query):
    mask = docs.str.contains(query)
    result = docs[mask]
    return result

Probamos con un query

In [24]:
buscar(docs['review'], 'wonderful')

1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
29       'War movie' is a Hollywood genre that has been...
41       This movie is based on the book, "A Many Splen...
59       I just watched The Dresser this evening, havin...
                               ...                        
49852    Russ and Valerie are having discussions about ...
49921    Antonio Margheriti's "Danza Macabra" aka. "Cas...
49935    "Nurse Betty" is the kind of movie you can't d...
49938    I made a big mistake going to see this film. T...
49941    Why did the histories of Mary and Rhoda have t...
Name: review, Length: 3083, dtype: object

Vamos a realizar un preprocesado del texto

In [41]:
import re

# Función para preprocesar el texto
def preprocess(text):
    # Convierte el texto a minúsculas y extrae solo las palabras
    return re.findall(r'\b\w+\b', text.lower())



Ahora se construye el índice invertido:

In [61]:
from collections import defaultdict

def construir_indice_invertido(docs):
    # Diccionario que mapea palabra -> {documento: [posiciones]}
    indice_invertido = defaultdict(lambda: defaultdict(list))
    
    # Recorremos cada documento en la columna 'review'
    for idx, review in enumerate(docs['review']):
        palabras = preprocess(review)
        
        # Para cada palabra, obtenemos las posiciones en el documento
        for pos, palabra in enumerate(palabras):
            indice_invertido[palabra][idx].append(pos)
    
    return indice_invertido

Finalmente la función de búsqueda

In [62]:
def buscar_con_indice(docs, query, indice_invertido):
    # Preprocesamos la consulta (convertirla en una lista de palabras)
    query_palabras = preprocess(query)
    
    # Diccionario que almacenará las posiciones de cada palabra en todos los documentos
    resultados = defaultdict(list)
    
    # Buscamos las palabras en el índice invertido y agregamos sus posiciones
    for palabra in query_palabras:
        if palabra in indice_invertido:
            for posiciones in indice_invertido[palabra].values():
                resultados[palabra].extend(posiciones)
    
    # Ordenar las posiciones de cada palabra (opcional, para mejor presentación)
    for palabra in resultados:
        resultados[palabra] = sorted(set(resultados[palabra]))  # Eliminar duplicados y ordenar
    
    # Convertir los resultados en un DataFrame con la información de las posiciones
    resultados_df = []
    for palabra, posiciones in resultados.items():
        resultados_df.append({
            'palabra': palabra,
            'posiciones': posiciones
        })
    
    return pd.DataFrame(resultados_df)

# Construir el índice invertido
indice_invertido = construir_indice_invertido(docs)

In [63]:
# Consulta a buscar
query = "wonderful"

# Buscar en los documentos usando el índice invertido
resultados = buscar_con_indice(docs, query, indice_invertido)

# Mostrar los primeros resultados
print(resultados.head())


     palabra                                         posiciones
0  wonderful  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...


**Parte 3: Evaluación de tiempos de búsqueda**

**Actividad**

1. Realiza la búsqueda de varias palabras usando:

    * Corpus pequeño.

    * Corpus grande.


2. Mide el tiempo de ejecución:

    * Para búsqueda lineal.

    * Para búsqueda usando índice invertido.

    * Grafica o presenta los resultados en una tabla comparativa.

In [36]:
df_large = pd.read_csv('/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movie_reviews.csv')
df_large

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
2,city_hunter_shinjuku_private_eyes,2590987,2019-05-28,Reuben Baron,False,,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,https://www.cbr.com/city-hunter-shinjuku-priva...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
4,dangerous_men_2015,2504681,2018-08-29,Pat Padua,False,,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,http://dcist.com/2015/11/out_of_frame_dangerou...
...,...,...,...,...,...,...,...,...,...,...,...
1444958,thor_love_and_thunder,102706151,2022-07-05,Christie Cronan,False,7/10,fresh,Raising Whasians,Solid but not totally sold&#44; Thor&#58; Ragn...,POSITIVE,https://raisingwhasians.com/thor-love-and-thun...
1444959,thor_love_and_thunder,102706150,2022-07-05,Ian Sandwell,False,4/5,fresh,Digital Spy,Thor&#58; Love and Thunder is the most enterta...,POSITIVE,https://www.digitalspy.com/movies/a40496050/th...
1444960,thor_love_and_thunder,102706149,2022-07-05,Lauren LaMagna,False,8/10,fresh,Next Best Picture,&quot;Thor&#58; Love and Thunder&quot; is a st...,POSITIVE,https://www.nextbestpicture.com/thor-love-and-...
1444961,thor_love_and_thunder,102706148,2022-07-05,Jake Cole,True,1/4,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,https://www.slantmagazine.com/film/thor-love-a...
