# DESARROLLO SISTEMA DE RECOMENDACIONES

___
1. Preprocesamiento de datos
___

In [212]:
# Importamos las librerías necesarias
# RAKE = Rapid Automatic Keyword Extraction
# nltk = Natural Language Toolkit
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [213]:
# nltk rake necesitará un diccionario de palabras vacías, "stopwords" para suprimir palabras que no aportan nada al modelo.
# también necesitamos el tokenizador de oraciones 'punkt' que ayudará a eliminar signos de puntuación, espacios en blanco y otros caracteres
# no deseados.. y 'wordnet' para usar 'WordNetLemmatizer'
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raftx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raftx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\raftx\AppData\Roaming\nltk_data...


True

In [186]:
# Cargamos el set de datos limpios resultante del proceso ETL anterior
df=pd.read_csv('./data/datos_limpios.csv')

In [314]:
df['directed_by']

0           ['John Lasseter']
1            ['Joe Johnston']
2           ['Howard Deutch']
3         ['Forest Whitaker']
4           ['Charles Shyer']
                 ...         
41299            ['Ben Rock']
41300       ['Aaron Osborne']
41301            ['Lav Diaz']
41302    ['Yakov Protazanov']
41303       ['Daisy Asquith']
Name: directed_by, Length: 41304, dtype: object

In [188]:
df.shape

(45348, 22)

In [189]:
df.head()

Unnamed: 0,index,id,budget,original_language,overview,popularity,release_date,revenue,runtime,spoken_languages,...,title,vote_average,vote_count,return,release_year,directed_by,franchise,produced_by,produced_in,genres_clean
0,0,862,30000000,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",...,Toy Story,7.7,5415.0,12.451801,1995,['John Lasseter'],Toy Story Collection,['Pixar Animation Studios'],['United States of America'],['Animation' 'Comedy' 'Family']
1,1,8844,65000000,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",...,Jumanji,6.9,2413.0,4.043035,1995,['Joe Johnston'],,['TriStar Pictures' 'Teitler Film' 'Interscope...,['United States of America'],['Adventure' 'Fantasy' 'Family']
2,2,15602,0,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",...,Grumpier Old Men,6.5,92.0,0.0,1995,['Howard Deutch'],Grumpy Old Men Collection,['Warner Bros.' 'Lancaster Gate'],['United States of America'],['Romance' 'Comedy']
3,3,31357,16000000,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",...,Waiting to Exhale,6.1,34.0,5.09076,1995,['Forest Whitaker'],,['Twentieth Century Fox Film Corporation'],['United States of America'],['Comedy' 'Drama' 'Romance']
4,4,11862,0,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",...,Father of the Bride Part II,5.7,173.0,0.0,1995,['Charles Shyer'],Father of the Bride Collection,['Sandollar Productions' 'Touchstone Pictures'],['United States of America'],['Comedy']


Antes de continuar, vamos a comprobar si quedan valores nulos o faltantes

In [197]:
def check_nan_values(dataframe_name):
    df = globals()[dataframe_name]          # Obtener el dataframe usando su nombre
    nan_counts = {}                         # Diccionario para almacenar el recuento de NaN por columna
    for column in df.columns:
        nan_count = df[column].isna().sum() # Contar los valores NaN en la columna
        nan_counts[column] = nan_count
    return nan_counts

In [199]:
check_nan_values('df')

{'index': 0,
 'id': 0,
 'budget': 0,
 'original_language': 11,
 'overview': 941,
 'popularity': 0,
 'release_date': 0,
 'revenue': 0,
 'runtime': 246,
 'spoken_languages': 0,
 'status': 80,
 'tagline': 24960,
 'title': 0,
 'vote_average': 0,
 'vote_count': 0,
 'return': 0,
 'release_year': 0,
 'directed_by': 0,
 'franchise': 40862,
 'produced_by': 0,
 'produced_in': 0,
 'genres_clean': 0}

In [200]:
# Para entrenar el modelo voy a usar las columnas 'title', 'overview' y 'genres_clean'.
# ya que en 'tagline' hay muchas vacías, no la voy a tener en cuenta.
# Hemos visto que la columna 'overview' aún tiene valores vacíos o NaN en 941 filas
# Eliminamos esas filas
df.dropna(subset=['overview'], inplace=True)

In [202]:
# Eliminamos también los títulos duplicados para aligerar el modelo y tener una fila por título
df.drop_duplicates(subset=['title'],inplace=True)
df=df.reset_index(drop=True)
df.shape

(41304, 22)

In [203]:
# Exportamos a CSV el dataset en el estado actual.
# df.to_csv('./data/datos_ligeros.csv')

In [205]:
check_nan_values('df')

{'index': 0,
 'id': 0,
 'budget': 0,
 'original_language': 10,
 'overview': 0,
 'popularity': 0,
 'release_date': 0,
 'revenue': 0,
 'runtime': 0,
 'spoken_languages': 0,
 'status': 61,
 'tagline': 22269,
 'title': 0,
 'vote_average': 0,
 'vote_count': 0,
 'return': 0,
 'release_year': 0,
 'directed_by': 0,
 'franchise': 37004,
 'produced_by': 0,
 'produced_in': 0,
 'genres_clean': 0}

### creamos un dataset para el modelo ML que contenga solo las columnas necesarias

In [206]:
df_modelo=df[['title','overview','genres_clean']]

In [207]:
df_modelo.tail()

Unnamed: 0,title,overview,genres_clean
41299,The Burkittsville 7,A film archivist revisits the story of Rustin ...,['Horror']
41300,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,['Science Fiction']
41301,Century of Birthing,An artist struggles to finish his work while a...,['Drama']
41302,Satan Triumphant,"In a small town live two brothers, one a minis...",[]
41303,Queerama,50 years after decriminalisation of homosexual...,[]


In [211]:
valor_fila_41303 = df_modelo.loc[41303, 'genres_clean']
filas_coincidentes = df_modelo[df_modelo['genres_clean'] == valor_fila_41303]
print("Número de filas coincidentes:", len(filas_coincidentes))
print("Filas coincidentes:")
print(filas_coincidentes)

Número de filas coincidentes: 1984
Filas coincidentes:
                                  title  \
54              Kids of the Round Table   
82          Last Summer in the Hamptons   
125                      Jupiter's Wife   
136                              Target   
388                        Desert Winds   
...                                 ...   
41290           The Untameable Whiskers   
41291  The Imperceptable Transmutations   
41296         St. Michael Had a Rooster   
41302                  Satan Triumphant   
41303                          Queerama   

                                                overview genres_clean  
54     Set in modern times, Alex finds King Arthur's ...           []  
82     Filmed entirely on location in East Hampton, L...           []  
125    Michel Negroponte, a documentary filmmaker, me...           []  
136     A subtle yet violent commentary on feudal lords.           []  
388    Jackie and Eugene are joined by a mystical win...           [

Con las últimas comprobaciones nos damos cuenta que 'genres_clean' con 1984 filas vacías tampoco nos va a aportar mucho para el modelo

___
2. Creación de keywords y scores
___

In [None]:
# Usaremos algoritmo de extracción de palabras clave Rake (Rapid Automatic Keyword Extraction) del modulo rake_ntlp
# de esta manera se aligeran los datos para el modelo

# Creamos una instancia de Rake
r = Rake()

# Leer de nuevo los datos seleccionando solo las columnas 'title' y 'overview'
data = df_modelo[['title', 'overview']]

# Crear listas para almacenar los keywords y sus puntuajes de relevancia
keywords_list = []
scores_list = []

# Iterar sobre cada fila del DataFrame
for index, row in data.iterrows():
    # Obtener el título y la descripción
    title = row['title']
    overview = row['overview']
     # Concatenar el título y la descripción en un solo texto
    text = title + ' ' + overview
     # Tokenizar el texto en palabras
    tokens = word_tokenize(text)
     # Filtrar las stopwords en inglés
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
     # Lematizar las palabras para obtener su forma base
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
     # Convertir la lista de tokens en un texto nuevamente
    processed_text = ' '.join(lemmatized_tokens)
     # Extraer los keywords y sus puntuajes de relevancia utilizando Rake
    r.extract_keywords_from_text(processed_text)
    keywords_scores = r.get_word_degrees()
     # Agregar los keywords y sus puntuajes a las listas correspondientes
    keywords_list.append(list(keywords_scores.keys()))
    scores_list.append(list(keywords_scores.values()))

# Agregar las listas de keywords y puntuajes al DataFrame original
data['keywords'] = keywords_list
data['scores'] = scores_list



In [217]:
# Comprobamos la existencia de keywords y scores
print(data[['title', 'keywords', 'scores']])

                             title  \
0                        Toy Story   
1                          Jumanji   
2                 Grumpier Old Men   
3                Waiting to Exhale   
4      Father of the Bride Part II   
...                            ...   
41299          The Burkittsville 7   
41300              Caged Heat 3000   
41301          Century of Birthing   
41302             Satan Triumphant   
41303                     Queerama   

                                                keywords  \
0      [toy, story, led, woody, andy, live, happily, ...   
1      [jumanji, sibling, judy, peter, discover, ench...   
2      [grumpier, old, men, family, wedding, reignite...   
3      [waiting, exhale, cheated, mistreated, stepped...   
4      [father, bride, part, ii, george, banks, recov...   
...                                                  ...   
41299  [burkittsville, 7, film, archivist, revisits, ...   
41300  [caged, heat, 3000, year, ad, world, dangerous...   
413

In [216]:
data.columns

Index(['title', 'overview', 'keywords', 'scores'], dtype='object')

In [218]:
# Ya podemos deshacernos de la columna 'overview'
data.drop(columns=['overview'],inplace=True)

In [223]:
data.head()

Unnamed: 0,title,keywords,scores
0,Toy Story,toy story led woody andy live happily room bir...,"[9, 4, 4, 12, 10, 5, 5, 5, 6, 6, 14, 6, 6, 6, ..."
1,Jumanji,jumanji sibling judy peter discover enchanted ...,"[12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 1..."
2,Grumpier Old Men,grumpier old men family wedding reignites anci...,"[9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 6, 6, 6, 6, 13,..."
3,Waiting to Exhale,waiting exhale cheated mistreated stepped woma...,"[12, 3, 3, 2, 2, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9,..."
4,Father of the Bride Part II,father bride part ii george banks recovered da...,"[8, 8, 8, 8, 20, 8, 8, 8, 1, 2, 2, 3, 3, 1, 1,..."


___
3. Vectorización de texto (de los keywords)
___

In [222]:
# Convertimos el contenido de 'keywords' a minúsculas
data['keywords'] = data['keywords'].apply(lambda x: [word.lower() for word in x])

# Convertimos cada lista de palabras en una cadena de texto donde las palabras estarán separadas por espacios.
data['keywords'] = data['keywords'].apply(lambda x: ' '.join(x))

# Obtenemos la lista de keywords y scores
keywords = data['keywords']
scores = data['scores']

# Creamos una instancia de CountVectorizer
vectorizer = CountVectorizer()

# Ajustamos y transformamos los keywords en vectores
vectores_keywords = vectorizer.fit_transform(keywords)



In [225]:
vectores_keywords.shape
# tenemos 41304 vectores de 75199 palabras únicas

(41304, 75199)

In [224]:
# Imprimimos los vectores para su comprobación visual
print(vectores_keywords.toarray())  # Vectores de los keywords

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


___
4. Cálculo de similitud.
___

In [227]:
from sklearn.metrics.pairwise import cosine_similarity
 # Calcular la similitud del coseno entre los vectores de keywords
similitud = cosine_similarity(vectores_keywords)
 # Imprimir la matriz de similitud
print("Matriz de similitud:")
print(similitud)

Matriz de similitud:
[[1.         0.03149704 0.         ... 0.         0.02620712 0.        ]
 [0.03149704 1.         0.05634362 ... 0.05270463 0.04622502 0.0347524 ]
 [0.         0.05634362 1.         ... 0.         0.02344036 0.03524537]
 ...
 [0.         0.05270463 0.         ... 1.         0.         0.06593805]
 [0.02620712 0.04622502 0.02344036 ... 0.         1.         0.02891575]
 [0.         0.0347524  0.03524537 ... 0.06593805 0.02891575 1.        ]]


In [233]:
similitud.shape

(41304, 41304)

___
5. La función de Recomendación.
___

In [276]:
# para implementarla en main.py en la FastAPI voy a intentar exportar los datasets necesarios en csv
print(similitud.shape)
df_similitud = pd.DataFrame(similitud)
# df_similitud.to_csv('similitud.csv',index=False)
# VERY BAD IDEA - el archivo resultante es de tamaño: 16.4 GB (17,649,255,429 bytes)


(41304, 41304)


### Idea innovadora de un dataset preparado de recomendaciones
> Consiste en crear un dataset usando el modelo ML en local, con 2 columnas, 'title' y 'recomendaciones'
A ver cuánto ocupa el archivo resultante exportado a CSV

In [286]:
# Create a new dataframe with rows that contain rare characters in the 'title' column
df_rare_chars = df_modelo[df_modelo['title'].str.contains('[^a-zA-Z0-9\s]', regex=True)]

# Print the resulting dataframe
print(df_rare_chars)

                                                   title  \
11                           Dracula: Dead and Loving It   
18                        Ace Ventura: When Nature Calls   
38                              Cry, the Beloved Country   
60                                    Mr. Holland's Opus   
61     Don't Be a Menace to South Central While Drink...   
...                                                  ...   
41281                             Blood, Sweat and Tears   
41285                    The Scheming Gambler's Paradise   
41292                   Pooh's Heffalump Halloween Movie   
41293                                   The One-Man Band   
41296                          St. Michael Had a Rooster   

                                                overview  \
11     When a lawyer shows up at the vampire's doorst...   
18     Summoned from an ashram in Tibet, Ace finds hi...   
38     A South-African preacher goes to search for hi...   
60     In 1965, passionate musician Gle

In [287]:
def recomendar_peliculas(titulo, similitud, df_modelo, num_recomendaciones=5):
    # Convertir el título a minúsculas
    titulo = titulo.lower()

    # Buscar películas que contengan el título proporcionado
    peliculas_coincidentes = df_modelo[df_modelo['title'].str.lower().str.contains(titulo, regex=False)]
    if len(peliculas_coincidentes) == 0:
        return "No se encontraron películas coincidentes"
    
    # Obtener el índice de la primera película coincidente
    indice_pelicula = peliculas_coincidentes.index[0]

    # Obtener las puntuaciones de similitud de la película con otras películas
    puntuaciones_similitud = list(enumerate(similitud[indice_pelicula]))

    # Ordenar las películas por puntuaciones de similitud en orden descendente
    puntuaciones_similitud = sorted(puntuaciones_similitud, key=lambda x: x[1], reverse=True)

    # Excluir la película de entrada de las recomendaciones
    puntuaciones_similitud = [p for p in puntuaciones_similitud if p[0] != indice_pelicula]

    # Obtener los índices de las películas recomendadas
    indices_recomendados = [i[0] for i in puntuaciones_similitud[:num_recomendaciones]]

    # Obtener los títulos de las películas recomendadas
    peliculas_recomendadas = df_modelo.loc[indices_recomendados, 'title']
    return peliculas_recomendadas.tolist()


In [288]:
# ML
# @app.get('/recomendacion/{titulo}')
def recomendacion(titulo):
    '''
    Ingresas un nombre de película y te recomienda las similares en una lista
    '''
    
    peliculas_recomendadas = recomendar_peliculas(titulo, similitud, df_modelo)
    return {'lista recomendada': peliculas_recomendadas}

In [None]:
def generar_recomendaciones(df):
    recomendaciones = []  # Lista para almacenar las recomendaciones
    for titulo in df['title']:
        # Llamar a la función recomendar_peliculas o mejor_recomendacion
        # y pasar el título como parámetro
        recomendacion = recomendar_peliculas(titulo, similitud, df_modelo)
        recomendaciones.append(recomendacion)  # Agregar la recomendación a la lista
    df_recomendaciones = pd.DataFrame({'title': df['title'], 'recomendaciones': recomendaciones})
    return df_recomendaciones

# Llamar a la función para generar las recomendaciones en el dataset df_modelo
df_recomendaciones = generar_recomendaciones(df_modelo)

# 27 minutos



In [302]:

# Cargar el archivo df_recomendaciones.csv como un dataframe
df_recomendaciones = pd.read_csv('./recomendaciones.csv')

# Definir la función de recomendación
def recomendacion2(titulo):
    # Buscar el título en el dataframe
    filtro = df_recomendaciones['title'] == titulo
    if filtro.any():
        # Obtener las recomendaciones para el título dado
        recomendaciones = df_recomendaciones.loc[filtro, 'recomendaciones'].values[0]
        return {'lista recomendada': recomendaciones}
    else:
        return {'error': 'El título no se encuentra en la base de datos'}

In [303]:
df_recomendaciones.columns

Index(['title', 'recomendaciones'], dtype='object')

In [311]:
recomendacion2('The Phantom')

{'lista recomendada': "['Slaves of New York', 'Super Capers', 'Max Steel', 'X-Men', 'Sex and the City']"}

In [298]:
# Guardar el nuevo dataset en un archivo CSV
df_recomendaciones.to_csv('recomendaciones.csv', index=False)

df_recomendaciones.sample(8)

Unnamed: 0,title,recomendaciones
28065,Satellite in the Sky,"[The Mutations, Master of the World, Antarctic..."
23721,Dead Souls,"[Signs, Naked, The Haunted, Yours, Mine and Ou..."
17988,Three Steps Above Heaven,"[Doomed Love, Winter Cherries, The Experience,..."
10953,The Howards of Virginia,"[Day of the Outlaw, Basil, The Space Between U..."
40713,Success,"[Adam & Steve, The Visitor, Dear Heart, Hungry..."
15822,The Nun,"[Story of Women, A Self-Made Hero, Red Tails, ..."
1904,Candleshoe,"[In Search of the Castaways, Saving Mr. Banks,..."
34576,The People Speak,"[C.S.A.: The Confederate States of America, 13..."


In [263]:
titulo = "Toy Story"
recomendaciones = recomendar_peliculas(titulo, similitud, df_modelo)
print(recomendaciones)

['Toy Story 2', 'Toy Story 3', 'The Sunshine Makers', 'John Apple Jack', 'The Christine Jorgensen Story']


In [272]:
recomendacion('Babe')


{'lista recomendada': ['My Brother the Pig',
  '11 Minutes',
  'The Brain',
  'Now You Know',
  'Palmipedarium']}

In [228]:
df.columns

Index(['index', 'id', 'budget', 'original_language', 'overview', 'popularity',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'vote_average', 'vote_count', 'return',
       'release_year', 'directed_by', 'franchise', 'produced_by',
       'produced_in', 'genres_clean'],
      dtype='object')

In [229]:
df_modelo.columns

Index(['title', 'overview', 'genres_clean'], dtype='object')

In [230]:
data.columns

Index(['title', 'keywords', 'scores'], dtype='object')

Defining a function to get recommendations

In [None]:

def recomendaciones(titulo, cosine_sim = cosine_sim):
    # Getting the index of the movie that matches the title
    idx = model_data[model_data['title'] == str(titulo).lower()].index[0]
    # Getting the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    #Sorting the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Getting the top 5 recommendations
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommendations=list(model_data['title'].iloc[movie_indices].str.title())
    return {'lista recomendada': recommendations} 

Testing the model

In [None]:
recomendaciones('batman')

In [None]:
recomendaciones('the love letter')

In [None]:
recomendaciones('minions')

In [None]:
recomendaciones('the hunger games')

In [None]:
recomendaciones('toy story')

In [None]:
recomendaciones('Pride And Prejudice')

This works just fine on my pc, which, to be fair, has a lot of resources. However, I know I probably will not be able to use this algorithm with the full data on the free deploy since the RAM I get is much less than what I have on my local machine. So, for the API what I am going to do is take a random sample of the data (with a size of half the data) and then just use that for the API. I am not changing my alogorithm because I think the recommendations it is giving are kind of spot on, so if you actually want to try it in its full pontential, just download this file and run the previous code if you have a computer with at least 16gb of RAM. I am going put a default in the API for the function so that if a movie you search is not on the data it gives you as recommendation the top 5 most popular movies (I am going to get this information from the EDA).

In [None]:
model_data.shape

Since I am taking as a sample half of the data, n=41278/2=20639

In [None]:
sample_md=model_data.sample(n=20639, random_state=42)

Reseting the index to avoid problems 

In [None]:
sample_md=sample_md.reset_index(drop=True)

In [None]:
sample_md

I am going to export this as a csv file to avoid doing all this transformations in the API. 

In [None]:
sample_md.to_csv('ML_Data.csv')

Now, I am going to put together inside a function the model and leave out the vectorization steps.

In [None]:
sample_md=pd.read_csv('Datasets\ML_Data.csv')
cv1 = CountVectorizer(stop_words='english')
cv_matrix1 = cv1.fit_transform(sample_md['text'])
cosine_sim1 = cosine_similarity(cv_matrix1,cv_matrix1)

In [None]:
def recomendaciones1(titulo):
    try:
        # Getting the index of the movie that matches the title
        idx = sample_md[sample_md['title'] == str(titulo).lower()].index[0]
        # Getting the similarity scores
        sim_scores = list(enumerate(cosine_sim1[idx]))
        #Sorting the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Getting the top 5 recommendations
        sim_scores = sim_scores[1:6]
        movie_indices = [i[0] for i in sim_scores]
        recommendations=list(sample_md['title'].iloc[movie_indices].str.title())
        return {'lista recomendada': recommendations} 
    except:
        return {'lista recomendada': ['Minions', 'Wonder Woman', 'Beauty and the Beast', 'Baby Driver', 'Big Hero 6']}

In [234]:
recomendaciones1('toy story')

NameError: name 'recomendaciones1' is not defined

In [None]:
#testing it with a movie in the sample data
recomendaciones1('disconnect')

In [None]:
#testing it with a movie that is not in the sample data
recomendaciones1('barbie')

In [None]:
recomendaciones1('minions')