In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
# Cargar modelos de nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\enriq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enriq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\enriq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Leer archivos de datos
df_credits = pd.read_csv("tmdb_5000_credits.csv")
df_movies = pd.read_csv("tmdb_5000_movies.csv")

In [9]:
# Merge de los dos dataframes
df_movies = df_movies.merge(df_credits, on='title')
df_movies = df_movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Rellenar valores faltantes
for col in ['overview', 'genres', 'keywords', 'cast', 'crew']:
    df_movies[col] = df_movies[col].fillna('')

In [11]:
# Procesar texto en las columnas
def process_text(text):
    # Tokenizar
    tokens = word_tokenize(text.lower())
    # Remover puntuación y stopwords
    tokens = [word for word in tokens if word.isalpha() and word not in string.punctuation and word not in stop_words]
    # Lematizar
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df_movies["movie_id"] = df_movies["movie_id"].astype(str)
df_movies["title"] = df_movies["title"].astype(str)
df_movies["overview"] = df_movies["overview"].astype(str).apply(process_text)
df_movies["genres"] = df_movies["genres"].astype(str).apply(process_text)
df_movies["keywords"] = df_movies["keywords"].astype(str).apply(process_text)
df_movies["cast"] = df_movies["cast"].astype(str).apply(process_text)
df_movies["crew"] = df_movies["crew"].astype(str).apply(process_text)

In [13]:
# Crear DataFrame final
df_movies["tags"] = df_movies["movie_id"] +  df_movies["title"] +  df_movies["overview"] +  df_movies["genres"]  + df_movies["keywords"] + df_movies["cast"] + df_movies["crew"]
df_final = df_movies.drop(columns=["overview", "genres", "keywords", "cast", "crew"])

# Crear TfIdfVectorizer con límites de características
tfidf = TfidfVectorizer()
vector = tfidf.fit_transform(df_final["tags"]).toarray()

# Calcular la similitud de coseno
similarity = cosine_similarity(vector)

In [15]:
# Función de recomendación
def recommend(movie):
    index = df_final[df_final["title"] == movie].index[0]
    movies = list(enumerate(similarity[index]))
    movies = sorted(movies, key=lambda x: x[1], reverse=True)[1:6]
    for i in movies:
        print(df_final["title"].iloc[i[0]])

In [17]:
# Prueba la recomendación
recommend("The Dark Knight")

The Dark Knight Rises
American Gangster
Gone Girl
Batman Begins
Batman v Superman: Dawn of Justice


In [19]:
recommend("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
Contact
American Pie 2
