<a href="https://colab.research.google.com/github/ViniciusSolon/MovieRecSys/blob/main/recomendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import ast
from itertools import islice

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
pd.options.mode.chained_assignment = None

In [21]:
# Read the two CSV inputs into DataFrames (going by the file and variable
# names, one table of movies and one of people/cast -- TODO confirm schemas)
df_movies = pd.read_csv("dataset_filmes.csv")
df_people = pd.read_csv("dataset_elenco.csv")

# Combine both tables into one frame, pairing rows that share the same 'title'
df_movies = df_movies.merge(right=df_people, on='title')

In [22]:
# Keep only the columns that are used to build the recommender
df_movies = df_movies[['title', 'cast', 'movie_id', 'overview', 'genres', 'keywords', 'crew']]

# Report the number of missing values per column. This is printed explicitly
# because a bare expression in the middle of a cell is never displayed.
print(df_movies.isnull().sum())

# Drop incomplete rows and renumber the index from 0. Without the reset the
# index labels keep gaps where rows were removed, so a label no longer equals
# the row position -- and the similarity matrix built later is positional.
# Reassigning (instead of inplace=True on a column slice) also keeps this
# cell safe to re-run.
df_movies = df_movies.dropna().reset_index(drop=True)


In [23]:
# AST (Abstract Syntax Trees) for parsing string representations of lists
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is a string containing a Python literal (a sequence of dicts that
    each have a 'name' key); it is parsed with ast.literal_eval.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Replace the raw 'genres' and 'keywords' strings with lists of names
df_movies = df_movies.assign(
    genres=df_movies['genres'].apply(convert),
    keywords=df_movies['keywords'].apply(convert),
)


In [24]:
# Convert cast column (limit to 3 actors)
def convert_cast(obj, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list.

    Args:
        obj: string containing a Python literal (a sequence of dicts that each
            have a 'name' key); parsed with ast.literal_eval.
        limit: maximum number of names to return. Defaults to 3, which matches
            the previously hard-coded cut-off. Pass None to keep every entry.

    Returns:
        A list with at most `limit` names, in their original order.
    """
    # islice stops after `limit` entries, so later entries are never touched
    # (same as the early `break` in a manual counter loop).
    return [member['name'] for member in islice(ast.literal_eval(obj), limit)]

# Replace the raw 'cast' strings with the shortened lists of actor names
df_movies = df_movies.assign(cast=df_movies['cast'].apply(convert_cast))


In [25]:
# Extract director from crew column
def fetch_director(obj):
    """Return the names of all entries whose 'job' is exactly 'Director'.

    `obj` is a string containing a Python literal (a sequence of dicts with
    'job' and 'name' keys); it is parsed with ast.literal_eval. The result is
    a list, which is empty when no entry has that job.
    """
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    ]

# Reduce 'crew' to the list of director names, and break each overview
# into a list of whitespace-separated words
df_movies = df_movies.assign(
    crew=df_movies['crew'].apply(fetch_director),
    overview=df_movies['overview'].apply(lambda text: text.split()),
)


In [26]:
# Remove the spaces inside every genre, keyword, cast and crew entry so that
# a multi-word entry stays a single token once the tags are joined with spaces
for column in ['genres', 'keywords', 'cast', 'crew']:
    df_movies[column] = df_movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Create the tags column: per movie, the concatenation of its five token lists
df_movies['tags'] = df_movies['overview'] + df_movies['genres'] + df_movies['keywords'] + df_movies['cast'] + df_movies['crew']

# Take an explicit copy: df_movies_final['tags'] is overwritten later, and
# writing to a plain column slice of df_movies is a chained assignment.
df_movies_final = df_movies[['movie_id', 'title', 'tags']].copy()


In [27]:
# Stemmer shared by every call to stem()
parser_ps = PorterStemmer()

def stem(text):
    """Stem each whitespace-separated word of `text` and re-join with spaces."""
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(parser_ps.stem(word))
    return " ".join(stemmed_words)

# Turn each list of tags into one lower-cased, stemmed string
# (join -> lower -> stem, done in a single pass per row)
df_movies_final['tags'] = df_movies_final['tags'].apply(
    lambda tokens: stem(" ".join(tokens).lower())
)


In [28]:

# Bag-of-words vectorizer: at most 5000 features, English stop words removed
cv = CountVectorizer(stop_words='english', max_features=5000)

# Fit the vectorizer on the tag strings, then convert the result to a dense array
tag_matrix = cv.fit_transform(df_movies_final['tags'])
vectors = tag_matrix.toarray()

# Pairwise cosine similarity between all movie vectors; row/column k
# corresponds to the k-th row of df_movies_final
similarity = cosine_similarity(vectors)

# Recommendation system function
def recommend_movie(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Reads the module-level `df_movies_final` frame and `similarity` matrix.

    Args:
        movie: exact title to look up in df_movies_final['title']. If several
            rows share the title, the first one is used.
        top_n: number of recommendations to print (default 5).

    Raises:
        IndexError: if no row has the given title.
    """
    titles = list(df_movies_final['title'])

    # `similarity` is addressed by row position, so the movie is located by
    # position. An index label is not used because labels need not equal
    # positions (for example after rows have been dropped).
    try:
        position = titles.index(movie)
    except ValueError:
        raise IndexError(
            f"Movie {movie!r} not found in df_movies_final['title']"
        ) from None

    scores = similarity[position]

    # Rank every other movie by descending similarity. The query itself is
    # excluded explicitly instead of assuming it always sorts first, which
    # fails when another movie has an identical vector.
    ranked = sorted(
        (other for other in range(len(scores)) if other != position),
        key=lambda other: scores[other],
        reverse=True,
    )
    for other in ranked[:top_n]:
        print(titles[other])


In [35]:
# Example: print the titles most similar to 'The Avengers'
recommend_movie('The Avengers')



Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Captain America: The First Avenger
Iron Man


In [37]:
# Example: print the titles most similar to 'The Hunger Games'
recommend_movie('The Hunger Games')

The Hunger Games: Catching Fire
The Hunger Games: Mockingjay - Part 2
The Hunger Games: Mockingjay - Part 1
Indie Game: The Movie
The Indian in the Cupboard
