In [2]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load datasets
movies = pd.read_csv('/content/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/tmdb_5000_credits.csv')

# Merge datasets on the 'title' column
movies = movies.merge(credits, on='title')

In [4]:
# Select features to be used
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [5]:
# Check for missing values
movies.isnull().sum()

# Drop rows with missing values
movies.dropna(inplace=True)

In [6]:
# Function to convert JSON-like strings to lists
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

# Apply the function to relevant columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Function to extract top 3 cast members
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

movies['cast'] = movies['cast'].apply(convert3)

# Function to extract director's name
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['crew'] = movies['crew'].apply(fetch_director)

# Convert overview to list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Combine all features into a single list
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new DataFrame with necessary columns
new_df = movies[['movie_id', 'title', 'tags']]

In [7]:
# Convert list of tags to string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert all text to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [8]:
# Initialize CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit and transform the tags
vectors = cv.fit_transform(new_df['tags']).toarray()

In [9]:
# Compute cosine similarity between vectors
similarity = cosine_similarity(vectors)

In [10]:
# Function to recommend movies based on similarity score
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in movie_list:
        print(new_df.iloc[i[0]].title)

In [12]:
# Example: Get recommendations for 'The Dark Knight Rises'
recommend('Toy Story')

Toy Story 2
Toy Story 3
Small Soldiers
Everything You Always Wanted to Know About Sex *But Were Afraid to Ask
The 40 Year Old Virgin
