In [5]:
# Re-load datasets if necessary to reset changes
import pandas as pd
import ast

movie = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge datasets
movies = movie.merge(credits, on='title')
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

# Convert stringified features into lists
def convert(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# For cast, get the top 3 cast members
def convert3(obj):
    return [i['name'] for i in ast.literal_eval(obj)[:3]]

movies['cast'] = movies['cast'].apply(convert3)

# For crew, get only the director's name
def convert4(obj):
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies['crew'] = movies['crew'].apply(convert4)

# Remove spaces within names
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Create 'tags' by combining lowercase overview with other columns
movies['tags'] = movies.apply(lambda x: x['overview'].lower().split() + x['genres'] + x['keywords'] + x['cast'] + x['crew'], axis=1)

# Join all tags into a single lowercase string
new_df = movies[['id', 'title', 'tags']]
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Display the first few rows
new_df.head(2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."


In [7]:
#stemming the words
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)      


In [6]:
# Create a bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

vectors = cv.fit_transform(new_df['tags']).toarray()
vectors.shape   

(4806, 5000)

In [8]:
ps. stem('loved')

'love'

In [10]:
#Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

#this is giving the distance of each movie from all other movies
similarity[0]

array([1.        , 0.08964215, 0.05976143, ..., 0.02519763, 0.02817181,
       0.        ])

In [17]:
def recommend(movie):
    # Ensure lowercase and strip whitespace for title comparison
    movie = movie.lower().strip()
    
    # Try to find the movie index; if not found, handle gracefully
    movie_index = new_df[new_df['title'].str.lower() == movie].index
    if len(movie_index) == 0:
        print(f"Movie '{movie.title()}' not found in the dataset.")
        return

    # Retrieve the index of the found movie
    movie_index = movie_index[0]  
    distances = similarity[movie_index]  # Fetch similarity scores for this movie

    # Get the top 5 most similar movies (excluding the input movie itself)
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Print recommended movie titles
    print(f"Recommendations for '{movie.title()}':")
    for i in movies_list:
        print(new_df.iloc[i[0]].title)




In [21]:
# Test the function with an example
recommend('The Dark Knight Rises')

Recommendations for 'The Dark Knight Rises':
The Dark Knight
Batman Begins
Batman
Batman Returns
Batman


In [22]:
import pickle
pickle.dump(new_df, open('movies.pkl', 'wb'))


In [23]:
pickle.dump(new_df.to_dict(), open('movie_dict.pkl', 'wb'))



In [24]:
pickle.dump(similarity,open('similarity.pkl','wb'))