In [4]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Load Datasets
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Step 3: Merge the data on 'title'
# NOTE(review): a title-based join pairs every same-titled row across the two
# files, so repeated titles would multiply rows — confirm whether an id-based
# key is available and preferable.
movies = movies.merge(credits, on='title')

# Step 4: Select required columns
# Rows with any missing value are dropped without mutating in place, and the
# index is renumbered 0..n-1 so that index labels coincide with row positions
# (later steps index a plain similarity matrix by row position).
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

# Step 5: Convert JSON-like strings into Python objects
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key (parsed with ``ast.literal_eval``).

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Both columns hold stringified lists of dicts; replace each with its list of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Get top 3 cast members
def get_top_cast(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key (parsed with ``ast.literal_eval``).
        limit: Maximum number of names to return. Defaults to 3; negative
            values are treated as 0.

    Returns:
        List of at most ``limit`` names, in their original order.
    """
    entries = ast.literal_eval(obj)
    # Clamp so a negative limit yields [] instead of slicing from the end.
    return [entry['name'] for entry in entries[:max(limit, 0)]]

# Replace each stringified cast list with the names returned by get_top_cast.
movies['cast'] = movies['cast'].map(get_top_cast)

# Get the director's name
def get_director(obj):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        obj: String holding a Python-literal list of dicts with 'job' and
            'name' keys (parsed with ``ast.literal_eval``).

    Returns:
        The first matching name, or an empty string when no entry matches.
    """
    directors = (
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    )
    # The generator is lazy, so scanning stops at the first director found.
    return next(directors, "")

# Replace each stringified crew list with the director's name ("" when none is listed).
movies['crew'] = movies['crew'].map(get_director)

# Step 6: Preprocess features
# The overview becomes a list of whitespace-separated words so it can be
# concatenated with the other list-valued columns below.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Remove spaces inside each multi-word name so it survives as a single token
# once the tags are joined with spaces.
for column in ('genres', 'keywords', 'cast'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )
movies['crew'] = movies['crew'].apply(lambda name: name.replace(" ", ""))

# Step 7: Create tags column
# Adding Series of lists concatenates the lists row by row; the director
# string is wrapped in a one-element list so it can be appended the same way.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew'].apply(lambda name: [name])
)

# Take an explicit copy so the assignment below writes to an independent frame
# rather than to a slice of `movies`.
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(" ".join)


# Step 8: Vectorization
# Bag-of-words counts over the tag strings, limited to 5000 features with
# English stop words removed; the result is converted to a dense array.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

# Step 9: Cosine similarity
# Pairwise similarity between every pair of rows of `vectors`.
similarity = cosine_similarity(vectors)

# Step 10: Recommendation function
def recommend(movie):
    """Print the five titles most similar to ``movie``.

    The title lookup is case-insensitive and uses the first matching row.
    Reads the module-level ``new_df`` (titles) and ``similarity`` (square
    matrix whose rows and columns follow the row order of ``new_df``).

    Args:
        movie: Title to look up.

    Returns:
        None. Prints "Movie not found." when no title matches; otherwise
        prints a header line followed by up to five titles.
    """
    query = movie.lower()

    # Resolve a row *position*, not an index label: `similarity` and `.iloc`
    # are positional, while new_df's index labels may have gaps after rows
    # were dropped, so a label can point at the wrong row or out of range.
    hits = np.flatnonzero((new_df['title'].str.lower() == query).to_numpy())
    if hits.size == 0:
        print("Movie not found.")
        return
    pos = int(hits[0])

    # Rank every other row by similarity, highest first. The query row is
    # excluded explicitly rather than by assuming it sorts first, which would
    # fail when another row ties with it at the maximum score.
    ranked = sorted(
        ((i, score) for i, score in enumerate(similarity[pos]) if i != pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:5]

    titles = new_df['title']
    print(f"Top 5 movies similar to '{titles.iloc[pos]}':")
    for i, _ in ranked:
        print(titles.iloc[i])

# Step 11: Try it!
# Prints a header line followed by the recommended titles, or
# "Movie not found." when no title matches (the lookup is case-insensitive).
recommend("Cars 2")


Top 5 movies similar to 'Cars 2':
Cars
Ice Age: Continental Drift
The Croods
The Adventures of Rocky & Bullwinkle
Herbie Fully Loaded


In [5]:
import os
# Print the kernel's current working directory; the relative CSV paths passed
# to pd.read_csv are resolved against it.
# NOTE(review): the saved output of this cell exposes an absolute local path
# (including a user name) — consider clearing it before sharing the notebook.
print(os.getcwd())


C:\Users\bhavy\OneDrive\Desktop\Bhavya\Intern\TASK 4
