In [1]:
# Unzip the uploaded file
!unzip movies.zip


Archive:  movies.zip.zip
  inflating: tmdb_5000_credits.csv   
  inflating: tmdb_5000_movies.csv    


In [2]:
import pandas as pd

# Read the two TMDB tables that were extracted from the archive
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

# Show the first movie row as a quick look at the available columns
movies.head(1)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [9]:
# Attach the credits table to the movie metadata. The movie_id / cast / crew
# columns selected below are not present in tmdb_5000_movies.csv, so they have
# to come from `credits` -- assumes both tables share a `title` column to join
# on (TODO confirm against the CSV headers).
# The guard makes the cell safe to re-run once the merge has already happened.
if 'cast' not in movies.columns:
    movies = movies.merge(credits, on='title')

# Keep only the columns used to build the tags. .copy() detaches the result
# from the merged frame so later column assignments act on an independent
# DataFrame instead of triggering SettingWithCopyWarning.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()


In [11]:
# Missing overviews become empty strings so every row can be tokenised
overview_text = movies['overview'].fillna('')

# Split each overview into words; rows that already hold a word list
# (e.g. when this cell is re-run) are passed through untouched
movies['overview'] = overview_text.map(
    lambda text: text.split() if not isinstance(text, list) else text
)


In [12]:
import ast


In [15]:
import ast

def convert(obj):
    """Extract the ``name`` of every entry in a genres/keywords cell.

    Parameters
    ----------
    obj : str | list | None | float
        Either the raw cell text (a JSON array of objects such as
        ``[{"id": 28, "name": "Action"}]``, or the Python-literal form of a
        list), or an already-parsed list. ``None``, NaN and blank strings are
        treated as "no entries".

    Returns
    -------
    list
        For each item: its ``'name'`` value when the item is a dict that has
        one, otherwise the item itself (so an already-converted list of
        strings passes through unchanged).
    """
    import json  # local import: the notebook only imports `ast` at cell level

    # Missing values (None, or NaN which is the only float unequal to itself).
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []

    if isinstance(obj, str):
        if not obj.strip():
            return []
        try:
            # The raw columns are JSON; json.loads also accepts null/true/false,
            # which ast.literal_eval rejects.
            obj = json.loads(obj)
        except ValueError:
            # Fall back for Python-literal text such as "['Action', 'Drama']".
            obj = ast.literal_eval(obj)

    return [
        item['name'] if isinstance(item, dict) and 'name' in item else item
        for item in obj
    ]


In [16]:
# Replace the raw genre/keyword cells with plain lists of names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)


In [18]:
# Convert cast - take top 3 actors only
def convert_cast(obj, limit=3):
    """Return the names of the first ``limit`` cast entries.

    Parameters
    ----------
    obj : str | list | None | float
        Raw cell text (a JSON array of objects, or the Python-literal form of
        a list) or an already-parsed list. ``None``, NaN and blank strings
        yield an empty list.
    limit : int, default 3
        Maximum number of names to return.

    Returns
    -------
    list
        Up to ``limit`` names in their original order. Dict entries contribute
        their ``'name'`` value (dicts without one are skipped); plain strings
        are kept as-is, so re-running the cell on an already-converted column
        does not wipe it.
    """
    import json  # local import: the notebook only imports `ast` at cell level

    # Missing values (None, or NaN which is the only float unequal to itself).
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []

    if isinstance(obj, str):
        if not obj.strip():
            return []
        try:
            # The raw column is JSON; json.loads also accepts null/true/false,
            # which ast.literal_eval rejects.
            obj = json.loads(obj)
        except ValueError:
            # Fall back for Python-literal text such as "['A', 'B']".
            obj = ast.literal_eval(obj)

    names = []
    for member in obj:
        if len(names) >= limit:
            break
        if isinstance(member, dict):
            if 'name' in member:
                names.append(member['name'])
        elif isinstance(member, str):
            # Already-extracted name from a previous run of this cell.
            names.append(member)
    return names

# Swap each raw cast cell for its short list of leading actor names
cast_raw = movies['cast']
movies['cast'] = cast_raw.map(convert_cast)


In [19]:
# Convert crew - take only the director
def fetch_director(obj):
    """Return the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    obj : str | list | None | float
        Raw cell text (a JSON array of objects, or the Python-literal form of
        a list) or an already-parsed list. ``None``, NaN and blank strings
        yield an empty list.

    Returns
    -------
    list
        ``[name]`` for the first dict with ``job == 'Director'`` and a
        ``'name'`` key, or ``[]`` when there is none. A non-empty list made up
        only of strings is treated as the output of an earlier run of this
        function and is returned as a copy, so re-running the cell does not
        wipe the column.
    """
    import json  # local import: the notebook only imports `ast` at cell level

    # Missing values (None, or NaN which is the only float unequal to itself).
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []

    if isinstance(obj, str):
        if not obj.strip():
            return []
        try:
            # The raw column is JSON; json.loads also accepts null/true/false,
            # which ast.literal_eval rejects.
            obj = json.loads(obj)
        except ValueError:
            # Fall back for Python-literal text such as "['JamesCameron']".
            obj = ast.literal_eval(obj)

    members = list(obj)
    if members and all(isinstance(member, str) for member in members):
        return members

    for member in members:
        # .get() instead of ['job']: entries without a job must not raise.
        if isinstance(member, dict) and member.get('job') == 'Director' and 'name' in member:
            return [member['name']]
    return []

# Swap each raw crew cell for a one-element (or empty) director list
crew_raw = movies['crew']
movies['crew'] = crew_raw.map(fetch_director)


In [20]:
# Collapse multi-word names into single tokens (spaces removed) so each
# person / keyword / genre counts as one word in the tags
for column in ('cast', 'crew', 'keywords', 'genres'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )


In [21]:
# Build the tags column by concatenating, row by row and in this order,
# the word lists from overview, genres, keywords, cast and crew
tag_sources = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined = movies[tag_sources[0]]
for column in tag_sources[1:]:
    combined = combined + movies[column]
movies['tags'] = combined


In [22]:
# Flatten each list of tag words into one space-separated string
movies['tags'] = movies['tags'].map(" ".join)


In [23]:
# Lower-case every tag string
movies['tags'] = movies['tags'].map(str.lower)


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model over the tags: English stop words dropped,
# vocabulary capped at 5000 features
cv = CountVectorizer(stop_words='english', max_features=5000)

# Fit on the tags, then densify the resulting count matrix
tag_counts = cv.fit_transform(movies['tags'])
vectors = tag_counts.toarray()


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `vectors`;
# similarity[i][j] scores row i against row j
similarity = cosine_similarity(X=vectors)


In [26]:
# Function to recommend similar movies
def recommend(movie, top_n=5):
    """Return the titles of the movies most similar to ``movie``.

    Reads the notebook-level ``movies`` DataFrame and ``similarity`` matrix.
    ``similarity`` is indexed by row *position*, so the lookup below works on
    positions rather than index labels and stays correct even when ``movies``
    does not have a default 0..n-1 index.

    Parameters
    ----------
    movie : str
        Title to look up; matching is case-insensitive. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the queried row itself; ``["Movie not found."]`` when no
        title matches.
    """
    query = movie.lower()
    titles = movies['title'].str.lower()  # computed once, reused below

    # Positional offsets of the matching rows (missing titles never match).
    is_match = titles.eq(query).fillna(False).to_numpy(dtype=bool)
    positions = is_match.nonzero()[0]
    if len(positions) == 0:
        return ["Movie not found."]
    position = int(positions[0])

    # Drop the queried row explicitly instead of assuming it sorts first:
    # another row with an identical score could otherwise take its place.
    candidates = [
        (row, score)
        for row, score in enumerate(similarity[position])
        if row != position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [movies['title'].iloc[row] for row, _ in candidates[:top_n]]


In [27]:
recommend("Avatar")


['Titan A.E.',
 'Small Soldiers',
 'Independence Day',
 'Aliens vs Predator: Requiem',
 'Predators']