In [1]:
!pip install scikit-learn



In [2]:
import ast
import json

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
file_path = 'tmdb_5000_credits.csv'

# Pull every physical line of the raw credits file into memory.
with open(file_path, 'r', encoding='utf-8') as raw_file:
    lines = raw_file.readlines()

# Keep everything except the single line at zero-based position 2215.
# NOTE(review): why this particular line is dropped is not recorded here — confirm.
skip_index = 2215
cleaned_lines = lines[:skip_index] + lines[skip_index + 1:]

# Persist the remaining lines as the file the rest of the notebook reads.
with open('cleaned_credits.csv', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(''.join(cleaned_lines))


In [4]:
# Read the cleaned file into a DataFrame, then show its dimensions together
# with a preview of the first five rows.
credits = pd.read_csv('cleaned_credits.csv')
(credits.shape, credits.head(n=5))


((4802, 4),
    movie_id                                     title  \
 0     19995                                    Avatar   
 1       285  Pirates of the Caribbean: At World's End   
 2    206647                                   Spectre   
 3     49026                     The Dark Knight Rises   
 4     49529                               John Carter   
 
                                                 cast  \
 0  [{"cast_id": 242, "character": "Jake Sully", "...   
 1  [{"cast_id": 4, "character": "Captain Jack Spa...   
 2  [{"cast_id": 1, "character": "James Bond", "cr...   
 3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
 4  [{"cast_id": 5, "character": "John Carter", "c...   
 
                                                 crew  
 0  [{"credit_id": "52fe48009251416c750aca23", "de...  
 1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  
 2  [{"credit_id": "54805967c3a36829b5002c41", "de...  
 3  [{"credit_id": "52fe4781c3a36847f81398c3", "de...  
 4  [{"credit_

In [5]:
# Exception types that signal a missing, unparsable, or unexpectedly shaped
# value. Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
_MALFORMED_INPUT_ERRORS = (
    ValueError,
    SyntaxError,
    TypeError,
    LookupError,
    RecursionError,
    MemoryError,
)


def _parse_serialized_list(raw):
    """Decode a serialized list of records into Python objects.

    The text is decoded as JSON first, so JSON-only literals such as
    ``null``, ``true`` and ``false`` are understood. If it is not valid JSON
    it is retried as a Python literal, so single-quoted input keeps working.

    Raises:
        TypeError: if ``raw`` is not text (for example a missing value).
        ValueError, SyntaxError: if ``raw`` is neither valid JSON nor a valid
            Python literal.
    """
    try:
        return json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; fall back to the
        # parser the notebook used originally.
        return ast.literal_eval(raw)


def get_top_cast(cast_str):
    """Return the names of the first three cast entries, joined by spaces.

    Args:
        cast_str: Serialized list of cast records, each carrying a ``'name'`` key.

    Returns:
        The space-separated names of up to three leading entries, or ``''``
        when the value cannot be parsed or does not have the expected shape.
    """
    try:
        cast = _parse_serialized_list(cast_str)
        return ' '.join(actor['name'] for actor in cast[:3])
    except _MALFORMED_INPUT_ERRORS:
        return ''


def get_director(crew_str):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        crew_str: Serialized list of crew records, each carrying ``'job'`` and
            ``'name'`` keys.

    Returns:
        The matching name, or ``''`` when there is no such entry or the value
        cannot be parsed or does not have the expected shape.
    """
    try:
        crew = _parse_serialized_list(crew_str)
        for person in crew:
            if person['job'] == 'Director':
                return person['name']
        return ''
    except _MALFORMED_INPUT_ERRORS:
        return ''


In [6]:
# Derive one free-text "tags" string per movie (leading cast names followed by
# the director), then narrow the frame to the identifying columns plus tags.
credits = (
    credits
    .assign(
        top_cast=lambda frame: frame['cast'].apply(get_top_cast),
        director=lambda frame: frame['crew'].apply(get_director),
        tags=lambda frame: frame['top_cast'] + ' ' + frame['director'],
    )
    .loc[:, ['movie_id', 'title', 'tags']]
)
credits.head()


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Sam Worthington Zoe Saldana Sigourney Weaver J...
1,285,Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Gore...
2,206647,Spectre,Daniel Craig Christoph Waltz Léa Seydoux Sam M...
3,49026,The Dark Knight Rises,Christian Bale Michael Caine Gary Oldman Chris...
4,49529,John Carter,Taylor Kitsch Lynn Collins Samantha Morton And...


In [7]:
# Count terms in each movie's tags, limiting the vocabulary to 5000 features
# and dropping scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(credits['tags'])
vectors = tag_counts.toarray()

# Cosine similarity between every pair of rows in the count matrix.
similarity = cosine_similarity(vectors)


In [8]:
def recommend(movie, top_n=5):
    """Print the titles most similar to ``movie``.

    Reads the module-level ``credits`` frame (for titles) and the
    ``similarity`` matrix (for scores); both are addressed by row position.

    Args:
        movie: Exact title to look up in ``credits['title']``. When several
            rows share the title, the first one is used.
        top_n: How many recommendations to print (default 5).

    Returns:
        None. Results, or a "not found" message, are printed.
    """
    title_matches = (credits['title'] == movie).to_numpy()
    if not title_matches.any():
        print(" Movie not found!")
        return

    # Use the row *position* of the first match: ``similarity`` is indexed by
    # position, which only coincides with index labels for a default index.
    position = int(title_matches.argmax())
    distances = similarity[position]

    # Drop the queried movie explicitly instead of assuming it sorts first;
    # with tied scores (or an all-zero row) it is not guaranteed to be on top.
    candidates = [
        (row, score) for row, score in enumerate(distances) if row != position
    ]
    movie_list = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    print(f"\n Top {top_n} recommendations for '{movie}':\n")
    for row, _score in movie_list:
        print(credits.iloc[row]['title'])


In [9]:
# Demo: print the recommendations for 'Avatar'.
recommend('Avatar')


 Top 5 recommendations for 'Avatar':

Aliens
Guardians of the Galaxy
Snow White: A Tale of Terror
The Cold Light of Day
Crossroads


In [10]:
# Demo: print the recommendations for 'The Avengers'.
recommend('The Avengers')


 Top 5 recommendations for 'The Avengers':

Avengers: Age of Ultron
Captain America: Civil War
Zodiac
The Judge
What's Your Number?


In [11]:
# Demo: print the recommendations for 'Iron Man'.
recommend('Iron Man')


 Top 5 recommendations for 'Iron Man':

Iron Man 2
Charlie Bartlett
The Judge
Made
R.I.P.D.
