In [2]:
import numpy as np
import pandas as pd
import ast # For safely evaluating strings that look like Python literals
from sklearn.feature_extraction.text import CountVectorizer
get_ipython().system('pip install scikit-learn')



Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Load the datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')


In [4]:
# View the first row of each dataset
print(movies.head(1))
print(credits.head(1))

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   

  original_title                                           overview  \
0         Avatar  In the 22nd century, a paraplegic Marine is di...   

   popularity                               production_companies  \
0  150.437577  [{"name": "Ingenious Film Partners", "id": 289...   

                                production_countries release_date     revenue  \
0  [{"iso_3166_1": "US", "name": "United States o...   2009-12-10  2787965087   

   runtime                                   spoken_languages    status  \
0    162.0  [{"iso_639_1": "en", "name": "English"}, {"iso...  Released   

                       tagline   title 

In [5]:
# Merge the datasets on the 'title' column
merged_data = movies.merge(credits, on='title')

In [6]:
# Check the shape of the merged dataset
print(merged_data.shape)

(4809, 23)


In [7]:
# View the first few rows of the merged dataset
print(movies.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [8]:
# Value counts for the 'original_language' column
print(movies['original_language'].value_counts())


original_language
en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ru      11
ko      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: count, dtype: int64


In [9]:
# Get information about the dataset
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [10]:
# Select specific columns from the movies dataset
print(movies[['id', 'title', 'overview', 'genres', 'keywords']].head())


       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  [{"id": 28, "name": "Action"}, {"id": 12, "nam...

In [11]:
# Check for missing values
print(movies.isnull().sum())


budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64


In [12]:
# Drop rows with missing values
movies.dropna(inplace=True)


In [13]:
# Check for duplicates
print(movies.duplicated().sum())


0


In [14]:
# View the 'genres' column of the first row (before applying the conversion)
print(movies.iloc[0].genres)


[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]


In [15]:
# Define the function to convert the 'genres' and 'keywords' columns
def convert(obj):
    L = []
    # Safely evaluate the string representation of the list of dictionaries
    try:
        for i in ast.literal_eval(obj):  # ast.literal_eval safely parses the string to list/dict
            L.append(i['name'])  # Extract the 'name' of each genre or keyword
    except (ValueError, SyntaxError):
        return []  # Return an empty list if there is any issue
    return L


In [16]:
# Apply the convert function to the 'genres' column
movies['genres'] = movies['genres'].apply(convert)

In [17]:
# Apply the convert function to the 'keywords' column
movies['keywords'] = movies['keywords'].apply(convert)


In [18]:
# View the transformed 'genres' and 'keywords' columns
print(movies[['title', 'genres', 'keywords']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                          genres  \
0  [Action, Adventure, Fantasy, Science Fiction]   
1                   [Adventure, Fantasy, Action]   
2                     [Action, Adventure, Crime]   
3               [Action, Crime, Drama, Thriller]   
4           [Action, Adventure, Science Fiction]   

                                            keywords  
0  [culture clash, future, space war, space colon...  
1  [ocean, drug abuse, exotic island, east india ...  
2  [spy, based on novel, secret agent, sequel, mi...  
3  [dc comics, crime fighter, terrorist, secret i...  
4  [based on novel, mars, medallion, space travel...  


In [19]:
# Process the 'overview' column by splitting it into words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [20]:
# View the transformed 'overview' column
print(movies[['title', 'overview']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            overview  
0  [In, the, 22nd, century,, a, paraplegic, Marin...  
1  [Captain, Barbossa,, long, believed, to, be, d...  
2  [A, cryptic, message, from, Bond’s, past, send...  
3  [Following, the, death, of, District, Attorney...  
4  [John, Carter, is, a, war-weary,, former, mili...  


In [21]:
# Remove spaces in genres and keywords
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])


In [22]:
# View the transformed genres and keywords
print(movies[['title', 'genres']].head())
print(movies[['title', 'keywords']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                         genres  
0  [Action, Adventure, Fantasy, ScienceFiction]  
1                  [Adventure, Fantasy, Action]  
2                    [Action, Adventure, Crime]  
3              [Action, Crime, Drama, Thriller]  
4           [Action, Adventure, ScienceFiction]  
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            keywords  
0  [cultureclash, future, spacewar, spacecolony, ...  
1  [ocean, drugabuse, ex

In [23]:
# Create the tags column by combining overview, genres, and keywords
# Ensure all parts are concatenated properly
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords']


In [24]:
# Optional: Join tags into a single string per movie
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x))  # Join the list into a single string


In [25]:
# View the first few rows of the dataset with the new tags column
print(movies[['title', 'tags']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                                tags  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


In [26]:
new_df = movies[['id', 'title', 'tags']]

In [27]:
print(new_df.head())

       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                                tags  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


In [28]:
#new_df['tags']= new_df['tags'].apply(lambda x:" ".join(x))

In [29]:
#print(new_df['tags'].head())

In [30]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

In [31]:
# Convert the tags to lowercase
movies['tags'] = movies['tags'].apply(lambda x: x.lower())


In [32]:
print(movies['tags'].head())

0    in the 22nd century, a paraplegic marine is di...
1    captain barbossa, long believed to be dead, ha...
2    a cryptic message from bond’s past sends him o...
3    following the death of district attorney harve...
4    john carter is a war-weary, former military ca...
Name: tags, dtype: object


In [33]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [34]:
vectorized_data = cv.fit_transform(new_df['tags']).toarray()

In [35]:
print(vectorized_data.shape)

(1493, 5000)


In [36]:
print(vectorized_data)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [37]:
print(vectorized_data[0])

[0 0 0 ... 0 0 0]


In [38]:
print(vectorized_data[1])
print(vectorized_data[2])

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [39]:
print(cv.get_feature_names_out())

['00' '000' '007' ... 'zoo' 'zooey' 'zookeeper']


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Assuming `vectors` is the result of the CountVectorizer transformation
similarity_matrix = cosine_similarity(vectorized_data)

In [42]:
# View the shape of the cosine similarity matrix
print(similarity_matrix.shape)

(1493, 1493)


In [43]:
# Print the similarity matrix (optional)
print(similarity_matrix)

[[1.         0.0877058  0.05751973 ... 0.         0.0541332  0.02470831]
 [0.0877058  1.         0.06558258 ... 0.048795   0.         0.        ]
 [0.05751973 0.06558258 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.048795   0.         ... 1.         0.         0.0412393 ]
 [0.0541332  0.         0.         ... 0.         1.         0.        ]
 [0.02470831 0.         0.         ... 0.0412393  0.         1.        ]]


In [44]:
print(list(enumerate(similarity_matrix[0])))

[(0, 1.0000000000000002), (1, 0.08770580193070292), (2, 0.05751973085430508), (3, 0.03698001308168194), (4, 0.17505471314812437), (5, 0.11556254088025608), (6, 0.018248296715045298), (7, 0.16526725762745997), (8, 0.05751973085430508), (9, 0.07325794357582566), (10, 0.07502344849205331), (11, 0.08920515501750789), (12, 0.04169381992250833), (13, 0.11149893466761211), (14, 0.07897471897389846), (15, 0.14011213457945124), (16, 0.10699012312772824), (17, 0.07944581222989573), (18, 0.05510387687779837), (19, 0.09421114395319916), (20, 0.05947010334500526), (21, 0.09421114395319916), (22, 0.03433508017497365), (23, 0.16477051091432687), (24, 0.11694106924093722), (25, 0.063077570029677), (26, 0.0716114874039433), (27, 0.15701857325533194), (28, 0.08968970586617499), (29, 0.09078412990032037), (30, 0.17505471314812435), (31, 0.08238525545716345), (32, 0.06004805767690767), (33, 0.08006407690254357), (34, 0.05128205128205128), (35, 0.13725270326150324), (36, 0.0), (37, 0.09342836717341034), (3

In [45]:
# Sort the similarity scores for the first movie in descending order
sorted_similarities = sorted(list(enumerate(similarity_matrix[0])), key=lambda x: x[1], reverse=True)

# Print the sorted similarity scores for the first movie
print(sorted_similarities)

[(0, 1.0000000000000002), (188, 0.23788041338002103), (331, 0.23112508176051216), (579, 0.23000322710873397), (51, 0.22645540682891915), (1243, 0.21790681682020446), (61, 0.2172620473133704), (296, 0.21483446221182984), (610, 0.20033416898825337), (285, 0.20033416898825335), (118, 0.20016019225635892), (884, 0.1951231356683212), (574, 0.1921537845661046), (1091, 0.18923271008903098), (160, 0.1849000654084097), (40, 0.1754116038614058), (4, 0.17505471314812437), (30, 0.17505471314812435), (208, 0.17476551841063892), (1228, 0.17295817388759027), (1044, 0.17255919256291524), (681, 0.17093520167692652), (607, 0.1706971854997297), (55, 0.16984155512168933), (117, 0.16531163063339513), (7, 0.16526725762745997), (23, 0.16477051091432687), (87, 0.1617875572323585), (44, 0.16012815380508713), (27, 0.15701857325533194), (1143, 0.15695698526580623), (139, 0.1539670545751043), (73, 0.14867525836251314), (1246, 0.14867525836251314), (293, 0.146943671674129), (325, 0.14617633655117152), (665, 0.1432

In [46]:
# Get the top 5 similar movies (excluding the first movie itself)
top_5_similar_movies = sorted_similarities[1:6]  # Exclude the first movie (similarity to itself)

# Print the top 5 similar movies
print(top_5_similar_movies)


[(188, 0.23788041338002103), (331, 0.23112508176051216), (579, 0.23000322710873397), (51, 0.22645540682891915), (1243, 0.21790681682020446)]


In [47]:
def recommend(movie):
    # Get the index of the movie that matches the title
    movie_index = new_df[new_df['title'] == movie].index[0]  # Fix: Get the index of the specified movie

    # Get the distances (similarity scores) for the specified movie
    distances = similarity_matrix[movie_index]  # Use the correct variable name `similarity_matrix`

    # Sort the movies based on their similarity scores in descending order
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]  # Exclude the first movie itself

    # Print the titles of the top 5 similar movies
    for i in movies_list:
        print(new_df.iloc[i[0]]['title'])  # Use iloc to get the title based on the index

    return 




In [48]:
# Example usage
recommend('Batman Begins')

Kung Fu Panda
How to Train Your Dragon 2
Kung Fu Panda 2
Cars 2
Now You See Me 2


In [49]:
import pickle

In [50]:
# Save the DataFrame `new_df` as a pickle file
pickle.dump(new_df, open('movies.pkl', 'wb'))


In [51]:
titles = new_df['title'].values
print(titles)

['Avatar' "Pirates of the Caribbean: At World's End" 'Spectre' ...
 'Tin Can Man' 'Primer' 'Shanghai Calling']


In [56]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [57]:
pickle.dump(similarity_matrix, open('similarity_matrix.pkl', 'wb'))