# Feature Extraction Using TF-IDF Vectorizer

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the TMDB 5000 movies metadata from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/tmdb-5000-movies-csv/tmdb_5000_movies.csv")
print(df.head(2))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Replace missing overviews with empty strings so every row holds text.
df['overview'] = df['overview'].fillna('')

# Initialize TF-IDF Vectorizer, dropping common English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the overview column; astype('U') casts the values to
# unicode strings before they reach the vectorizer.
tfidf_matrix = tfidf.fit_transform(df['overview'].values.astype('U'))

print("Shape of TF-IDF matrix:", tfidf_matrix.shape)




      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   

                                       homepage     id  \
0                   http://www.avatarmovie.com/  19995   
1  http://disney.go.com/disneypictures/pirates/    285   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   

                                            overview  popularity  \
0  In the 22nd century, a paraplegic Marine is di...  150.437577   
1  Captain Barbossa, long believed to be dead, ha...  139.082615   

                                production_companies 

# Similarity Calculation Using Cosine Similarity 

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the TF-IDF vectors of all overviews.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Shape of cosine similarity matrix:", cosine_sim.shape)

# Label both axes with the movie titles so scores can be read by name.
movie_titles = df['title']
sim_df = pd.DataFrame(cosine_sim, index=movie_titles, columns=movie_titles)

sim_df.head()


Shape of cosine similarity matrix: (4803, 4803)


title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,1.0,0.0,0.0,0.024995,0.0,0.030353,0.0,0.037581,0.0,0.0,...,0.0,0.0,0.029175,0.042176,0.0,0.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,0.0,1.0,0.0,0.0,0.033369,0.0,0.0,0.022676,0.0,0.0,...,0.0,0.0,0.006895,0.0,0.0,0.0,0.0,0.021605,0.0,0.0
Spectre,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.030949,0.02483,0.0,...,0.027695,0.0,0.0,0.0,0.017768,0.0,0.0,0.014882,0.0,0.0
The Dark Knight Rises,0.024995,0.0,0.0,1.0,0.010433,0.005145,0.012601,0.026954,0.020652,0.13374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033864,0.042752,0.022692
John Carter,0.0,0.033369,0.0,0.010433,1.0,0.0,0.009339,0.037407,0.0,0.017148,...,0.01273,0.0,0.0,0.0,0.0,0.0,0.0,0.006126,0.0,0.0


# Build the Recommender Function

In [3]:
def recommend(movie_title, cosine_sim=cosine_sim, df=df, top_n=5):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df['title']``. If several rows share the
        title, the first matching row is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` is read as the scores of the
        movie at row position ``i`` of ``df`` against every movie.
    df : pandas.DataFrame
        Movie table with a ``'title'`` column, in the same row order as
        ``cosine_sim``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` other movies, highest similarity first.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Locate the movie by row position rather than index label: both the
    # cosine_sim row lookup and the final .iloc selection are positional.
    positions = (df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in df['title']")
    idx = int(positions[0])

    # Pair every other movie's position with its score. The query movie is
    # dropped explicitly instead of assuming it sorts first, which fails when
    # another movie ties with it (e.g. identical or empty overviews).
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the row positions of the best matches and return their titles.
    movie_indices = [position for position, _ in candidates[:top_n]]
    return df['title'].iloc[movie_indices]

# Demo: show the recommendations for a couple of well-known titles.
demo_queries = (
    ("Recommendations for Avatar:", "Avatar"),
    ("\nRecommendations for The Dark Knight Rises:", "The Dark Knight Rises"),
)
for header, title in demo_queries:
    print(header)
    print(recommend(title))



Recommendations for Avatar:
3604               Apollo 18
2130            The American
634               The Matrix
1341    The Inhabited Island
529         Tears of the Sun
Name: title, dtype: object

Recommendations for The Dark Knight Rises:
65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object
