## 1. Import pandas :

In [4]:
import pandas as pd

## 2. Load Dataset :

In [7]:
movies = pd.read_csv('top10K-TMDB-movies.csv')

movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


## 3. Fearure Selection (Select most Important columns) :

In [10]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [12]:
movies = movies[['id','title','genre','overview']]
movies.head()

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...


## 4. Create tags :

In [15]:
movies['tags']=movies['genre']+movies['overview']
movies['tags']

0       Drama,CrimeFramed in the 1940s for the double ...
1       Comedy,Drama,RomanceRaj is a rich, carefree, h...
2       Drama,CrimeSpanning the years 1945 to 1955, a ...
3       Drama,History,WarThe true story of how busines...
4       Drama,CrimeIn the continuing saga of the Corle...
                              ...                        
9995    Action,Adventure,FantasyThe story follows the ...
9996    Action,TV Movie,Science Fiction,Comedy,Adventu...
9997    Action,Science Fiction,WarDuring World War II,...
9998    Adventure,Fantasy,Action,DramaA man named Farm...
9999    Thriller,Action,CrimeSeeking justice for his p...
Name: tags, Length: 10000, dtype: object

In [17]:
movies.head()

Unnamed: 0,id,title,genre,overview,tags
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...","Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,"Drama,CrimeIn the continuing saga of the Corle..."


## 5. Drop columns :

In [20]:
new_data = movies.drop(columns=['genre','overview'])
new_data

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


## 6. Vectorization :

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
cv = CountVectorizer(max_features=10000, stop_words='english') 
# 10000 movies are available in this data so ,we declare max_features=10000

In [25]:
cv

In [29]:
vector = cv.fit_transform(new_data['tags'].values.astype('U')).toarray() 

# The 'U' in astype('U') specifies that the data should be treated as Unicode strings
#In this context, using 'U' helps ensure that each entry in new_data['tags'] is treated as a string, 
#which is essential for vectorization (e.g., transforming text into numerical vectors) to work correctly

#The toarray() method is typically used to convert a sparse matrix representation into a dense array format

In [30]:
vector.shape

(10000, 10000)

## 7. find similarity using cosine_similarity :

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
similarity = cosine_similarity(vector)

In [37]:
similarity

array([[1.        , 0.06253054, 0.05802589, ..., 0.07963978, 0.07597372,
        0.03798686],
       [0.06253054, 1.        , 0.08980265, ..., 0.        , 0.        ,
        0.        ],
       [0.05802589, 0.08980265, 1.        , ..., 0.02541643, 0.03636965,
        0.        ],
       ...,
       [0.07963978, 0.        , 0.02541643, ..., 1.        , 0.03327792,
        0.03327792],
       [0.07597372, 0.        , 0.03636965, ..., 0.03327792, 1.        ,
        0.04761905],
       [0.03798686, 0.        , 0.        , ..., 0.03327792, 0.04761905,
        1.        ]])

## 8. find the index :

In [41]:
new_data[new_data['title'] == 'Captain America'].index[0]	

9997

## 9. find similarity scores for a specific item :

In [44]:
distance = sorted(list(enumerate(similarity[9997])), reverse=True, key=lambda vector:vector[1])

#similarity[9997]: This accesses the similarity scores for the item at index 9997. 

#enumerate(similarity[9997]): This adds an index to each similarity score, creating pairs of (index, score). 

#list(): This converts the enumerated object into a list of tuples.

#sorted(reverse=True, key=lambda vector: vector[1]): This sorts the list in descending order based on the similarity scores. 
#The key=lambda vector: vector[1] means that the sorting is done based on the second element of each tuple (the similarity score).
#reverse=True makes the sorting order descending.


## 10. top 5 most similar items

In [47]:
for i in distance[0:5]:
    print(new_data.iloc[i[0]].title)

#.title: This accesses the title column of the retrieved row, which contains the title of the item.

Captain America
Captain America: The First Avenger
Team Thor
Captain America: The Winter Soldier
Ultimate Avengers: The Movie


## 11. recommend function is designed to recommend movies based on a given movie title.

In [50]:
def recommend(movie_title):
    index = new_data[new_data['title'] == movie_title].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[0:5]:
        print(new_data.iloc[i[0]].title)

In [52]:
recommend('Captain America')

Captain America
Captain America: The First Avenger
Team Thor
Captain America: The Winter Soldier
Ultimate Avengers: The Movie


In [56]:
movie_title = input('Enter title of the movie : ')
print('Recommended movies are :')
recommend(movie_title)

Enter title of the movie :  The Godfather


Recommended movies are :
The Godfather
The Godfather: Part II
Felon
House of Gucci
Gotti


In [58]:
import pickle

In [60]:
pickle.dump(new_data, open('movies_list.pkl', 'wb'))

In [62]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [63]:
pickle.load(open('movies_list.pkl', 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


In [66]:
pickle.load(open('similarity.pkl', 'rb'))

array([[1.        , 0.06253054, 0.05802589, ..., 0.07963978, 0.07597372,
        0.03798686],
       [0.06253054, 1.        , 0.08980265, ..., 0.        , 0.        ,
        0.        ],
       [0.05802589, 0.08980265, 1.        , ..., 0.02541643, 0.03636965,
        0.        ],
       ...,
       [0.07963978, 0.        , 0.02541643, ..., 1.        , 0.03327792,
        0.03327792],
       [0.07597372, 0.        , 0.03636965, ..., 0.03327792, 1.        ,
        0.04761905],
       [0.03798686, 0.        , 0.        , ..., 0.03327792, 0.04761905,
        1.        ]])