In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("movies.csv")
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
df.shape

(9742, 3)

In [4]:
df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
df["genres"].isnull().sum()

0

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(df['genres'])
feature_vectors.toarray()

array([[0.        , 0.41684567, 0.51622547, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.51236121, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.57860574, 0.        , 0.81560738, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [7]:
vectorizer.get_feature_names_out()

array(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres',
       'horror', 'imax', 'listed', 'musical', 'mystery', 'no', 'noir',
       'romance', 'sci', 'thriller', 'war', 'western'], dtype=object)

In [8]:
pd.DataFrame(vectorizer.fit_transform(df['genres']).toarray(),columns=vectorizer.get_feature_names_out())

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


# 1,2,3,4,5 - 1 row
# 1,2,3,4,5 - 1 col, 
# write as marix 
# (1,1) (1,2) (1,3) (1,4) (1,5)
# (2,1) (2,2) (2,3)..so on

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(feature_vectors)
similarity


array([[1.        , 0.81357774, 0.15276924, ..., 0.        , 0.4210373 ,
        0.26758648],
       [0.81357774, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.15276924, 0.        , 1.        , ..., 0.        , 0.        ,
        0.57091541],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.4210373 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.26758648, 0.        , 0.57091541, ..., 0.        , 0.        ,
        1.        ]])

In [10]:
print(similarity.shape)

(9742, 9742)


In [11]:
list_of_all_titles = df["title"].tolist()
list_of_all_titles

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'It Takes Two (1995)',
 'Clueless (1995)',
 'Cry, the Beloved Country

In [12]:
Movie_name = input("enter the movie name: ")

enter the movie name:  toy story


In [13]:
import difflib

find_close_match = difflib.get_close_matches(Movie_name,list_of_all_titles,n=5,cutoff=0.1)

# cutoff percentage of match, n is for no.of movies suggests
# it gives relative genres movies...
#if u type exactly as the movie name...it gives that movie too

print(find_close_match) 

['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)', 'Love Story (1970)', 'Holy Motors (2012)']


In [14]:
close_match = find_close_match[0]
close_match

'Toy Story (1995)'

In [15]:
df[df.title==close_match]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [16]:
index_of_movie = df[df.title==close_match].index
index_of_movie

Index([0], dtype='int64')

In [17]:
index_of_movie = df[df.title==close_match].index
index_of_movie

Index([0], dtype='int64')

In [18]:
index_of_movie = df[df.title==close_match].index.values
index_of_movie

array([0], dtype=int64)

In [19]:
# index_of_movie = df[df["title"]==close_match].index.values[0]

index_of_movie = df[df.title==close_match].index.values[0]
index_of_movie

0

# enumerate means, 
# if l = [1,2,3,4,5]
# enumerate(l)
# means it gives = (0,1) (1,2) (2,3)...so on, 
# enumerate = (index,similarity) 
# nv ichina movie index tho okkokka movie similarity % 

In [20]:
similarity_score = list(enumerate(similarity[index_of_movie]))
#print(similarity_score)

In [21]:
sorted_movies = sorted(similarity_score,key = lambda x:x[1], reverse = True)
# Here, lambda x: x[1] is a lambda function that takes an element x 
# (which is a tuple (index, similarity_score)) and returns x[1],
# which is the similarity score.
# descending order (reverse=True).
#print(sorted_movies)

In [22]:
print("movies suggested for u : \n")
i = 1
for j in sorted_movies:
    # top 10 lo unna anni okkokati print avuthunnayi
    # sorted elaa vachindo check cheyyi
    # sorted movies ~ similarity ~ index of movie ~ close match
    index=j[0]
    title_from_index = df[df.index==index]["title"].values[0]
    if(i<=10):
        print(i, " ",title_from_index)
        i+=1

movies suggested for u : 

1   Toy Story (1995)
2   Antz (1998)
3   Toy Story 2 (1999)
4   Adventures of Rocky and Bullwinkle, The (2000)
5   Emperor's New Groove, The (2000)
6   Monsters, Inc. (2001)
7   Wild, The (2006)
8   Shrek the Third (2007)
9   Tale of Despereaux, The (2008)
10   Asterix and the Vikings (Astérix et les Vikings) (2006)
