In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('validated_data.csv') 
df['tags'].fillna('', inplace=True)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tags'].fillna('', inplace=True)


Unnamed: 0    0
id            0
title         0
tags          0
dtype: int64

**Will use NLTK library for stemming**

*Will get 5000 or 10000 most common words from tags

In [2]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


from tqdm import tqdm


def stem(text):
    data = []
    for word in text.split():
        data.append(ps.stem(word))
    
    return " ".join(data)

In [3]:
# stem(df['tags'][0])

In [4]:
df['tags'] = df['tags'].apply(stem)
print(df)

      Unnamed: 0      id                                     title  \
0              0   19995                                    Avatar   
1              1     285  Pirates of the Caribbean: At World's End   
2              2  206647                                   Spectre   
3              3   49026                     The Dark Knight Rises   
4              4   49529                               John Carter   
...          ...     ...                                       ...   
4801        4804    9367                               El Mariachi   
4802        4805   72766                                 Newlyweds   
4803        4806  231617                 Signed, Sealed, Delivered   
4804        4807  126186                          Shanghai Calling   
4805        4808   25975                         My Date with Drew   

                                                   tags  
0     in the 22nd century, a parapleg marin is dispa...  
1     captain barbossa, long believ to be d

**Now doing counter vercorization using sklearn**

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [16]:
vectors = cv.fit_transform(df['tags']).toarray()
vectors


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
print(vectors.shape)
cv.get_feature_names_out()

(4806, 5000)


array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [9]:
# similarity[0]

In [10]:
# list(enumerate(similarity[0]))

In [19]:
sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x: x[1])

[(0, 1.0000000000000002),
 (1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.2503866978335957),
 (582, 0.24511108480187255),
 (1202, 0.24455799402225925),
 (1192, 0.2367785320221084),
 (61, 0.23179316248638274),
 (778, 0.23174488732966075),
 (4046, 0.2278389747471728),
 (1916, 0.22528177844479153),
 (2782, 0.21853668936906193),
 (172, 0.21239769762143662),
 (972, 0.2108663315950723),
 (322, 0.2105263157894737),
 (2329, 0.20443988269091456),
 (3606, 0.20437977982832192),
 (260, 0.20395079136182276),
 (151, 0.2029530274475215),
 (4190, 0.2029530274475215),
 (1440, 0.20277677641345318),
 (74, 0.20246457179963143),
 (1087, 0.2020475485519274),
 (3671, 0.1979082783981174),
 (973, 0.19767387315371682),
 (577, 0.1976738731537168),
 (47, 0.19672236884115843),
 (2969, 0.19252140716412977),
 (942, 0.19134594929397597),
 (495, 0.19088542889273336),
 (1199, 0.19088542889273336),
 (305, 0.1900748713929803),
 (4, 0.18929940971

In [12]:
# sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

In [20]:
def recommend(movie):
    index = df[df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(df.iloc[i[0]].title)


In [23]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.
