In [1]:
import pandas as pd
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

df = movies.merge(credits,on= 'title')

In [2]:
import ast
def safe_to_list(obj):
    if pd.isna(obj):
        return []
    if isinstance(obj,list):
        return obj
    if isinstance(obj,str):
        v = ast.literal_eval(obj)
        if isinstance(v,list):
            return v
        else:
            return [v]
    else:
        return [obj]

def get_names(obj):
    names = []
    if not isinstance(obj,list):
        return []
    for item in obj:
        names.append(item['name'])
    return names    

def get_director(obj):
    try:
        if isinstance(obj,list):
            for item in obj:
                if item['job']=='Director':
                    return item['name']        
    except Exception:
        return ""
                
def top_actor(obj):
    names = []
    count = 0
    if not isinstance(obj,list):
        return []
    for item in obj:
        if count!=3:
          names.append(item)
          count+=1
        else:
            break
    return names       

In [3]:
cols_to_list = ['genres','crew','cast','keywords']
for c in cols_to_list:
    df[c] = df[c].apply(safe_to_list)

In [4]:
df['genres'] = df['genres'].apply(get_names)
df['cast'] = df['cast'].apply(get_names)
df['director'] = df['crew'].apply(get_director)
df['keywords'] = df['keywords'].apply(get_names)

In [5]:
df['top_cast'] = df['cast'].apply(top_actor)

In [6]:
df = df.drop(['cast','homepage','revenue','runtime','popularity'],axis=1)

In [7]:
df.shape

(4809, 20)

In [8]:
df = df[['title','genres','top_cast','director','keywords','overview']].copy()

In [9]:
df.head()

Unnamed: 0,title,genres,top_cast,director,keywords,overview
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha..."
2,Spectre,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...
4,John Carter,"[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca..."


In [10]:
df.head()

Unnamed: 0,title,genres,top_cast,director,keywords,overview
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha..."
2,Spectre,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...
4,John Carter,"[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca..."


In [11]:
def add_rows(obj): # row -> 
    if not isinstance(obj,list):
        return ""
    return " ".join( str(s) for s in obj)

In [12]:
def build_text_for_bert(row):
    parts = []

    parts.append(add_rows(row['genres']))
    parts.append(add_rows(row['top_cast']))
    parts.append(add_rows(row['keywords']))

    if isinstance(row['director'],str):
        parts.append(row['director'])

    if isinstance(row['overview'],str):
        parts.append(row['overview'])    
    
    return " ".join(parts)
    

In [13]:
df['text_for_bert'] = df.apply(build_text_for_bert ,axis=1)

In [14]:
df['text_for_bert']

0       Action Adventure Fantasy Science Fiction Sam W...
1       Adventure Fantasy Action Johnny Depp Orlando B...
2       Action Adventure Crime Daniel Craig Christoph ...
3       Action Crime Drama Thriller Christian Bale Mic...
4       Action Adventure Science Fiction Taylor Kitsch...
                              ...                        
4804    Action Crime Thriller Carlos Gallardo Jaime de...
4805    Comedy Romance Edward Burns Kerry Bishé Marsha...
4806    Comedy Drama Romance TV Movie Eric Mabius Kris...
4807     Daniel Henney Eliza Coupe Bill Paxton  Daniel...
4808    Documentary Drew Barrymore Brian Herzlinger Co...
Name: text_for_bert, Length: 4809, dtype: object

In [15]:
from sentence_transformers import SentenceTransformer
import numpy as np

bert_model = SentenceTransformer('all-MiniLm-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
bert_model.max_seq_length = 64
text = df['text_for_bert'].fillna("").tolist()

embedding = bert_model.encode(
    text,
    batch_size=4,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

np.save("bert_embedding.npy",embedding)


Batches: 100%|██████████| 1203/1203 [00:28<00:00, 42.01it/s]


In [19]:
df['title_clean'] = df['title'].str.lower().str.strip()

In [20]:
indices = pd.Series(df.index , index=df['title_clean']).drop_duplicates()

# Recommendation logic 

In [None]:
def clean_title(t):
    if isinstance(t,str):
        return t.lower().strip()
    return ""

In [None]:
from difflib import get_close_matches

def recommendation(title,topn=10):
    title_key = clean_title(title)
    if title_key not in indices:
        choices = get_close_matches(title_key,df['title_clean'].tolist(),n=6,cutoff=0.6)
        if len(choices)==0:
            return f"Sorry we could not find {title} ",None
        return f"{title} is not found did u mean ", choices
    idx = indices[title_key]
    query = embedding[idx]
    sim_score = np.dot(embedding,query)
    top_indices = np.argsort(sim_score)[::-1]
    top_indices = top_indices[top_indices!=idx]
    top_indices = top_indices[:topn]
    top_score = sim_score[top_indices]

    results = df.iloc[top_indices][['title','genres','director','overview']].copy()
    results['score'] = top_score
    results = results.reset_index(drop=True)
    return title , results 

In [None]:
# data - csv , embedding - npy  , indices - joblib 

('Avatar',
                                           title  \
 0                       Star Trek Into Darkness   
 1  Star Wars: Episode II - Attack of the Clones   
 2                             Jupiter Ascending   
 3                                        Aliens   
 4                          The Inhabited Island   
 5                                    Barbarella   
 6                                      Serenity   
 7                                    Battleship   
 8                                     Moonraker   
 9                                      Æon Flux   
 
                                            genres           director  \
 0            [Action, Adventure, Science Fiction]        J.J. Abrams   
 1            [Adventure, Action, Science Fiction]       George Lucas   
 2   [Science Fiction, Fantasy, Action, Adventure]    Lilly Wachowski   
 3     [Horror, Action, Thriller, Science Fiction]      James Cameron   
 4    [Action, Fantasy, Science Fiction, Thriller]