In [1]:
import os, re, joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

In [3]:

# Location of the content CSV. Set the AIO_DATA_PATH environment variable to
# load the file from somewhere else; when it is unset the original path below
# is used. The default is a raw string so backslashes aren't interpreted as
# escape sequences.
DATA_PATH = os.environ.get(
    'AIO_DATA_PATH',
    r'G:\VSCODE\Python And Data Sceince\Advance AI Projects\aio protype system\Data\all_content_with_posters.csv',
)
df = pd.read_csv(DATA_PATH)
print("Rows, cols:", df.shape)
df.head()

Rows, cols: (5850, 11)


Unnamed: 0,id,type,name,genres,cast,crew,overview,runtime,meta_text,poster_path,poster_url
0,1062722,movie,frankenstein,"drama, horror, fantasy","oscar isaac, jacob elordi, christoph waltz, mi...","guillermo del toro, guillermo del toro","dr victor frankenstein, a brilliant but egotis...",150.0,"dr victor frankenstein, a brilliant but egotis...",/g4JtvGlQO7DByTI6frUobqvSL3R.jpg,https://image.tmdb.org/t/p/w342/g4JtvGlQO7DByT...
1,1248226,movie,playdate,"action, comedy","kevin james, alan ritchson, sarah chalke, isla...","neil goldman, luke greenfield",when out of work accountant brian joins stay a...,93.0,when out of work accountant brian joins stay a...,/fGodXWqJkkkbSebPIlxLSygV8GY.jpg,https://image.tmdb.org/t/p/w342/fGodXWqJkkkbSe...
2,1242898,movie,predator badlands,"action, science fiction, adventure","elle fanning, dimitrius schuster koloamatangi,...",dan trachtenberg,"cast out from his clan, a young predator finds...",107.0,"cast out from his clan, a young predator finds...",/ef2QSeBkrYhAdfsWGXmp0lvH0T1.jpg,https://image.tmdb.org/t/p/w342/ef2QSeBkrYhAdf...
3,1161617,movie,code 3,"action, comedy","rainn wilson, lil rel howery, aimee carrero, y...",christopher leone,a burned out paramedic tries to survive his la...,100.0,a burned out paramedic tries to survive his la...,/gIAYMDb5mIAeCAj76q1sRsKjkzo.jpg,https://image.tmdb.org/t/p/w342/gIAYMDb5mIAeCA...
4,1197137,movie,black phone 2,"horror, thriller","ethan hawke, mason thames, madeleine mcgraw, d...","scott derrickson, c robert cargill, scott derr...","four years after escaping the grabber, finney ...",114.0,"four years after escaping the grabber, finney ...",/xUWUODKPIilQoFUzjHM6wKJkP3Y.jpg,https://image.tmdb.org/t/p/w342/xUWUODKPIilQoF...


In [4]:
def clean_text(s):
    """Normalize free text to lowercase ASCII letters, digits, spaces and commas.

    Args:
        s: Any value. Missing values (``None``, ``NaN``, ``pd.NA``) yield ``''``;
            everything else is converted with ``str()`` first.

    Returns:
        The cleaned string: accents folded to their base letter, lowercased,
        every character outside ``a-z0-9 ,`` replaced by a space, runs of
        whitespace collapsed to one space, and leading/trailing space removed.
    """
    # Local import so this function does not depend on the notebook's import cell.
    import unicodedata

    if pd.isna(s):
        return ''
    # Decompose accented characters (e.g. "é" -> "e" + combining accent) and drop
    # the combining marks. Without this the ASCII-only filter below would split
    # words at every accented letter ("amélie" -> "am lie").
    decomposed = unicodedata.normalize('NFKD', str(s))
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    # Anything that is not a lowercase letter, digit, space or comma becomes a space.
    text = re.sub(r'[^a-z0-9 ,]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

In [5]:
# Make every free-text column that is present a plain string (NaN -> '') so the
# concatenation below never sees a missing value.
text_columns = ['name','genres','overview','cast','crew']
for column in text_columns:
    if column not in df.columns:
        continue
    df[column] = df[column].fillna('').astype(str)

# One space-separated document per row, in the order name, genres, overview,
# cast, crew, then normalized with clean_text.
document = df['name']
for column in text_columns[1:]:
    document = document + ' ' + df[column]
df['combined_text'] = document.apply(clean_text)

# Lowercase and trim the type labels; missing labels become 'unknown'.
if 'type' in df.columns:
    type_labels = df['type'].fillna('unknown')
    df['type'] = type_labels.str.lower().str.strip()

# Derive poster_url from poster_path only when the CSV did not provide one.
has_poster_url = 'poster_url' in df.columns
if not has_poster_url and 'poster_path' in df.columns:
    df['poster_url'] = df['poster_path'].apply(
        lambda path: '' if pd.isna(path) or path == '' else f'https://image.tmdb.org/t/p/w342{path}'
    )

preview_columns = ['name','type','poster_url','combined_text']
df[preview_columns].head()

Unnamed: 0,name,type,poster_url,combined_text
0,frankenstein,movie,https://image.tmdb.org/t/p/w342/g4JtvGlQO7DByT...,"frankenstein drama, horror, fantasy dr victor ..."
1,playdate,movie,https://image.tmdb.org/t/p/w342/fGodXWqJkkkbSe...,"playdate action, comedy when out of work accou..."
2,predator badlands,movie,https://image.tmdb.org/t/p/w342/ef2QSeBkrYhAdf...,"predator badlands action, science fiction, adv..."
3,code 3,movie,https://image.tmdb.org/t/p/w342/gIAYMDb5mIAeCA...,"code 3 action, comedy a burned out paramedic t..."
4,black phone 2,movie,https://image.tmdb.org/t/p/w342/xUWUODKPIilQoF...,"black phone 2 horror, thriller four years afte..."


In [6]:
# Unigram + bigram TF-IDF over the combined text with English stop words
# removed; max_features caps the vocabulary at 50,000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
)
X_tfidf = tfidf.fit_transform(df['combined_text'])
print("TF-IDF shape:", X_tfidf.shape)

TF-IDF shape: (5850, 50000)


In [9]:
# Persist the fitted vectorizer and the document-term matrix it produced.
joblib.dump(value=tfidf, filename='tfidf_vectorizer.joblib')
joblib.dump(value=X_tfidf, filename='X_tfidf.joblib')  # sparse matrix

['X_tfidf.joblib']

In [11]:
# Map each lowercased title to its row *position* in df. The recommenders use
# this value positionally (X_tfidf[idx], df.iloc[...]), so enumerate() is used
# instead of iterrows(), which yields index labels and is much slower.
# When several rows share a title, the last one wins.
title_to_idx = {name.lower(): position for position, name in enumerate(df['name'])}
joblib.dump(title_to_idx, 'title_to_idx.joblib')
len(title_to_idx)

5519

In [12]:
from sklearn.metrics.pairwise import linear_kernel

In [13]:
def recommend_tfidf(title, type_filter=None, topn=10):
    """Return the items most similar to ``title`` by TF-IDF similarity.

    Args:
        title: Title to look up; lowercased and matched against ``title_to_idx``.
        type_filter: Optional value of the ``type`` column that results must
            match (compared lowercased). A falsy value disables filtering.
        topn: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``id``, ``type``, ``name``, ``genres``,
        ``poster_url`` and ``overview``, ordered from most to least similar.
        The looked-up row itself is never included.

    Raises:
        ValueError: If ``title`` is not a key of ``title_to_idx``.
    """
    key = title.lower()
    if key not in title_to_idx:
        raise ValueError(f"'{title}' not found in dataset")
    idx = title_to_idx[key]
    sims = linear_kernel(X_tfidf[idx], X_tfidf).flatten()
    if type_filter:
        mask = (df['type'] == type_filter.lower())
    else:
        mask = np.ones(len(df), dtype=bool)
    candidates = np.where(mask)[0]
    # Drop the query row from the candidates. Giving it a score of -1 only
    # ranked it last, so it was still returned whenever topn was at least the
    # number of candidates.
    candidates = candidates[candidates != idx]
    ranked = candidates[np.argsort(sims[candidates])[::-1]]
    top_idx = ranked[:topn]
    return df.iloc[top_idx][['id','type','name','genres','poster_url','overview']].to_dict(orient='records')

# Smoke test: recommend for the first title in the frame (swap in any title
# from your dataset).
sample_title = df['name'].iloc[0]
print(recommend_tfidf(sample_title, topn=5))

[{'id': 3036, 'type': 'movie', 'name': 'mary shelley s frankenstein', 'genres': 'drama, horror, science fiction, romance', 'poster_url': 'https://image.tmdb.org/t/p/w342/bOwCAQsZlEKrwhPi1ejY6BS8jpL.jpg', 'overview': 'victor frankenstein is a promising young doctor who, devastated by the death of his mother during childbirth, becomes obsessed with bringing the dead back to life his experiments lead to the creation of a monster, which frankenstein has put together with the remains of corpses it s not long before frankenstein regrets his actions'}, {'id': 1062722, 'type': 'movie', 'name': 'frankenstein', 'genres': 'drama, horror, fantasy', 'poster_url': 'https://image.tmdb.org/t/p/w342/g4JtvGlQO7DByTI6frUobqvSL3R.jpg', 'overview': 'dr victor frankenstein, a brilliant but egotistical scientist, brings a creature to life in a monstrous experiment that ultimately leads to the undoing of both the creator and his tragic creation'}, {'id': 3306, 'type': 'movie', 'name': 'frankenstein 80', 'genr

In [15]:
# Cell 6 - optional SBERT embeddings (needs the sentence-transformers package).
# Any failure (missing package, model load, encoding) is reported and the
# notebook carries on without `embeddings` being defined.
try:
    from sentence_transformers import SentenceTransformer

    sbert_model_name = 'all-MiniLM-L6-v2'
    sbert = SentenceTransformer(sbert_model_name)
    corpus = df['combined_text'].tolist()
    embeddings = sbert.encode(corpus, show_progress_bar=True, convert_to_numpy=True)
    # Save the embeddings together with the name of the model that made them.
    joblib.dump(embeddings, 'sbert_embeddings.joblib')
    joblib.dump(sbert_model_name, 'sbert_model_name.joblib')
    print("SBERT embeddings shape:", embeddings.shape)
except Exception as err:
    print("SBERT not available or failed to load:", err)


Batches: 100%|██████████| 183/183 [01:34<00:00,  1.93it/s]

SBERT embeddings shape: (5850, 384)





In [16]:
# Cell 7 - sbert-based recommend (defined only if `embeddings` exists)
if 'embeddings' in globals():
    def recommend_sbert(title, type_filter=None, topn=10):
        """Return the items most similar to ``title`` by SBERT cosine similarity.

        Args:
            title: Title to look up; lowercased and matched against ``title_to_idx``.
            type_filter: Optional value of the ``type`` column that results must
                match (compared lowercased). A falsy value disables filtering.
            topn: Maximum number of results to return.

        Returns:
            A list of dicts with keys ``id``, ``type``, ``name``, ``genres``,
            ``poster_url`` and ``overview``, ordered from most to least similar.
            The looked-up row itself is never included.

        Raises:
            ValueError: If ``title`` is not a key of ``title_to_idx``.
        """
        key = title.lower()
        if key not in title_to_idx:
            raise ValueError(f"'{title}' not found")
        idx = title_to_idx[key]
        q = embeddings[idx].reshape(1,-1)
        sims = cosine_similarity(q, embeddings).flatten()
        if type_filter:
            mask = (df['type'] == type_filter.lower())
        else:
            mask = np.ones(len(df), dtype=bool)
        candidates = np.where(mask)[0]
        # Drop the query row from the candidates. Giving it a score of -1 only
        # ranked it last, so it was still returned whenever topn was at least
        # the number of candidates.
        candidates = candidates[candidates != idx]
        ranked = candidates[np.argsort(sims[candidates])[::-1]]
        top_idx = ranked[:topn]
        return df.iloc[top_idx][['id','type','name','genres','poster_url','overview']].to_dict(orient='records')


In [18]:
# Cell 8 - hybrid recommend (tfidf + title overlap)
# Unigram/bigram counts over the cleaned titles only; these supply the
# title-overlap part of the hybrid score.
count_vec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
cleaned_titles = df['name'].apply(clean_text)
X_count = count_vec.fit_transform(cleaned_titles)
joblib.dump(value=count_vec, filename='count_vectorizer.joblib')
joblib.dump(value=X_count, filename='X_count.joblib')

def recommend_hybrid(title, type_filter=None, topn=10, alpha=0.7):
    """Return the items most similar to ``title`` by a TF-IDF / title-overlap blend.

    The score of each item is
    ``alpha * tfidf_similarity + (1 - alpha) * title_similarity``.

    Args:
        title: Title to look up; lowercased and matched against ``title_to_idx``.
        type_filter: Optional value of the ``type`` column that results must
            match (compared lowercased). A falsy value disables filtering.
        topn: Maximum number of results to return.
        alpha: Weight of the TF-IDF similarity; the title similarity gets
            ``1 - alpha``.

    Returns:
        A list of dicts with keys ``id``, ``type``, ``name``, ``genres``,
        ``poster_url`` and ``overview``, ordered from highest to lowest score.
        The looked-up row itself is never included.

    Raises:
        ValueError: If ``title`` is not a key of ``title_to_idx``.
    """
    key = title.lower()
    if key not in title_to_idx:
        raise ValueError(f"'{title}' not found")
    idx = title_to_idx[key]
    tfidf_sims = linear_kernel(X_tfidf[idx], X_tfidf).flatten()
    # NOTE(review): X_count holds raw term counts, so this dot product is not
    # normalized and may be on a different scale from tfidf_sims - confirm
    # that this weighting is intended.
    title_sims = linear_kernel(X_count[idx], X_count).flatten()
    hybrid = alpha * tfidf_sims + (1 - alpha) * title_sims
    if type_filter:
        mask = (df['type'] == type_filter.lower())
    else:
        mask = np.ones(len(df), dtype=bool)
    candidates = np.where(mask)[0]
    # Drop the query row from the candidates. Giving it a score of -1 only
    # ranked it last, so it was still returned whenever topn was at least the
    # number of candidates.
    candidates = candidates[candidates != idx]
    ranked = candidates[np.argsort(hybrid[candidates])[::-1]]
    top_idx = ranked[:topn]
    return df.iloc[top_idx][['id','type','name','genres','poster_url','overview']].to_dict(orient='records')


In [19]:
# Cell 9 - export minimal metadata for backend
# reset_index(drop=False) keeps the frame's index as a column so each record
# carries its row position.
meta = df[['name','type','poster_url','genres','overview','id']].reset_index(drop=False)
meta_path = 'meta_records.joblib'
joblib.dump({'meta_df': meta, 'title_to_idx': title_to_idx}, meta_path)
# Report the path that was actually written; the previous message named a
# model/ directory that this cell never writes to.
print(f"Saved metadata to {meta_path}")


Saved metadata to model/meta_records.joblib
