In [1]:
import pandas as pd

df=pd.read_csv("/Users/lanaa/Downloads/Telegram Desktop/TMDB_movie_dataset_v11.csv")
df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [None]:
#dropping unwanted features (columns)
df=df.drop(['vote_count','status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 
            'budget','homepage', 'imdb_id', 'original_language','original_title',
              'overview', 'tagline', 'production_companies','production_countries', 'spoken_languages'], axis=1)
df.head()

Unnamed: 0,id,title,vote_average,popularity,poster_path,genres,keywords
0,27205,Inception,8.364,83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure","new york city, superhero, shield, based on com..."


In [3]:
#removing null values
df=df.dropna()

In [4]:
#filtering out very low rated or empty genre movies
df=df[df['vote_average'] >= 5]

In [5]:
#combine wanted features
def combine_features(row):
    return f"title: {row['title']} || genres: {row['genres']} || keywords: {row['keywords']}"

df['text']=df.apply(combine_features, axis=1)

In [6]:
#shuffling dataset
df_shuffled=df.sample(frac=1,random_state=42).reset_index(drop=True)

In [7]:
import random
from tqdm import tqdm 
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

def generate_pairs(df, num_pairs=20000, model_name='all-MiniLM-L6-v2'):
    #pre-compute embeddings for better negative sampling
    base_model=SentenceTransformer(model_name)
    embeddings=base_model.encode(df['text'].tolist(), show_progress_bar=True)
    
    #initialize KNN for finding similar/dissimilar items
    nn=NearestNeighbors(n_neighbors=50)
    nn.fit(embeddings)
    
    pairs=[]
    genre_groups=df.groupby('genres')
    
    # Positive pairs - from same genre cluster but not too similar
    for _, group in tqdm(genre_groups, desc="Generating positive pairs"):
        if len(group) >= 2:
            indices = group.index.tolist()
            # Get random pairs from the same genre
            for _ in range(min(5, len(group) // 2)):  # Limit pairs per genre
                idx1, idx2 = random.sample(indices, 2)
                pairs.append((df.loc[idx1]['text'], df.loc[idx2]['text'], 1))
    
    # Negative pairs - ensure true dissimilarity
    for _ in tqdm(range(num_pairs // 2), desc="Generating negative pairs"):
        # Find two random movies
        idx1, idx2 = random.sample(range(len(df)), 2)
        
        #ensure they're from different genres and semantically dissimilar
        genre_overlap = set(df.iloc[idx1]['genres'].split('|')) & set(df.iloc[idx2]['genres'].split('|'))
        semantic_distance = np.linalg.norm(embeddings[idx1] - embeddings[idx2])
        
        if not genre_overlap and semantic_distance > 1.5:  # Strict conditions
            pairs.append((df.iloc[idx1]['text'], df.iloc[idx2]['text'], 0))
    
    pairs_df = pd.DataFrame(pairs, columns=['text1', 'text2', 'label'])
    return pairs_df.drop_duplicates(), embeddings

pairs_df, movie_embeddings=generate_pairs(df)




Batches:   0%|          | 0/3582 [00:00<?, ?it/s]

Generating positive pairs: 100%|██████████| 7249/7249 [00:01<00:00, 6582.54it/s]
Generating negative pairs: 100%|██████████| 10000/10000 [00:01<00:00, 7151.81it/s]


In [8]:
#train/test split
from sklearn.model_selection import train_test_split
from sentence_transformers import InputExample

train_df, val_df = train_test_split(pairs_df, test_size=0.2,random_state=42, stratify=pairs_df['label'] )
train_examples = [ InputExample(texts=[row['text1'], row['text2']], label=row['label']) 
        for _, row in train_df.iterrows()]
val_examples = [InputExample(texts=[row['text1'], row['text2']], label=row['label']) 
        for _, row in val_df.iterrows()]

In [9]:
#fine-tuning
from sentence_transformers import losses, evaluation, util
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report


model=SentenceTransformer('all-MiniLM-L6-v2')
train_dataloader=DataLoader(train_examples, shuffle=True, batch_size=32,drop_last=True )

#contrastive Loss with margin to push apart negatives and pull together positives
train_loss= losses.ContrastiveLoss(
        model=model,
        margin=0.5,  #prevents overfitting
        size_average=True
    )

#for early stopping
evaluator = evaluation.BinaryClassificationEvaluator.from_input_examples( val_examples, name='movie-val',show_progress_bar=True)

def evaluate_model(model, val_examples, threshold=0.5):
    texts1 = [ex.texts[0] for ex in val_examples]
    texts2 = [ex.texts[1] for ex in val_examples]
    
    embeddings1 = model.encode(texts1, convert_to_tensor=True)
    embeddings2 = model.encode(texts2, convert_to_tensor=True)
    
    cos_scores = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
    predictions = (cos_scores > threshold).astype(int)
    true_labels = np.array([ex.label for ex in val_examples])
    
    print("\n" + "="*60)
    print("Detailed Classification Report")
    print("="*60)
    print(classification_report(true_labels, predictions, 
                              target_names=["Dissimilar", "Similar"]))
    
    return {
        'accuracy': np.mean(predictions == true_labels),
        'cosine_scores': cos_scores
    }


#training loop
model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=5,
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=500,
        output_path='./movie_model',
        save_best_model=True,
        optimizer_params={'lr': 2e-5},
        weight_decay=0.01
    ) 


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Movie-val Cosine Accuracy,Movie-val Cosine Accuracy Threshold,Movie-val Cosine F1,Movie-val Cosine F1 Threshold,Movie-val Cosine Precision,Movie-val Cosine Recall,Movie-val Cosine Ap,Movie-val Cosine Mcc
179,No log,No log,0.999306,0.962405,0.999653,0.962405,1.0,0.999306,1.0,0.0
358,No log,No log,0.999306,0.976326,0.999653,0.976326,1.0,0.999306,1.0,0.0
500,0.005800,No log,0.999306,0.98072,0.999653,0.98072,1.0,0.999306,1.0,0.0
537,0.005800,No log,0.999306,0.981446,0.999653,0.981446,1.0,0.999306,1.0,0.0
716,0.005800,No log,0.999306,0.983768,0.999653,0.983768,1.0,0.999306,1.0,0.0
895,0.005800,No log,0.999306,0.984479,0.999653,0.984479,1.0,0.999306,1.0,0.0


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

In [None]:
#saving for use on streamlit
model.save('C:/Users/lanaa/Downloads/imdb_fine_tuned_model1')
df[['title', 'poster_path', 'text']].to_csv('C:/Users/lanaa/Downloads/movie_metadata1.csv', index=False)
np.save('C:/Users/lanaa/Downloads/movie_embeddings1.npy', movie_embeddings)


Training complete! Model and artifacts saved.


In [14]:
#AI component
from transformers import pipeline

classifier=pipeline("zero-shot-classification",model="facebook/bart-large-mnli") #BERT variant

#define genres from dataset
all_genres = set()

#split genre strings and flatten (if "Action|Sci-Fi" or "Action, Sci-Fi")
for genres in df['genres']:
    if '|' in genres:
        all_genres.update(genres.split('|'))
    elif ',' in genres:
        all_genres.update([g.strip() for g in genres.split(',')])
    else:
        all_genres.add(genres.strip())

# Convert to sorted list
genre_list = sorted(list(all_genres))
print(f"Extracted {len(genre_list)} genres: {genre_list[:5]}...")

#analyze query
user_query="comedy movie with family"
result= classifier(user_query, genre_list, multi_label=True)

print("Query: ",user_query)
print("Detected genres:", result["labels"][:3]) #max 3 genres

Device set to use cpu


Extracted 19 genres: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime']...
Query:  comedy movie with family
Detected genres: ['Comedy', 'Family', 'TV Movie']
