In [None]:
'''Summary:
1. Netflix ratings → filter liked movies
2. Group into user sequences
3. Break sequences into (input → next movie) pairs
4. Pad sequences
5. Train Transformer like GPT
6. Recommend next movies

Step 1: Ratings Data
user → movie → rating

Step 2: Keep only liked movies
rating >= 4

Step 3: Convert user history into sequences
User A → [m1, m2, m3, m4]

Step 4: Create GPT-style training pairs
[m1] → m2
[m1,m2] → m3
[m1,m2,m3] → m4

Step 5: Pad to fixed length
[m1,m2] → [m1,m2,0,0,...]

Step 6: Feed batches into Transformer

Shape:

(batch_size, seq_len)
= (256, 20)

Step 7: Transformer learns attention patterns
People who watched X then Y usually watch Z next

Step 8: Recommend next movies
Top-K predicted movie IDs → Titles

'''

'Summary:\n1. Netflix ratings → filter liked movies\n2. Group into user sequences\n3. Break sequences into (input → next movie) pairs\n4. Pad sequences\n5. Train Transformer like GPT\n6. Recommend next movies\n\nStep 1: Ratings Data\nuser → movie → rating\n\nStep 2: Keep only liked movies\nrating >= 4\n\nStep 3: Convert user history into sequences\nUser A → [m1, m2, m3, m4]\n\nStep 4: Create GPT-style training pairs\n[m1] → m2\n[m1,m2] → m3\n[m1,m2,m3] → m4\n\nStep 5: Pad to fixed length\n[m1,m2] → [m1,m2,0,0,...]\n\nStep 6: Feed batches into Transformer\n\nShape:\n\n(batch_size, seq_len)\n= (256, 20)\n\nStep 7: Transformer learns attention patterns\nPeople who watched X then Y usually watch Z next\n\nStep 8: Recommend next movies\nTop-K predicted movie IDs → Titles\n\n'

In [None]:
import kagglehub
import pandas as pd
import os

# Download the Netflix Prize dataset via kagglehub and keep the returned location
# in `path`; later cells join file names onto it.
path = kagglehub.dataset_download(
    "netflix-inc/netflix-prize-data"
)
print("Dataset downloaded at:", path)

Using Colab cache for faster access to the 'netflix-prize-data' dataset.
Dataset downloaded at: /kaggle/input/netflix-prize-data


In [None]:
# Parses movie_titles.csv into a [movie_id, title] dataframe
def load_movie_titles(filename):
    """Read a ``movie_id,year,title`` text file into a DataFrame.

    Each line is split on its first two commas only, so commas that are part
    of the title stay in the title. Lines that do not produce exactly three
    fields are skipped, and the year field is discarded.

    Args:
        filename: Path of the file to read (decoded as latin1).

    Returns:
        DataFrame with columns ``movie_id`` (int) and ``title`` (str).
    """
    rows = []

    with open(filename, "r", encoding="latin1") as handle:
        for raw in handle:
            fields = raw.strip().split(",", 2)

            # Anything that is not id,year,title is ignored
            if len(fields) != 3:
                continue

            rows.append([int(fields[0]), fields[2]])

    return pd.DataFrame(rows, columns=["movie_id", "title"])


# Locate the titles file inside the downloaded dataset folder and parse it
movie_file = os.path.join(path, "movie_titles.csv")
print("\nLoading movie titles...")
df_movies = load_movie_titles(movie_file)

# Quick sanity check: row count and first rows
print("Movies loaded:", len(df_movies))
print(df_movies.head())


Loading movie titles...
Movies loaded: 17770
   movie_id                         title
0         1               Dinosaur Planet
1         2    Isle of Man TT 2004 Review
2         3                     Character
3         4  Paula Abdul's Get Up & Dance
4         5      The Rise and Fall of ECW


In [None]:
def load_ratings(filename, limit=None):
    """Parse a Netflix Prize ``combined_data_*.txt`` style file.

    The file alternates between header lines of the form ``<movie_id>:`` and
    rating lines of the form ``<user_id>,<rating>,<date>``. Every rating line
    is attributed to the most recent header. The date field is discarded.

    Args:
        filename: Path of the ratings file.
        limit: Stop after this many rating rows. ``None`` (or 0) reads the
            whole file.

    Returns:
        DataFrame with columns ``movie_id``, ``user_id`` and ``rating``.

    Raises:
        ValueError: If a non-blank line is neither a header nor a
            three-field rating line.
    """
    data = []
    movie_id = None

    with open(filename, "r") as f:
        for line in f:
            line = line.strip()

            # Blank lines carry no data; unpacking them below would raise
            if not line:
                continue

            # Movie header
            if line.endswith(":"):
                movie_id = int(line[:-1])
                continue

            # Rating line
            user_id, rating, _date = line.split(",")
            data.append([movie_id, int(user_id), int(rating)])

            # Stop early if limit reached
            if limit and len(data) >= limit:
                break

    return pd.DataFrame(data, columns=["movie_id", "user_id", "rating"])


# Only one of the combined_data files is used, capped at two million rating rows
ratings_file = os.path.join(path, "combined_data_4.txt")
print("\nLoading ratings...")

df_ratings = load_ratings(ratings_file, limit=2000000)

# Quick sanity check: row count and first rows
print("Ratings loaded:", len(df_ratings))
print(df_ratings.head())




Loading ratings...
Ratings loaded: 2000000
   movie_id  user_id  rating
0     13368  2385003       4
1     13368   659432       3
2     13368   751812       2
3     13368  2625420       2
4     13368  1650301       1


In [None]:
print("\nMerging ratings with titles...")

# Left join keeps every rating row even if its movie has no title entry,
# then the columns are put into a fixed order.
df_final = df_ratings.merge(df_movies, how="left", on="movie_id")[
    ["movie_id", "title", "user_id", "rating"]
]

print("\nFinal dataset preview:")
print(df_final.head())



Merging ratings with titles...

Final dataset preview:
   movie_id      title  user_id  rating
0     13368  Sarfarosh  2385003       4
1     13368  Sarfarosh   659432       3
2     13368  Sarfarosh   751812       2
3     13368  Sarfarosh  2625420       2
4     13368  Sarfarosh  1650301       1


In [None]:

# Persist the merged ratings so later cells can reload them from disk
output_file = "netflix_final.csv"
df_final.to_csv(path_or_buf=output_file, index=False)

print("\nSaved successfully!")
print("File created:", output_file)
print("Total rows:", len(df_final))



Saved successfully!
File created: netflix_final.csv
Total rows: 2000000


In [None]:
import pandas as pd

# Reload the merged ratings written by the previous cell
df = pd.read_csv("netflix_final.csv")

# Headline counts
print("Total Ratings:", len(df))
print("Unique Users:", df.loc[:, "user_id"].nunique())
print("Unique Movies:", df.loc[:, "movie_id"].nunique())

# How many ratings fall on each star value
print("\nRating Distribution:")
print(df.loc[:, "rating"].value_counts())


Total Ratings: 2000000
Unique Users: 363102
Unique Movies: 306

Rating Distribution:
rating
4    695448
3    571644
5    465648
2    186831
1     80429
Name: count, dtype: int64


In [None]:
# Keep only the movies a user liked (rating of 4 or 5).
# .copy() makes the result an independent frame, so assigning columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[df["rating"] >= 4].copy()
print("After filtering:", len(df))


After filtering: 1161096


In [None]:
# Tokenization: every movie becomes an integer token.
# Tokens start at 1 because 0 is reserved for padding (the dataset pads inputs
# with 0 and the embedding layer uses padding_idx=0). Plain category codes
# start at 0, which would make the first movie indistinguishable from padding.
# assign() builds a new frame instead of writing into a filtered slice.
df = df.assign(movie_id=df["movie_id"].astype("category").cat.codes + 1)
movie_count = df["movie_id"].nunique()

print("Total unique movies:", movie_count)


Total unique movies: 306


In [None]:
# Build one watch sequence per user (the "sentences" the model is trained on).
# NOTE(review): the frame carries no rating date, so the order of movies inside
# a sequence is not chronological - confirm this is acceptable.
df = df.sort_values("user_id")

# Map every user to the list of movie tokens they liked
user_sequences = df.groupby("user_id")["movie_id"].apply(list)

# Keep users with at least 5 liked movies
user_sequences = user_sequences[user_sequences.map(len) >= 5]

print("Total users:", len(user_sequences))
print("Example sequence:", user_sequences.iloc[9][:10])


Total users: 83785
Example sequence: [2, 246, 298, 45, 74]


In [None]:
import torch
from torch.utils.data import Dataset

class MovieSequenceDataset(Dataset):
    """Next-item prediction samples built from per-user movie sequences.

    Every sequence is cut to its last ``max_len`` items, and each proper
    prefix of that window becomes one sample: the prefix is the input and the
    item right after it is the target.
    """

    def __init__(self, sequences, max_len=20):
        self.max_len = max_len
        # (prefix, next item) for every cut point of every truncated sequence
        self.samples = [
            (window[:cut], window[cut])
            for window in (history[-max_len:] for history in sequences)
            for cut in range(1, len(window))
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        prefix, next_item = self.samples[idx]

        # Right-pad with zeros up to max_len, e.g. [10, 25] with max_len 5
        # becomes [10, 25, 0, 0, 0].
        missing = self.max_len - len(prefix)
        padded = prefix + [0] * missing

        return torch.tensor(padded), torch.tensor(next_item)


In [None]:
from torch.utils.data import DataLoader

# Expand every user sequence into (prefix -> next movie) samples
dataset = MovieSequenceDataset(sequences=user_sequences.tolist(), max_len=20)

# Shuffled mini-batches of 256 samples
loader = DataLoader(dataset, shuffle=True, batch_size=256)

print("Total training samples:", len(dataset))


Total training samples: 632214


In [None]:
import torch.nn as nn

class TransformerRecommender(nn.Module):
    """Transformer encoder that scores every movie token as the next item.

    Token 0 is treated as padding: it is excluded from attention and the
    prediction is read from the last non-padding position of each row.

    Args:
        num_movies: Number of distinct movie tokens (padding not counted).
        embed_dim: Size of the token embeddings / model width.
        heads: Number of attention heads.
    """

    def __init__(self, num_movies, embed_dim=128, heads=4):
        super().__init__()

        # Movie tokens -> dense vectors; row 0 is the padding vector
        self.embedding = nn.Embedding(num_movies + 1, embed_dim, padding_idx=0)

        # Self-attention block
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=heads,
            batch_first=True
        )

        # Nested-tensor conversion is disabled so the output keeps the padded
        # (batch, seq_len, embed_dim) layout that forward() indexes into.
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2,
            enable_nested_tensor=False
        )

        self.fc = nn.Linear(embed_dim, num_movies + 1)

    def forward(self, x):
        """Return next-item logits of shape (batch, num_movies + 1).

        Args:
            x: Integer tensor of shape (batch, seq_len) holding movie tokens,
                with 0 used as padding.
        """
        # True where the position is padding and must not be attended to
        pad_mask = x.eq(0)
        # Always leave the first position visible: a fully masked row would
        # turn the attention softmax into NaN (e.g. for an empty history).
        pad_mask[:, 0] = False

        hidden = self.embedding(x)
        hidden = self.transformer(hidden, src_key_padding_mask=pad_mask)

        # Index of the last visible (non-padding) position in every row.
        # Reading x[:, -1] instead would pick a padding slot whenever the
        # input is shorter than seq_len.
        positions = torch.arange(x.size(1), device=x.device)
        last_idx = (positions * (~pad_mask).long()).max(dim=1).values
        rows = torch.arange(x.size(0), device=x.device)
        out = hidden[rows, last_idx]

        return self.fc(out)


In [None]:
#training
import torch.optim as optim
from tqdm import tqdm

# Train on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerRecommender(movie_count).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Next-movie prediction is a classification over all movie tokens
loss_fn = nn.CrossEntropyLoss()

EPOCHS = 3

for epoch in range(EPOCHS):
    # Make sure dropout is active even if another cell switched the model
    # to eval mode before this loop is (re-)run.
    model.train()
    total_loss = 0

    for x, y in tqdm(loader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()

        preds = model(x)

        loss = loss_fn(preds, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss over the epoch
    print(f"Epoch {epoch+1}, Loss = {total_loss/len(loader):.4f}")


100%|██████████| 2470/2470 [00:51<00:00, 47.68it/s]


Epoch 1, Loss = 3.9402


100%|██████████| 2470/2470 [00:51<00:00, 48.27it/s]


Epoch 2, Loss = 3.8621


100%|██████████| 2470/2470 [00:51<00:00, 47.91it/s]

Epoch 3, Loss = 3.8430





In [None]:
def recommend(user_seq, top_k=5, max_len=20):
    """Return the ``top_k`` highest-scoring movie tokens for a watch history.

    Uses the module-level ``model`` and ``device``.

    Args:
        user_seq: Sequence of movie tokens the user watched.
        top_k: Number of tokens to return.
        max_len: Input length fed to the model; only the last ``max_len``
            tokens are used and shorter histories are right-padded with 0.

    Returns:
        List of ``top_k`` movie tokens, best first (a list even for top_k=1).
    """
    # Keep the most recent max_len movies and pad to a fixed length
    user_seq = list(user_seq)[-max_len:]
    padded = user_seq + [0] * (max_len - len(user_seq))

    # Convert to tensor and add the batch dimension
    x = torch.tensor(padded, dtype=torch.long).unsqueeze(0).to(device)

    # Run inference in eval mode so dropout does not randomise the scores,
    # then put the model back into whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            preds = model(x)  # scores for every movie token
    finally:
        model.train(was_training)

    # squeeze(0) drops only the batch dimension, so top_k=1 still gives a list
    return torch.topk(preds, top_k, dim=-1).indices.squeeze(0).tolist()


In [None]:
# Reverse mapping: movie token -> title
id_to_title = dict(zip(df["movie_id"], df["title"]))

# Pick one user; the id and the sequence are taken from the same position
# so that they describe the same user.
example_pos = 10000
example_user = user_sequences.index[example_pos]
seq = user_sequences.iloc[example_pos]

print("User watched:", [id_to_title[m] for m in seq[:5]])

# Recommendations
recs = recommend(seq)

print("\nRecommended Movies:")
for mid in recs:
    # Tokens without a title (e.g. a padding token) are skipped
    if mid in id_to_title:
        print("-", id_to_title[mid])


User watched: ['Kramer vs. Kramer', 'Alice in Wonderland', 'Terms of Endearment', 'The World According to Garp', 'Life Is Beautiful']

Recommended Movies:
- Billy Elliot
- A Fish Called Wanda
- Air Force One
- Fast Times at Ridgemont High
- Bull Durham


In [None]:
# Lookup tables between titles and movie tokens; when a key occurs on several
# rows, the first row wins.
title_to_id = (
    df.drop_duplicates(subset="title")
    .set_index("title")
    .loc[:, "movie_id"]
    .to_dict()
)
id_to_title = (
    df.drop_duplicates(subset="movie_id")
    .set_index("movie_id")
    .loc[:, "title"]
    .to_dict()
)


In [None]:
def recommend_from_titles(movie_titles, top_k=5):
    """Print recommendations for a list of movie titles.

    Titles are translated to tokens through ``title_to_id``; unknown titles
    are reported and left out. If none of the titles is known, nothing else
    is printed. Nothing is returned.

    Args:
        movie_titles: Iterable of title strings.
        top_k: Number of recommendations requested from ``recommend``.
    """
    known_ids = []
    for name in movie_titles:
        try:
            known_ids.append(title_to_id[name])
        except KeyError:
            print("Movie not found:", name)

    if not known_ids:
        return

    suggestions = recommend(known_ids, top_k)

    # Echo every title that was passed in, found or not
    print("\n You Watched:")
    for name in movie_titles:
        print("-", name)

    # Only tokens that have a title are shown
    print("\n Recommended Movies :-)")
    for shown in (id_to_title[tok] for tok in suggestions if tok in id_to_title):
        print("-", shown)


In [None]:
#for title in sorted(df_movies["title"].unique()):
 #   print(title)


import pandas as pd

# Read only the title column, under its own name: reassigning `df` here would
# replace the filtered, tokenized frame that the title/id mappings were built
# from, and re-running those cells afterwards would silently use raw ids.
title_catalog = pd.read_csv("netflix_final.csv", usecols=["title"])

# Missing titles are dropped; NaN mixed with strings cannot be sorted
titles = sorted(title_catalog["title"].dropna().unique())

print("Total unique movies:", len(titles))

for t in titles:
    print(t)


Total unique movies: 306
2 Days in the Valley
7 Faces of Dr. Lao
A Fish Called Wanda
A Kiss Before Dying
A Perfect World
A Thousand Clouds of Peace
A Touch of Frost: Season 4
A Woman Called Moses
After School Specials: 1974-1976
Air Force One
Alice
Alice in Wonderland
Allosaurus: A Walking with Dinosaurs Special
American Family: Season 1
An American Tail: The Mystery of the Night Monster
Andy Griffith Show: Classic Favorites
Angel of the Night
Antonio Gaudi
Army
Around the Bend
Article 99
As Time Goes By: Series 9
Attila
B.A.P.S.
Babylon 5: A Call to Arms
Bad Boys II: Bonus Material
Bade Miyan Chote Miyan
Bandit Queen
Bandits
Barney: Best Manners
Barney: Now I Know My ABCs
Bat Thumb
Batman the Animated Series: Secrets of the Caped Crusader
Battle Royale
Bear in the Big Blue House Live!
Beast Cops
Bells Are Ringing
Berga: Soldiers of Another War
Best Motoring: Rotary Reborn
Beyond the Stars
Bikini Bandits Experience
Billy Elliot
Blackwoods
Blue Streak
Blue's Clues: Blue's Big Band
Bob t

In [None]:
# Hand-picked watch history; titles must match the dataset's spelling exactly
my_movies = [
    "WWE: No Mercy 2002",
    "Toy Story",
    "The Mosquito Coast",
    "The Pajama Game",
    "The Nanny: Season 1",
]

# Print the five best next-movie suggestions for that history
recommend_from_titles(movie_titles=my_movies, top_k=5)



 You Watched:
- WWE: No Mercy 2002
- Toy Story
- The Mosquito Coast
- The Pajama Game
- The Nanny: Season 1

 Recommended Movies :-)
- Air Force One
- True Lies
- Office Space
- Alice in Wonderland
- The Count of Monte Cristo
