# Movie Recommendation
By Reagan Orth

In [2]:
import ast
from collections import Counter
import itertools
import json
import numpy as np
import pandas as pd
import re
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader, LinkNeighborLoader
from torch_geometric.nn import GCNConv, SAGEConv
import torch_geometric.transforms as T

## Movie Dataset Cleaning

In [9]:
# Next get the Movie dataset
movies_df = pd.read_csv("Top1000.csv")
movies_df.drop(columns=["Unnamed: 0", "votes"], inplace=True)  # Remove external indexing column
movies_df.iloc[:10]

Unnamed: 0,title,year,certificate,time,genre,rating,metascore,simple_desc,directors,actors
0,1.\nThe Shawshank Redemption\n(1994),(1994),R,142 min,Drama,9.3,82 \n Metascore,"Over the course of several years, two convicts...",Frank Darabont,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'..."
1,2.\nThe Godfather\n(1972),(1972),R,175 min,"Crime, Drama",9.2,100 \n Metascore,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '..."
2,3.\nThe Dark Knight\n(2008),(2008),PG-13,152 min,"Action, Crime, Drama",9.0,84 \n Metascore,When the menace known as the Joker wreaks havo...,Christopher Nolan,"['Christian Bale', 'Heath Ledger', 'Aaron Eckh..."
3,4.\nSchindler's List\n(1993),(1993),R,195 min,"Biography, Drama, History",9.0,95 \n Metascore,"In German-occupied Poland during World War II,...",Oskar Schindler,"['Steven Spielberg', 'Liam Neeson', 'Ralph Fie..."
4,5.\nThe Lord of the Rings: The Return of the K...,(2003),PG-13,201 min,"Action, Adventure, Drama",9.0,94 \n Metascore,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"['Elijah Wood', 'Viggo Mortensen', 'Ian McKell..."
5,6.\n12 Angry Men\n(1957),(1957),Approved,96 min,"Crime, Drama",9.0,97 \n Metascore,The jury in a New York City murder trial is fr...,Sidney Lumet,"['Henry Fonda', 'Lee J. Cobb', 'Martin Balsam'..."
6,7.\nThe Godfather Part II\n(1974),(1974),R,202 min,"Crime, Drama",9.0,90 \n Metascore,The early life and career of Vito Corleone in ...,Francis Ford Coppola,"['Al Pacino', 'Robert De Niro', 'Robert Duvall..."
7,8.\nPulp Fiction\n(1994),(1994),R,154 min,"Crime, Drama",8.9,95 \n Metascore,"The lives of two mob hitmen, a boxer, a gangst...",Quentin Tarantino,"['John Travolta', 'Uma Thurman', 'Samuel L. Ja..."
8,9.\nInception\n(2010),(2010),PG-13,148 min,"Action, Adventure, Sci-Fi",8.8,74 \n Metascore,A thief who steals corporate secrets through t...,Christopher Nolan,"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ..."
9,10.\nThe Lord of the Rings: The Fellowship of ...,(2001),PG-13,178 min,"Action, Adventure, Drama",8.8,92 \n Metascore,A meek Hobbit from the Shire and eight compani...,Peter Jackson,"['Elijah Wood', 'Ian McKellen', 'Orlando Bloom..."


In [10]:
# Clean up formatting within columns
movies_df["title"] = [i.split("\n")[1] for i in movies_df["title"]]
movies_df["year"] = [re.match(".*\([0-9]*\)", i)[0][1:-1][-4:] for i in movies_df["year"]]
movies_df["year"] = movies_df["year"].astype(int)
movies_df["decade"] = movies_df["year"] // 10 * 10
movies_df["time"] = [int(i[:i.index(" ")]) for i in movies_df["time"]]
movies_df["genre"] = movies_df["genre"].str.split(", ")
movies_df["metascore"] = [int(i[:i.index(" ")]) for i in movies_df["metascore"]]
movies_df["directors"] = movies_df["directors"].str.split(", ")
movies_df["actors"] = movies_df["actors"].apply(ast.literal_eval)
movies_df.drop_duplicates(subset=["title"], inplace=True)
movies_df.index = [i for i in range(len(movies_df))]
movies_df.iloc[:10]

Unnamed: 0,title,year,certificate,time,genre,rating,metascore,simple_desc,directors,actors,decade
0,The Shawshank Redemption,1994,R,142,[Drama],9.3,82,"Over the course of several years, two convicts...",[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",1990
1,The Godfather,1972,R,175,"[Crime, Drama]",9.2,100,The aging patriarch of an organized crime dyna...,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Diane K...",1970
2,The Dark Knight,2008,PG-13,152,"[Action, Crime, Drama]",9.0,84,When the menace known as the Joker wreaks havo...,[Christopher Nolan],"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",2000
3,Schindler's List,1993,R,195,"[Biography, Drama, History]",9.0,95,"In German-occupied Poland during World War II,...",[Oskar Schindler],"[Steven Spielberg, Liam Neeson, Ralph Fiennes,...",1990
4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"[Action, Adventure, Drama]",9.0,94,Gandalf and Aragorn lead the World of Men agai...,[Peter Jackson],"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",2000
5,12 Angry Men,1957,Approved,96,"[Crime, Drama]",9.0,97,The jury in a New York City murder trial is fr...,[Sidney Lumet],"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",1950
6,The Godfather Part II,1974,R,202,"[Crime, Drama]",9.0,90,The early life and career of Vito Corleone in ...,[Francis Ford Coppola],"[Al Pacino, Robert De Niro, Robert Duvall, Dia...",1970
7,Pulp Fiction,1994,R,154,"[Crime, Drama]",8.9,95,"The lives of two mob hitmen, a boxer, a gangst...",[Quentin Tarantino],"[John Travolta, Uma Thurman, Samuel L. Jackson...",1990
8,Inception,2010,PG-13,148,"[Action, Adventure, Sci-Fi]",8.8,74,A thief who steals corporate secrets through t...,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",2010
9,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,178,"[Action, Adventure, Drama]",8.8,92,A meek Hobbit from the Shire and eight compani...,[Peter Jackson],"[Elijah Wood, Ian McKellen, Orlando Bloom, Sea...",2000


## Naive Recommender System

In [4]:
# First generate a very simple recommendation system based on a score system, for comparison

naive_edges = np.zeros((len(movies_df), len(movies_df)))
for i, film1 in enumerate(movies_df.itertuples()):
    for j, film2 in enumerate(movies_df.itertuples()):
        if i == j:
            continue
            
        score = 0
        for genre in film1[5]:  # 5 indicates the Genre list
            if genre in film2[5]:
                score += 1
        for director in film1[9]:  # 9 is the Directors index
            if director in film2[9]:
                score += 1
        for actor in film1[10]:  # 10 is the Actors index
            if actor in film2[10]:
                score += 1
        
        naive_edges[i, j] = score
        
# Generate similarities DataFrame
similarities = pd.DataFrame(naive_edges, columns=list(movies_df["title"]), index=list(movies_df["title"]))
most_similar_naive = pd.DataFrame([movies_df["title"][similarities[film].argmax()] for film in movies_df["title"]], index=movies_df["title"], columns=["Most Similar Film"])
with pd.option_context('display.max_colwidth', None, 'display.max_rows', 250):
    display(similarities.iloc[:10, :10])

Unnamed: 0,The Shawshank Redemption,The Godfather,The Dark Knight,Schindler's List,The Lord of the Rings: The Return of the King,12 Angry Men,The Godfather Part II,Pulp Fiction,Inception,The Lord of the Rings: The Fellowship of the Ring
The Shawshank Redemption,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
The Godfather,1.0,0.0,2.0,1.0,1.0,2.0,5.0,2.0,0.0,1.0
The Dark Knight,1.0,2.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
Schindler's List,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
The Lord of the Rings: The Return of the King,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,2.0,7.0
12 Angry Men,1.0,2.0,2.0,1.0,1.0,0.0,2.0,2.0,0.0,1.0
The Godfather Part II,1.0,5.0,2.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0
Pulp Fiction,1.0,2.0,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0
Inception,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
The Lord of the Rings: The Fellowship of the Ring,1.0,1.0,2.0,1.0,7.0,1.0,1.0,1.0,2.0,0.0


### Some of these are great:

In [5]:
most_similar_naive.T[["The Godfather", "Before Sunrise", "Batman Begins", "The Dark Knight", "The Lord of the Rings: The Return of the King", "Inception", "Interstellar", "Star Wars", "Terminator 2: Judgment Day", "Spirited Away", "City Lights", "Alien", "Rear Window", "Indiana Jones and the Last Crusade", "Avengers: Infinity War", "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb"]].T

Unnamed: 0_level_0,Most Similar Film
title,Unnamed: 1_level_1
The Godfather,The Godfather Part II
Before Sunrise,Before Sunset
Batman Begins,The Dark Knight
The Dark Knight,Batman Begins
The Lord of the Rings: The Return of the King,The Lord of the Rings: The Two Towers
Inception,Interstellar
Interstellar,The Martian
Star Wars,Star Wars: Episode V - The Empire Strikes Back
Terminator 2: Judgment Day,The Terminator
Spirited Away,Howl's Moving Castle


### Some are not:

In [6]:
most_similar_naive.T[["Hamilton", "Witness for the Prosecution", "The Wages of Fear", "Rebecca", "Trainspotting", "City of God", "The Lion King", "Pan's Labyrinth"]].T

Unnamed: 0_level_0,Most Similar Film
title,Unnamed: 1_level_1
Hamilton,Schindler's List
Witness for the Prosecution,Se7en
The Wages of Fear,Z
Rebecca,Strangers on a Train
Trainspotting,Big Fish
City of God,The Godfather
The Lion King,Song of the Sea
Pan's Labyrinth,The Green Mile


## ML Beginnings

### Select features

In [19]:
# Generate feature list, one for each genre/director/actor
genre_list = sorted(pd.unique([genre for film in movies_df["genre"] for genre in film]))
director_list = sorted(pd.unique([person for film in movies_df["directors"] for person in film]))
actor_list = sorted(pd.unique([person for film in movies_df["actors"] for person in film]))
decades = sorted(pd.unique(movies_df["decade"]))

genres = pd.DataFrame([[genre in film for genre in genre_list] for film in movies_df["genre"]], columns=genre_list)
directors = pd.DataFrame([[director in film for director in director_list] for film in movies_df["directors"]], columns=director_list)
actors = pd.DataFrame([[actor in film for actor in actor_list] for film in movies_df["actors"]], columns=actor_list)
years = pd.DataFrame([movies_df["decade"] == decade for decade in decades], index=decades).T
features_df = pd.concat([years, genres, directors, actors], axis=1)
features_df.index = movies_df["title"]
features_df.iloc[27:40, [4, 8, 20, 24, 90, 110, 177, 203]]

Unnamed: 0_level_0,1960,2000,Film-Noir,Musical,Charles Chaplin,Danis Tanovic,George Seaton,Irvin Kershner
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Life Is Beautiful,False,False,False,False,False,False,False,False
Seven Samurai,False,False,False,False,False,False,False,False
It's a Wonderful Life,False,False,False,False,False,False,False,False
Harakiri,True,False,False,False,False,False,False,False
Gladiator,False,True,False,False,False,False,False,False
Parasite,False,False,False,False,False,False,False,False
Whiplash,False,False,False,False,False,False,False,False
The Departed,False,True,False,False,False,False,False,False
Back to the Future,False,False,False,False,False,False,False,False
The Prestige,False,True,False,False,False,False,False,False


### Create Dataset and Model

In [20]:
# Create our custom Dataset class
class MovieDataset(Dataset):
    def __init__(self, features_tensor, labels):
        self.features = features_tensor
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [21]:
# Create our custom model
class Model(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(  # simple three linear layer network
            nn.Linear(channels, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 11),
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)  # run through our Sequential pipeline
        return x

### Train Model

In [22]:
# Now create variables needed for iterative training

net = Model(len(features_df.keys()))  # Create model instance
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Convert to tensor
features = torch.tensor(np.array(features_df)).to(torch.float32)

# Train for a user who loves drama movies and hates everything else
labels = torch.tensor(["Drama" in film for film in movies_df["genre"]]).to(torch.int64) * 10

# Train and test sets, 0.8 train/0.2 test
train_size = int(len(features) * 0.8)
train_dataset = MovieDataset(features[:train_size], labels[:train_size])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataset = MovieDataset(features[train_size:], labels[train_size:])
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [23]:
# Training the model
for epoch in range(100):
    for i, data in enumerate(train_loader, 0):
        # basic training loop
        inputs, labels = data  # Note that this pulls from our dataset using MovieDataset's __getitem__()
        optimizer.zero_grad()
        outputs = net(inputs)  # Run our inputs through the model
        loss = criterion(outputs, labels)  # Check our model's outputs against the labels
        
        # Update the model
        loss.backward()
        optimizer.step()

print('Finished Training')

Finished Training


### Iterative Testing

In [16]:
correct = 0
total = 0
for i, data in enumerate(test_loader):
    inputs, labels = data
    outputs = torch.argmax(net(inputs), axis=1)
    correct += (outputs == labels).sum()
    total += len(labels)
print("Total correctness: {:.2f} ({}/{})".format(float(correct/total), correct, total))

Total correctness: 1.00 (168/168)


# GNN Beginnings
Initially just importing everything into GCN format, going to make movie graph next time

In [24]:
edge_index = torch.tensor(list(itertools.product([i for i in range(len(movies_df))], [i for i in range(len(movies_df))]))).T
labels = torch.tensor(["Drama" in film for film in movies_df["genre"]]).to(torch.int64) * 10
train_mask = torch.tensor([True for i in range(train_size)] + [False for i in range(features.shape[0] - train_size)], dtype=torch.bool)
test_mask = np.logical_not(train_mask).to(torch.bool)
data = Data(x=features.to(torch.float32), y=labels, edge_index=edge_index, train_mask=train_mask, test_mask=test_mask)  # x is the data, y is labels
loader = DataLoader(data, batch_size=32, shuffle=True)

### GNN Model Class

In [25]:
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, 11)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

Use GCN just to make sure it's all working properly, this isn't the final purpose

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [29]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.6964


### Building the Graph Attempt 1

In [21]:
class GNN(torch.nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = SAGEConv(channels, channels)
        self.conv2 = SAGEConv(channels, channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return torch.matmul(x, x.T)
    
    
def custom_loss(data, pred):
    pred = (pred >= 0.5).to(torch.float32)
    edges = (pred == 1).nonzero(as_tuple=True)
    loss_tensor = (data[edges[0]] != data[edges[1]]).float().requires_grad_(True)
    # Add in penalty for not predicting any edges
    return loss_tensor.sum()

In [22]:
model = GNN(channels=len(features_df.keys()))
model = model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

GNN(
  (conv1): SAGEConv(2809, 2809, aggr=mean)
  (conv2): SAGEConv(2809, 2809, aggr=mean)
)


In [23]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=0.0,
    add_negative_train_samples=False,
)

train_data, val_data, test_data = transform(data)
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[3],
    batch_size=len(train_data),
    shuffle=True,
    replace=True
)

In [25]:
import tqdm

for epoch in range(0, 4):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data.to(device)
        pred = model(sampled_data.x, sampled_data.edge_index)
        loss = custom_loss(sampled_data.x, pred)
        loss.backward()
        optimizer.step()
        
        total_loss += float(loss)
        total_examples += sampled_data.num_nodes
    print("Loss for epoch {}: {}".format(epoch, total_loss/total_examples))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56180/56180 [1:33:48<00:00,  9.98it/s]
  0%|                                                                                                                                                                           | 1/56180 [00:00<1:45:31,  8.87it/s]

Loss for epoch 0: 789.8611999623183


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56180/56180 [1:30:06<00:00, 10.39it/s]
  0%|                                                                                                                                                                           | 1/56180 [00:00<2:12:36,  7.06it/s]

Loss for epoch 1: 789.7026387634742


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56180/56180 [1:23:06<00:00, 11.27it/s]
  0%|                                                                                                                                                                           | 1/56180 [00:00<1:34:19,  9.93it/s]

Loss for epoch 2: 789.46312851767


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56180/56180 [1:14:02<00:00, 12.65it/s]

Loss for epoch 3: 789.8017143404946





In [26]:
all_edges = torch.tensor([[i, j] for i in range(data.num_nodes) for j in range(data.num_nodes)]).T
output = model(data.x, all_edges)
for i in range(len(output)):
    output[i, i] = 0

output_probs, output_links = torch.topk(output, 5)
for i, l in enumerate(output_links):
    print(features_df.iloc[i].name)
    for j in l:
        print("\t->{}".format(features_df.iloc[int(j)].name))

The Shawshank Redemption
	->Se7en
	->The Green Mile
	->Short Cuts
	->Unforgiven
	->The Fugitive
The Godfather
	->The Godfather Part II
	->Apocalypse Now
	->Serpico
	->Dog Day Afternoon
	->The Conversation
The Dark Knight
	->Batman Begins
	->3:10 to Yuma
	->The Prestige
	->Children of Men
	->Man on Fire
Schindler's List
	->Tombstone
	->Malcolm X
	->Hacksaw Ridge
	->Frost/Nixon
	->Gandhi
The Lord of the Rings: The Return of the King
	->The Lord of the Rings: The Two Towers
	->The Lord of the Rings: The Fellowship of the Ring
	->Hero
	->Crouching Tiger, Hidden Dragon
	->Into the Wild
12 Angry Men
	->On the Waterfront
	->Elevator to the Gallows
	->The Killing
	->The Asphalt Jungle
	->Strangers on a Train
The Godfather Part II
	->The Godfather
	->Apocalypse Now
	->Serpico
	->Heat
	->The Irishman
Pulp Fiction
	->Kill Bill: Vol. 1
	->The Hateful Eight
	->The Sixth Sense
	->True Romance
	->Kill Bill: Vol. 2
Inception
	->Interstellar
	->Mad Max: Fury Road
	->Star Trek Into Darkness
	->Rogue One

	->Sabrina
	->The Bridge on the River Kwai
	->Sunset Blvd.
	->Jojo Rabbit
	->No Man's Land
Out of the Past
	->Mildred Pierce
	->Double Indemnity
	->Rope
	->The Night of the Hunter
	->The Maltese Falcon
Brief Encounter
	->The Shop Around the Corner
	->Notorious
	->Rope
	->The Fault in Our Stars
	->Doctor Zhivago
Spring, Summer, Fall, Winter... and Spring
	->3-Iron
	->Slumdog Millionaire
	->Eternal Sunshine of the Spotless Mind
	->The Butterfly Effect
	->Match Point
Nobody Knows
	->No Country for Old Men
	->The Butterfly Effect
	->Into the Wild
	->Dancer in the Dark
	->In Bruges
Persepolis
	->Into the Wild
	->Dancer in the Dark
	->Finding Neverland
	->Ip Man
	->The Wind Rises
Black Cat, White Cat
	->True Romance
	->Lock, Stock and Two Smoking Barrels
	->Underground
	->Rushmore
	->As Good as It Gets
The Sea Inside
	->Into the Wild
	->No Country for Old Men
	->Finding Neverland
	->Frost/Nixon
	->The Last King of Scotland
Departures
	->Almost Famous
	->Dancer in the Dark
	->Into the Wild
	-

	->The Taking of Pelham One Two Three
	->Le Cercle Rouge
	->Kill Bill: Vol. 2
	->The Raid: Redemption
	->End of Watch
The Fault in Our Stars
	->The Broken Circle Breakdown
	->A Star Is Born
	->The Lunchbox
	->Her
	->The Handmaiden
The Trial of the Chicago 7
	->All the President's Men
	->The Butterfly Effect
	->No Country for Old Men
	->Frost/Nixon
	->Hacksaw Ridge
Clerks
	->Groundhog Day
	->Mulan
	->Underground
	->Toy Story 2
	->The Big Lebowski
Glengarry Glen Ross
	->The Girl with the Dragon Tattoo
	->Se7en
	->Primal Fear
	->Heat
	->L.A. Confidential
When Harry Met Sally...
	->Flipped
	->Modern Times
	->City Lights
	->After Hours
	->The Graduate
3:10 to Yuma
	->The Dark Knight
	->Batman Begins
	->Man on Fire
	->Dancer in the Dark
	->Training Day
The Color Purple
	->E.T.
	->Empire of the Sun
	->The Fly
	->After Hours
	->Into the Wild
Star Trek Into Darkness
	->Star Trek
	->Star Wars: Episode VII - The Force Awakens
	->Mad Max: Fury Road
	->Captain America: The Winter Soldier
	->Edge of

In [27]:
lines = "{'1681335863852': {'Username': 'Prayash', 'Title': 'The Godfather', 'Rating': '9', 'Review': 'asfdafsd'}, '1681335868552': {'Username': 'Prayash', 'Title': 'Movie 2', 'Rating': '3', 'Review': 'asdfasfdasdfasdf'}, '1681335874235': {'Username': 'Prayash', 'Title': '12 Angry Men', 'Rating': '10', 'Review': 'asfasdadsasdfasdf'}}".replace("'", "\"")

def get_recs(lines):
    lines = json.loads(lines)
    user_movies = pd.DataFrame(lines).T
    movie_names = user_movies[user_movies["Rating"].astype(float) >= 8]["Title"]  # get the names of the highly rated movies
    movie_index = np.flatnonzero(features_df.index.isin(movie_names))  # get the indices of highly rated movies
    rec_movies = pd.DataFrame([output_links[movie_index].flatten(), output_probs[movie_index].flatten()]).T
    rec_movies.sort_values(1, ascending=False, inplace=True)
    similar_films = list(np.unique(features_df.iloc[rec_movies[0][:3]].index))  # get names
    return similar_films
    
get_recs(lines)

['Apocalypse Now', 'Serpico', 'The Godfather Part II']

In [30]:
pd.DataFrame(torch.concat([output_links.T, output_probs.T]).detach().numpy().T, index=features_df.index).to_csv("SimilarityMatrices.csv")

NameError: name 'output_links' is not defined

In [31]:
df = pd.read_csv("SimilarityIndices.csv")
movie_list = df["title"]
links = df[["0", "1", "2", "3", "4"]].values
df = pd.read_csv("SimilarityProbs.csv").values
movie_list

0                           The Shawshank Redemption
1                                      The Godfather
2                                    The Dark Knight
3                                   Schindler's List
4      The Lord of the Rings: The Return of the King
                           ...                      
833                                          Control
834                              Eyes Without a Face
835                                            Shine
836                                The Invisible Man
837                                   The Odd Couple
Name: title, Length: 838, dtype: object

In [32]:
input_movies = ["The Treasure of the Sierra Madre", "Dirty Harry", "The Fugitive", "The Godfather"]
sim_mat = pd.read_csv("FullMatrix.csv").set_index("title")

In [33]:
lines = "{'1681335863852': {'Username': 'Prayash', 'Title': 'The Godfather', 'Rating': '9', 'Review': 'asfdafsd'}, '1681335868552': {'Username': 'Prayash', 'Title': 'The Treasure of the Sierra Madre', 'Rating': '10', 'Review': 'asdfasfdasdfasdf'}, '1681335874235': {'Username': 'Prayash', 'Title': 'The Fugitive', 'Rating': '10', 'Review': 'asfasdadsasdfasdf'}}".replace("'", "\"")

def get_recs(lines, genre="Crime"):
    lines = json.loads(lines)
    user_movies = pd.DataFrame(lines).T
    sim_mat = pd.read_csv("FullMatrix.csv").set_index("title")
    movie_names = user_movies[user_movies["Rating"].astype(float) >= 8][
        "Title"]  # get the names of the highly rated movies
    movie_names = [movie for movie in movie_names if movie in sim_mat.index]
    rec_movies = []

    # For first recommendation, take most similar to all inputs by  mean
    # For now, random from top len(great_movies)
    indices = np.argsort(np.mean(sim_mat[movie_names].values, axis=1))[-len(movie_names):]
    poss_movies = sim_mat.index[indices]
    rec_movies.append(poss_movies[np.random.randint(len(poss_movies))])

    # For the second recommendation, take random movie in five most similar to each one they like
    poss_movies = [film_name for i in sim_mat[movie_names].T.values for film_name in sim_mat.index[np.argsort(i)[-5:]]]
    poss_movies = [movie for movie in poss_movies if movie not in movie_names]
    rec_movies.append(poss_movies[np.random.randint(len(poss_movies))])

    # For third recommendation, take highest 2 single similarity to each film given genre
    genre_titles = movies_df[[genre in movie_genres for movie_genres in movies_df["genre"]]]["title"]
    genre_titles = np.unique([title for title in genre_titles if title in sim_mat] + movie_names)
    genre_mat = sim_mat[genre_titles].T[genre_titles]
    indices = np.argsort(genre_mat[movie_names].values, axis=0)[-2:].T
    poss_movies = [film_name for i, s in enumerate(indices) for film_name in genre_mat[movie_names[i]].iloc[s].index]
    poss_movies = [movie for movie in poss_movies if movie not in movie_names]
    rec_movies.append(poss_movies[np.random.randint(len(poss_movies))])

    return rec_movies
    
get_recs(lines)

['Children of Men', 'The Maltese Falcon', 'Serpico']