# Movie Recommendation

In [1]:
import ast
from collections import Counter
import numpy as np
import pandas as pd
import re
import string
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

## Review Dataset Cleaning

In [2]:
# Read the reviews CSV and discard the "Unnamed: 0" column (an index saved alongside the data)
review_df = pd.read_csv("MovieReviews.csv").drop(columns=["Unnamed: 0"])
# Distinct movie names, in order of first appearance; these drive the per-movie cleaning loop
movies = review_df["Movie Name"].unique()
display(review_df.loc[:20])

Unnamed: 0,usernames,ratings,reviews,usefulness_rating,Movie Name
0,kelanwm,6,"Original plot, they had a marvellous setting a...",0.0,The Menu
1,steveyleeginger,9,No review,-1.0,The Menu
2,karsata,8,The movie perfectly portrays the high end rest...,0.0,The Menu
3,garyview,-1,"This clever film employs the ""Reductio ad absu...",0.0,The Menu
4,helloamandamay,7,This mind twisting dark comedy will keeping yo...,0.0,The Menu
5,NatIPat37,6,"Trailers gave us a promise of the big mistery,...",-1.0,The Menu
6,gulliskywalker,-1,Summary is bebal go people going to an island ...,0.0,The Menu
7,Screen_Rants,8,"Strange movie. This movie is about snobbish, b...",0.0,The Menu
8,aubreygranger-62164,7,No review,-1.0,The Menu
9,eyadelgarhyy,8,I loved the concept and I will absolutely rewa...,0.0,The Menu


In [3]:
# Now we'll clean the data: per movie, count how often each word appears, split into
# all-lowercase words ("low priority") and everything else ("high priority").

# The dataset marks missing reviews with the text "No review"; compare case-insensitively
# so the placeholder is recognised regardless of how it was capitalised.
NO_REVIEW_PLACEHOLDER = "no review"
# Translation table that deletes ASCII punctuation; built once and reused for every review.
PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _lowercase_sentence_starts(review):
    """Lowercase letters that are capitalized only because they start a sentence.

    The first character of the review is lowercased, as is any uppercase character
    that directly follows the two-character sequence ". " (period, single space).
    All other capitalization is kept, since it is later used as a sentiment signal.

    Parameters
    ----------
    review : str
        Non-empty review text.

    Returns
    -------
    str
        The review with sentence-initial capitals lowercased.
    """
    chars = list(review)
    chars[0] = chars[0].lower()
    # state: 0 = nothing seen, 1 = just saw ".", 2 = just saw ". "
    state = 0
    for position, char in enumerate(chars):
        if char == "." and state == 0:
            state = 1
        elif char == " " and state == 1:
            state = 2
        elif state == 2 and char.isupper():
            chars[position] = char.lower()
            state = 0
        else:
            state = 0
    return "".join(chars)


low_counts = {}
high_counts = {}

# loop through movie reviews to process them
for movie in movies:
    lowercase = []
    titlecase = []
    for review in review_df.loc[review_df["Movie Name"] == movie, "reviews"]:
        # Skip missing values (NaN is a float) and blank strings, which carry no words
        if not isinstance(review, str) or not review.strip():
            continue
        # Skip the dataset's "No review" placeholder so it is not counted as real text
        if review.strip().lower() == NO_REVIEW_PLACEHOLDER:
            continue

        # Make always-capitalized letters lowercase as we will use extra capitalization to interpret sentiment
        review = _lowercase_sentence_starts(review)

        # Remove punctuation and emojis/special characters
        review = review.translate(PUNCTUATION_TABLE)
        for word in review.split(" "):
            word = "".join(ch for ch in word if ord(ch) < 128)  # keep ASCII characters only
            if word.islower():
                lowercase.append(word)
            elif word:
                # Anything non-empty that is not purely lowercase (capitals, digits-only tokens)
                titlecase.append(word)

    # Get counts for words
    low_counts[movie] = dict(Counter(lowercase))
    high_counts[movie] = dict(Counter(titlecase))


# Make DataFrames for lowercase and uppercase words: one row per movie plus a "Totals" row,
# with word columns ordered from most to least frequent overall
low_priority = pd.DataFrame(low_counts)
low_priority["Totals"] = low_priority.sum(axis=1)
low_priority = low_priority.sort_values("Totals", ascending=False).T
high_priority = pd.DataFrame(high_counts)
high_priority["Totals"] = high_priority.sum(axis=1)
high_priority = high_priority.sort_values("Totals", ascending=False).T

print(low_priority[["the", "movie", "satire", "cat", "superhero", "imperceptible"]])
print(high_priority[["I", "Great", "Disney", "Horror", "Fiennes", "Marvel", "Dreamworks"]])

                  the   movie  satire   cat  superhero  imperceptible
The Menu       3252.0   640.0    68.0   1.0        NaN            NaN
Antman         3465.0   774.0     NaN   NaN       20.0            NaN
Puss In Boots  3157.0   872.0     NaN  54.0        NaN            1.0
Totals         9874.0  2286.0    68.0  55.0       20.0            1.0
                    I  Great  Disney  Horror  Fiennes  Marvel  Dreamworks
The Menu        571.0    2.0     3.0     2.0     76.0     1.0         NaN
Antman          648.0    4.0    36.0     NaN      NaN   144.0         NaN
Puss In Boots   642.0    3.0    29.0     NaN      NaN     NaN        49.0
Totals         1861.0    9.0    68.0     2.0     76.0   145.0        49.0


## Movie Dataset Cleaning

In [4]:
# Next get the Movie dataset, dropping the saved index column ("Unnamed: 0") and the unused "votes" column
movies_df = pd.read_csv("Top250.csv").drop(columns=["Unnamed: 0", "votes"])
movies_df.head(10)

Unnamed: 0,title,year,certificate,time,genre,rating,metascore,simple_desc,directors,actors
0,1.\nThe Shawshank Redemption\n(1994),(1994),R,142 min,Drama,9.3,82 \n Metascore,"Over the course of several years, two convicts...",Frank Darabont,"['Tim Robbins', 'Morgan Freeman']"
1,2.\nThe Godfather\n(1972),(1972),R,175 min,"Crime, Drama",9.2,100 \n Metascore,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino']"
2,3.\nThe Dark Knight\n(2008),(2008),PG-13,152 min,"Action, Crime, Drama",9.0,84 \n Metascore,When the menace known as the Joker wreaks havo...,Christopher Nolan,"['Christian Bale', 'Heath Ledger']"
3,4.\nThe Lord of the Rings: The Return of the K...,(2003),PG-13,201 min,"Action, Adventure, Drama",9.0,94 \n Metascore,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"['Elijah Wood', 'Viggo Mortensen']"
4,5.\nSchindler's List\n(1993),(1993),R,195 min,"Biography, Drama, History",9.0,95 \n Metascore,"In German-occupied Poland during World War II,...",Oskar Schindler,"['Steven Spielberg', 'Liam Neeson']"
5,6.\nThe Godfather Part II\n(1974),(1974),R,202 min,"Crime, Drama",9.0,90 \n Metascore,The early life and career of Vito Corleone in ...,Francis Ford Coppola,"['Al Pacino', 'Robert De Niro']"
6,7.\n12 Angry Men\n(1957),(1957),Approved,96 min,"Crime, Drama",9.0,97 \n Metascore,The jury in a New York City murder trial is fr...,Sidney Lumet,"['Henry Fonda', 'Lee J. Cobb']"
7,8.\nPulp Fiction\n(1994),(1994),R,154 min,"Crime, Drama",8.9,95 \n Metascore,"The lives of two mob hitmen, a boxer, a gangst...",Quentin Tarantino,"['John Travolta', 'Uma Thurman']"
8,9.\nThe Lord of the Rings: The Fellowship of t...,(2001),PG-13,178 min,"Action, Adventure, Drama",8.8,92 \n Metascore,A meek Hobbit from the Shire and eight compani...,Peter Jackson,"['Elijah Wood', 'Ian McKellen']"
9,10.\nFight Club\n(1999),(1999),R,139 min,Drama,8.8,66 \n Metascore,An insomniac office worker and a devil-may-car...,David Fincher,"['Brad Pitt', 'Edward Norton']"


In [5]:
# Clean up formatting within columns.
# NOTE: this cell rewrites movies_df column by column, so it only works once per load
# (a second run fails because the cleaned titles no longer contain "\n").

# Raw titles look like "1.\nThe Shawshank Redemption\n(1994)"; keep the middle line
movies_df["title"] = [i.split("\n")[1] for i in movies_df["title"]]
# Keep only the digits inside parentheses. Using search + a capture group (instead of
# match(".*...") and slicing off the outer characters) means a value with leading text
# before the year, e.g. "(I) (2004)", yields "2004" rather than "I) (2004".
# The raw string also avoids the invalid "\(" escape warning.
movies_df["year"] = [re.search(r"\(([0-9]+)\)", i)[1] for i in movies_df["year"]]
# "142 min" -> 142
movies_df["time"] = [int(i[:i.index(" ")]) for i in movies_df["time"]]
# "Crime, Drama" -> ["Crime", "Drama"]
movies_df["genre"] = movies_df["genre"].str.split(", ")
# "82 \n Metascore" -> 82
movies_df["metascore"] = [int(i[:i.index(" ")]) for i in movies_df["metascore"]]
movies_df["directors"] = movies_df["directors"].str.split(", ")
# Actors are stored as the repr of a Python list; literal_eval parses it without executing code
movies_df["actors"] = movies_df["actors"].apply(ast.literal_eval)
movies_df.iloc[:10]

Unnamed: 0,title,year,certificate,time,genre,rating,metascore,simple_desc,directors,actors
0,The Shawshank Redemption,1994,R,142,[Drama],9.3,82,"Over the course of several years, two convicts...",[Frank Darabont],"[Tim Robbins, Morgan Freeman]"
1,The Godfather,1972,R,175,"[Crime, Drama]",9.2,100,The aging patriarch of an organized crime dyna...,[Francis Ford Coppola],"[Marlon Brando, Al Pacino]"
2,The Dark Knight,2008,PG-13,152,"[Action, Crime, Drama]",9.0,84,When the menace known as the Joker wreaks havo...,[Christopher Nolan],"[Christian Bale, Heath Ledger]"
3,The Lord of the Rings: The Return of the King,2003,PG-13,201,"[Action, Adventure, Drama]",9.0,94,Gandalf and Aragorn lead the World of Men agai...,[Peter Jackson],"[Elijah Wood, Viggo Mortensen]"
4,Schindler's List,1993,R,195,"[Biography, Drama, History]",9.0,95,"In German-occupied Poland during World War II,...",[Oskar Schindler],"[Steven Spielberg, Liam Neeson]"
5,The Godfather Part II,1974,R,202,"[Crime, Drama]",9.0,90,The early life and career of Vito Corleone in ...,[Francis Ford Coppola],"[Al Pacino, Robert De Niro]"
6,12 Angry Men,1957,Approved,96,"[Crime, Drama]",9.0,97,The jury in a New York City murder trial is fr...,[Sidney Lumet],"[Henry Fonda, Lee J. Cobb]"
7,Pulp Fiction,1994,R,154,"[Crime, Drama]",8.9,95,"The lives of two mob hitmen, a boxer, a gangst...",[Quentin Tarantino],"[John Travolta, Uma Thurman]"
8,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,178,"[Action, Adventure, Drama]",8.8,92,A meek Hobbit from the Shire and eight compani...,[Peter Jackson],"[Elijah Wood, Ian McKellen]"
9,Fight Club,1999,R,139,[Drama],8.8,66,An insomniac office worker and a devil-may-car...,[David Fincher],"[Brad Pitt, Edward Norton]"


## Naive Recommender System

In [6]:
# Baseline for comparison: score every ordered pair of films by how many genres,
# directors and actors of the first film also appear in the second.

# Positions inside each itertuples() row (position 0 is the DataFrame index)
GENRE_POS, DIRECTORS_POS, ACTORS_POS = 5, 9, 10

film_rows = list(movies_df.itertuples())
naive_edges = np.zeros((len(movies_df), len(movies_df)))
for row_idx, source_film in enumerate(film_rows):
    for col_idx, target_film in enumerate(film_rows):
        if row_idx == col_idx:
            continue  # leave the diagonal at 0 so a film never recommends itself
        naive_edges[row_idx, col_idx] = sum(
            sum(item in target_film[pos] for item in source_film[pos])
            for pos in (GENRE_POS, DIRECTORS_POS, ACTORS_POS)
        )

# Generate similarities DataFrame, labelled by title on both axes
film_titles = list(movies_df["title"])
similarities = pd.DataFrame(naive_edges, columns=film_titles, index=film_titles)
# For each film, look up the title at the position of its highest score
most_similar_naive = pd.DataFrame(
    [movies_df["title"][similarities[film].argmax()] for film in movies_df["title"]],
    index=movies_df["title"],
    columns=["Most Similar Film"],
)
with pd.option_context('display.max_colwidth', None, 'display.max_rows', 250):
    display(similarities.iloc[:10, :10])

Unnamed: 0,The Shawshank Redemption,The Godfather,The Dark Knight,The Lord of the Rings: The Return of the King,Schindler's List,The Godfather Part II,12 Angry Men,Pulp Fiction,The Lord of the Rings: The Fellowship of the Ring,Fight Club
The Shawshank Redemption,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
The Godfather,1.0,0.0,2.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0
The Dark Knight,1.0,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
The Lord of the Rings: The Return of the King,1.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,5.0,1.0
Schindler's List,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
The Godfather Part II,1.0,4.0,2.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0
12 Angry Men,1.0,2.0,2.0,1.0,1.0,2.0,0.0,2.0,1.0,1.0
Pulp Fiction,1.0,2.0,2.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0
The Lord of the Rings: The Fellowship of the Ring,1.0,1.0,2.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0
Fight Club,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Some of these are great:

In [7]:
# Films for which the naive score picks a sensible nearest neighbour
good_examples = [
    "The Godfather",
    "Before Sunrise",
    "Batman Begins",
    "The Dark Knight",
    "The Lord of the Rings: The Return of the King",
    "Inception",
    "Interstellar",
    "Star Wars",
    "Terminator 2: Judgment Day",
    "Spirited Away",
    "City Lights",
    "Alien",
    "Rear Window",
    "Indiana Jones and the Last Crusade",
    "Avengers: Infinity War",
    "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb",
]
most_similar_naive.loc[good_examples]

Unnamed: 0_level_0,Most Similar Film
title,Unnamed: 1_level_1
The Godfather,The Godfather Part II
Before Sunrise,Before Sunset
Batman Begins,The Dark Knight
The Dark Knight,Batman Begins
The Lord of the Rings: The Return of the King,The Lord of the Rings: The Fellowship of the Ring
Inception,Interstellar
Interstellar,Inception
Star Wars,Star Wars: Episode V - The Empire Strikes Back
Terminator 2: Judgment Day,The Terminator
Spirited Away,Howl's Moving Castle


### Some are not:

In [8]:
# Films for which the naive score picks an unconvincing nearest neighbour
poor_examples = [
    "Hamilton",
    "Witness for the Prosecution",
    "The Wages of Fear",
    "Rebecca",
    "Trainspotting",
    "City of God",
    "The Lion King",
    "Pan's Labyrinth",
]
most_similar_naive.loc[poor_examples]

Unnamed: 0_level_0,Most Similar Film
title,Unnamed: 1_level_1
Hamilton,Schindler's List
Witness for the Prosecution,Se7en
The Wages of Fear,The Lord of the Rings: The Return of the King
Rebecca,Se7en
Trainspotting,The Shawshank Redemption
City of God,The Godfather
The Lion King,The Lord of the Rings: The Return of the King
Pan's Labyrinth,Saving Private Ryan


## ML Beginnings

### Select features

In [9]:
# Generate feature list, one boolean column for each genre/director/actor


def _sorted_vocabulary(list_column):
    """Return the sorted distinct items found across a column of lists.

    A plain set is used instead of pd.unique, which is deprecated for list input
    in recent pandas releases.
    """
    return sorted({item for entries in list_column for item in entries})


def _one_hot(list_column, vocabulary):
    """Build a boolean DataFrame with one row per film and one column per vocabulary item.

    A cell is True when that film's list contains the item.
    """
    rows = []
    for entries in list_column:
        present = set(entries)  # set membership instead of scanning the list per item
        rows.append([item in present for item in vocabulary])
    return pd.DataFrame(rows, columns=vocabulary)


genre_list = _sorted_vocabulary(movies_df["genre"])
director_list = _sorted_vocabulary(movies_df["directors"])
actor_list = _sorted_vocabulary(movies_df["actors"])
genres = _one_hot(movies_df["genre"], genre_list)
directors = _one_hot(movies_df["directors"], director_list)
actors = _one_hot(movies_df["actors"], actor_list)
# Column order is genres, then directors, then actors; later cells rely on genres coming first
features = pd.concat([genres, directors, actors], axis=1)
features.index = movies_df["title"]
features

Unnamed: 0_level_0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,...,Woody Harrelson,Yoo Ji-tae,Yordanos Shiferaw,Yuriko Ishida,Yutaka Sada,Yves Montand,Yôji Matsuda,Zain Al Rafeea,Zendaya,Éric Toledano
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Shawshank Redemption,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
The Godfather,False,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
The Dark Knight,True,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
The Lord of the Rings: The Return of the King,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Schindler's List,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Incredibles,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Dances with Wolves,False,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Aladdin,False,True,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Gandhi,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Create Dataset and Model

In [10]:
# Create our custom Dataset class
class MovieDataset(Dataset):
    """Pairs each row of a feature tensor with its label for use with a DataLoader.

    Parameters
    ----------
    features_tensor : indexable
        Per-film feature vectors; item ``i`` is the input for sample ``i``.
    labels : indexable
        Per-film targets, the same length as ``features_tensor``.

    Raises
    ------
    ValueError
        If ``features_tensor`` and ``labels`` have different lengths.
    """

    def __init__(self, features_tensor, labels):
        # Fail fast: a mismatch would otherwise only surface as an IndexError mid-epoch
        # or as silently ignored trailing features.
        if len(features_tensor) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features_tensor)} and {len(labels)}"
            )
        self.features = features_tensor
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [11]:
# Create our custom model
class Model(nn.Module):
    """Fully connected classifier: Linear -> ReLU blocks followed by a final Linear layer.

    The output is one raw score (logit) per class, which is the input
    ``nn.CrossEntropyLoss`` expects.

    Parameters
    ----------
    in_features : int, default 527
        Length of each input feature vector.
    hidden_sizes : sequence of int, default (256, 128)
        Width of each hidden layer, in order.
    num_classes : int, default 11
        Number of output scores.
    """

    def __init__(self, in_features=527, hidden_sizes=(256, 128), num_classes=11):
        super().__init__()
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.ReLU())
            width = hidden
        layers.append(nn.Linear(width, num_classes))
        # With the defaults this is the same three-linear-layer stack as before,
        # with identical layer indices (0, 2, 4) inside the Sequential.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run a batch of feature vectors through the stack and return the class scores."""
        return self.linear_relu_stack(x)

### Train Model

In [12]:
# Now create variables needed for iterative training

# Seed PyTorch's RNG so weight initialisation and batch shuffling are repeatable
# across a fresh Restart & Run All.
torch.manual_seed(0)

net = Model()  # Create model instance
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Convert the boolean feature table to a float32 tensor
features = torch.tensor(np.array(features), dtype=torch.float32)

# Train for a user who loves drama movies and hates everything else:
# label 10 for films whose genre list contains "Drama", 0 otherwise
labels = torch.tensor(["Drama" in film for film in movies_df["genre"]]).to(torch.int64) * 10

# Train and test sets, 0.8 train/0.2 test (split by position, not at random)
train_size = int(len(features) * 0.8)
train_dataset = MovieDataset(features[:train_size], labels[:train_size])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataset = MovieDataset(features[train_size:], labels[train_size:])
# Evaluation does not benefit from shuffling; keep a fixed order so results are comparable
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [13]:
# Training the model
net.train()  # make sure the network is in training mode
for epoch in range(100):
    # Each batch comes from MovieDataset.__getitem__ via the DataLoader.
    # The batch targets get their own name so the notebook-level `labels` tensor
    # is not overwritten by the last mini-batch.
    for inputs, batch_labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = net(inputs)  # Run our inputs through the model
        loss = criterion(outputs, batch_labels)  # Check our model's outputs against the labels

        # Update the model
        loss.backward()
        optimizer.step()

print('Finished Training')

Finished Training


### Sample Logical Tests

In [14]:
# Synthetic film whose only active feature is the Drama genre.
# Genre columns come first in the feature matrix, in genre_list order, so look the
# column up by name rather than hard-coding its position.
drama_idx = genre_list.index("Drama")
drama_movie = torch.zeros((1, features.shape[1]))
drama_movie[0, drama_idx] = 1
with torch.no_grad():  # inference only; no gradients needed
    drama_rating = int(torch.argmax(net(drama_movie)))
print("Rating for Drama movie is", drama_rating)

Rating for Drama movie is 10


In [15]:
# Synthetic film whose only active feature is the Crime genre.
# Genre columns come first in the feature matrix, in genre_list order, so look the
# column up by name rather than hard-coding its position.
crime_idx = genre_list.index("Crime")
crime_movie = torch.zeros((1, features.shape[1]))
crime_movie[0, crime_idx] = 1
with torch.no_grad():  # inference only; no gradients needed
    crime_rating = int(torch.argmax(net(crime_movie)))
print("Rating for Crime movie is", crime_rating)

Rating for Crime movie is 0


In [16]:
# Synthetic film tagged with both Crime and Drama and nothing else.
# Genre columns come first in the feature matrix, in genre_list order, so look the
# columns up by name rather than hard-coding their positions.
crime_drama_movie = torch.zeros((1, features.shape[1]))
for genre_name in ("Crime", "Drama"):
    crime_drama_movie[0, genre_list.index(genre_name)] = 1
with torch.no_grad():  # inference only; no gradients needed
    crime_drama_rating = int(torch.argmax(net(crime_drama_movie)))
print("Rating for Crime Drama movie is", crime_drama_rating)

Rating for Crime Drama movie is 10


In [17]:
# Synthetic film with every feature switched on except the Drama genre.
# Genre columns come first in the feature matrix, in genre_list order, so look the
# column up by name rather than hard-coding its position.
everything_but_drama_movie = torch.ones((1, features.shape[1]))
everything_but_drama_movie[0, genre_list.index("Drama")] = 0
with torch.no_grad():  # inference only; no gradients needed
    everything_but_drama_rating = int(torch.argmax(net(everything_but_drama_movie)))
print("Rating for movie with every feature except for Drama is", everything_but_drama_rating)

Rating for movie with every feature except for Drama is 0
