# Creation and Pre-processing of the dataset

In [20]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Firstly, we create a database containing a large variety of films with various features. There are therefore two steps needed: gathering a list of film titles, and making API calls to gather the desired features for each title. There are various APIs available for this purpose.

To build the dataframe of film titles, we found that Wikipedia conveniently has pages in the format
https://en.wikipedia.org/wiki/List_of_{country}_films_of_{year}.

For example, for British films in 2022:
"This article lists feature-length British films and full-length documentaries that had their premiere in 2022 and were at least partly produced by the United Kingdom. It does not feature short films, medium-length films, made-for-TV films, pornographic films, filmed theater, VR films or interactive films, nor does it include films screened in previous years that have official release dates in 2022."

All the films in these articles are set out in tables and there is no bias towards only including well performing films.

We make the assumption that British and American audiences and trends have a large amount of crossover, and choose to create a database containing all films released between 2000 and 2022 that were produced in the UK or the US.

In [21]:

def _trim_titles(titles, start, end):
    """Return `titles` without its first `start` and last `end` entries."""
    trimmed = titles[start:]
    # Clamp the stop index at zero: a negative value would wrap around and
    # keep entries that are supposed to be discarded on very short pages.
    return trimmed[:max(len(trimmed) - end, 0)]


def films_years_country(country, years_from, year_to, start, end):
    """Scrape film titles from Wikipedia's per-year film lists for one country.

    For every year in ``years_from..year_to`` (inclusive) the page
    ``List_of_{country}_films_of_{year}`` is downloaded, the text of the first
    link in each table row is collected, and the first ``start`` / last ``end``
    collected entries are discarded (presumably navigation and footer links
    rather than films -- the offsets are chosen by the caller).

    Parameters
    ----------
    country : str
        Country adjective as used in the Wikipedia URL, e.g. ``"British"``.
    years_from, year_to : int
        First and last year to scrape (both inclusive).
    start, end : int
        Number of leading / trailing link texts to drop on every page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped title. The same frame is also written to
        ``MovieTitles_{years_from}-{year_to}_{country}.csv`` in the working
        directory.
    """
    filmdata = []

    for year in range(years_from, year_to + 1):
        url = f"https://en.wikipedia.org/wiki/List_of_{country}_films_of_{year}"

        # A timeout stops one stalled request from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # Do not parse an error page as if it were a table of films.
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Text of the first link in every table row that has one.
        first_links = (row.find('a') for row in soup.find_all('tr'))
        movie_titles = [link.text for link in first_links if link]

        movie_titles = _trim_titles(movie_titles, start, end)
        print(movie_titles)
        filmdata.extend(movie_titles)

    df = pd.DataFrame(filmdata)
    df.to_csv(f"MovieTitles_{years_from}-{year_to}_{country}.csv", index=False)
    return df


# Scrape both countries for 2000-2022; the start/end offsets differ per
# country and set how many leading/trailing link texts are discarded per page.
df1 = films_years_country(
    country="British", years_from=2000, year_to=2022, start=19, end=35
)
df2 = films_years_country(
    country="American", years_from=2000, year_to=2022, start=29, end=30
)

# Stack the two title lists row-wise (row-wise is pd.concat's default axis)
# and persist the combined list.
df = pd.concat([df1, df2])
df.to_csv("Data/MovieTitles_British_American_2000_2022.csv", index=False)

['Aberdeen', 'The Asylum', 'The Beach', 'Beautiful Joe', 'Best', 'Billy Elliot', 'Blood', 'Borstal Boy', 'Bread and Roses', 'Breathtaking', 'Chicken Run', 'Chocolat', 'The Claim', 'Complicity', 'Essex Boys', 'Esther Kahn', 'Five Seconds to Spare', 'Gangster No. 1', 'Gladiator', 'The Golden Bowl', 'Greenfingers', 'Honest', 'Hotel Splendide', 'House!', "How to Kill Your Neighbor's Dog", 'Kevin & Perry Go Large', 'Kiss Kiss (Bang Bang)', 'Liam', 'Love, Honour and Obey', "Love's Labour's Lost", 'Mad About Mambo', 'The Man Who Cried', 'Maybe Baby', 'Monarch', 'No Maps for These Territories', 'Nora', 'One of the Hollywood Ten', 'Out of Depth', 'Pandaemonium', 'Paranoid', 'Purely Belter', 'Quills', 'Relative Values', 'Sabotage!', 'Saving Grace', 'Sexy Beast', 'A Shot at Glory', 'Snatch', 'Some Voices', "There's Only One Jimmy Grimble", 'Thomas and the Magic Railroad', 'Up at the Villa', 'When the Sky Falls']
['The 51st State', 'Another Life', 'Back to the Secret Garden', "Beginner's Luck", 'B

In [31]:
# Reload the combined title list and show its first and last 20 rows.
df = pd.read_csv("Data/MovieTitles_British_American_2000_2022.csv")
print(df.head(20), df.tail(20), sep="\n")

                        0
0                Aberdeen
1              The Asylum
2               The Beach
3           Beautiful Joe
4                    Best
5            Billy Elliot
6                   Blood
7             Borstal Boy
8         Bread and Roses
9            Breathtaking
10            Chicken Run
11               Chocolat
12              The Claim
13             Complicity
14             Essex Boys
15            Esther Kahn
16  Five Seconds to Spare
17         Gangster No. 1
18              Gladiator
19        The Golden Bowl
                                                 0
9855                                  Emancipation
9856                                 Spoiler Alert
9857           Diary of a Wimpy Kid: Rodrick Rules
9858                            Darby and the Dead
9859                       Lady Chatterley's Lover
9860                          The Eternal Daughter
9861                           Matilda the Musical
9862                               Empire of L

It's unlikely that this is a complete list; however, we were unable to find any films it had missed. Apart from possibly sequels, we assume that it does not matter whether the list is complete or not.

The next step is to make API calls for each film in the list.

In [32]:
# Set the API endpoint URL
url = "http://www.omdbapi.com/"
# Prefer a key supplied through the OMDB_API_KEY environment variable so that
# credentials need not live in the notebook. The literal is kept only as a
# backward-compatible fallback (Tom's API key, limited to 100,000 requests per
# day). NOTE(review): consider rotating it and removing the fallback.
api_key = os.environ.get("OMDB_API_KEY", "86325b89")

titles = pd.read_csv("Data/MovieTitles_British_American_2000_2022.csv")["0"].to_list()

# Look up a single title as a smoke test of the API. Index 4933 is an arbitrary
# sample; the lookup returned 'Shrek 2' in the recorded run.
response = requests.get(
    url,
    params={
        "apikey": api_key,
        "t": titles[4933],
        # "y": year,
        "plot": "full",
        "type": "movie"
    },
    timeout=30,  # do not hang indefinitely on a stalled connection
)
data = response.json()
print(data)

{'Title': 'Shrek 2', 'Year': '2004', 'Rated': 'PG', 'Released': '19 May 2004', 'Runtime': '93 min', 'Genre': 'Animation, Adventure, Comedy', 'Director': 'Andrew Adamson, Kelly Asbury, Conrad Vernon', 'Writer': 'William Steig, Andrew Adamson, Joe Stillman', 'Actors': 'Mike Myers, Eddie Murphy, Cameron Diaz', 'Plot': "Shrek (Mike Myers) has rescued Princess Fiona (Cameron Diaz), got married, and now is time to meet the parents. Shrek, Fiona, and Donkey (Eddie Murphy) set off to Far, Far Away to meet Fiona's mother and father. But not everyone is happy. Shrek and King Harold (John Cleese) find it hard to get along, and there's tension in the marriage. It's not just the family who are unhappy. Prince Charming (Rupert Everett) returns from a failed attempt at rescuing Fiona, and works alongside his mother, the Fairy Godmother (Jennifer Saunders), to try and find a way to get Shrek away from Fiona.", 'Language': 'English', 'Country': 'United States', 'Awards': 'Nominated for 2 Oscars. 18 win

The API calls return useful data about each film. This is a lot of features which would be possible to use.
Since the project is focused on NLP, we choose to extract the title and the plot. We also extract the year, genre, and the IMDb Rating, which is going to be our target. There may be other features such as Runtime AgeRating Director ... which may be useful for our prediction model, however, we chose to limit the feature space and make the project more about how we process the title and plot features to get a good prediction.

In [None]:
# Set the API endpoint URL
url = "http://www.omdbapi.com/"
# Prefer a key supplied through the OMDB_API_KEY environment variable so that
# credentials need not live in the notebook. The literal is kept only as a
# backward-compatible fallback (Tom's API key, limited to 100,000 requests per
# day). NOTE(review): consider rotating it and removing the fallback.
api_key = os.environ.get("OMDB_API_KEY", "86325b89")

titles = pd.read_csv("Data/MovieTitles_British_American_2000_2022.csv")["0"].to_list()

movie_data = []
count = 0

for count, title in enumerate(titles, start=1):
    try:
        response = requests.get(
            url,
            params={
                "apikey": api_key,
                "t": title,
                # "y": year,
                "plot": "full",
                "type": "movie"
            },
            timeout=30,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as err:
        # One failed lookup (network error or non-JSON body) must not throw
        # away the results already collected; skip this title and carry on.
        print(f"Lookup failed for {title!r}: {err}")
        continue

    status = data.get("Response")

    # Progress report every 50 titles.
    if count % 50 == 0:
        print(f"{status}, {count} out of {len(titles)}")

    if status == 'True':
        # A field absent from the payload is stored as "N/A", the same marker
        # the API itself uses for unknown values.
        movie_data.append({
            "IMDbRating": data.get("imdbRating", "N/A"),
            "Title": data.get("Title", "N/A"),
            "Year": data.get("Year", "N/A"),
            "Genre": data.get("Genre", "N/A"),
            "Plot": data.get("Plot", "N/A"),
            "Actors": data.get("Actors", "N/A"),
        })

# Create a DataFrame from the movie data.
# The file name says 2000_2021 although the title list covers 2000-2022; it is
# left unchanged because the preprocessing cell reads this exact path.
df = pd.DataFrame(movie_data)
df.to_csv("Data/MovieDatabase_British_American_2000_2021.csv", index=False)

{'Title': 'Aberdeen', 'Year': '2000', 'Rated': 'TV-MA', 'Released': '08 Sep 2000', 'Runtime': '106 min', 'Genre': 'Drama', 'Director': 'Hans Petter Moland, Tony Spataro', 'Writer': 'Hans Petter Moland, Kristin Amundsen, Lars Bill Lundholm', 'Actors': 'Stellan Skarsgård, Lena Headey, Jean Johansson', 'Plot': "Kaisa is a Scot, a successful London lawyer, who snorts coke and has one-night stands with strangers. Her mother calls from Aberdeen with some story begging her to fly to Norway and collect her alcoholic dad whom she hasn't seen in years. The impatient Kaisa reluctantly agrees. The story is a ruse: mom is dying and wants her ex and her daughter together again. The trip gets complicated (dad is too drunk to fly and the hot-tempered Kaisa gets them banned from the airline): they go by ferry then car, needing a great deal of help along the way from a sweet lorry driver named Clive. Will they reach Aberdeen before mom's death, and will Kaisa find any stability within herself or in othe

Some of these API calls do inevitably get the incorrect movie. There might be two films which have the same name and our api lookup picks the wrong one, such as classical films with modern day remakes. We assume that this does not affect our overall predictions. Some films also fail with the API lookup and we lose these from our database but again, it does not seem essential for this project to have a perfect complete database.

Another issue is that some films have N/A for some of the features we wish to use, such as the plot. Rather than setting these to some default value, we choose to remove these films from the database.

There are 26 genres in total, so we one-hot encode them.

In [33]:
def RemoveNA(df):
    """Return a copy of `df` without any row that has a missing value.

    Cells holding the literal string "N/A" are treated as missing.
    """
    return df.replace("N/A", np.nan).dropna()


def RemoveDuplicates(df):
    """Return `df` with fully duplicated rows removed (first occurrence kept)."""
    return df.drop_duplicates()


def OHE_Genre(df):
    """One-hot encode the comma-separated ``Genre`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Genre`` such as ``"Drama, Comedy"``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Genre`` removed and one 0/1 column per distinct
        genre appended. The input frame is not modified.
    """
    # One genre token per row; explode keeps the film's index label so the
    # tokens can be regrouped per film below.
    genre_tokens = df['Genre'].str.split(',').explode().str.strip()

    # Collapse back to one row per film. The clip keeps the indicator binary
    # even if a genre is listed twice for the same film.
    genre_dummies = (
        pd.get_dummies(genre_tokens, dtype=int)
        .groupby(level=0)
        .sum()
        .clip(upper=1)
    )

    # Swap the original genre column for the indicator columns.
    return pd.concat([df.drop(columns='Genre'), genre_dummies], axis=1)

We now need to decide how to process the actors.
By logic, we think that an actor who has appeared in very few films will not be a good predictor of the rating of a new film. By setting the minimum number of films an actor has appeared in to 5, we reduce the number of actors in our database to 927, which we can then one-hot encode. It was difficult to decide which approach was best when dealing with the actors column. Dimensionality reduction methods such as PCA could work; however, one-hot encoding was manageable and, despite adding 927 columns to our database, will not affect model performance.

In [35]:
def OHE_actors(df, min_films=5):
    """One-hot encode the comma-separated ``Actors`` column for frequent actors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Actors`` such as ``"A, B, C"``.
    min_films : int, default 5
        An actor gets a column only if they appear in at least this many rows
        of `df`.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Actors`` removed and one 0/1 column per retained
        actor appended. The input frame is not modified.
    """
    # One actor name per row; explode keeps the film's index label so the
    # names can be regrouped per film below.
    actor_tokens = df['Actors'].str.split(',').explode().str.strip()

    # Collapse back to one row per film. The clip keeps the indicator binary
    # even if a name is listed twice for the same film.
    actor_dummies = (
        pd.get_dummies(actor_tokens, dtype=int)
        .groupby(level=0)
        .sum()
        .clip(upper=1)
    )

    # Number of films each actor appears in.
    actor_counts = actor_dummies.sum(axis=0)

    # Keep only actors with at least `min_films` appearances.
    frequent_actors = actor_counts[actor_counts >= min_films].index

    return pd.concat(
        [df.drop(columns='Actors'), actor_dummies[frequent_actors]], axis=1
    )

# Run the cleaning / encoding steps in order on the raw OMDb export.
df = (
    pd.read_csv("Data/MovieDatabase_British_American_2000_2021.csv")
    .pipe(RemoveDuplicates)
    .pipe(RemoveNA)
    .pipe(OHE_Genre)
    .pipe(OHE_actors)
)

# Keep only films whose Year is 2000 or later. This runs after the actor
# encoding, so actor appearance counts were computed before this filter.
is_this_century = df['Year'] >= 2000
df = df[is_this_century]

df.to_csv("Data/PreProcessedData.csv", index=False)

Since we have this database, let's see who the best and worst actors are among those who have appeared in 5 or more films this century.

In [None]:
# The preprocessing cell writes this file into Data/, so read it from there.
df = pd.read_csv("Data/PreProcessedData.csv")
targets = df['IMDbRating']
# Everything from column position 30 onward is treated as an actor indicator.
# NOTE(review): assumes exactly 30 non-actor columns precede them -- confirm
# against the preprocessed file if the genre set changes.
df = df.iloc[:, 30:]

actor_names = df.columns.tolist()
num_films = len(df)  # derived from the data rather than hard-coded
rating_data = []
for actor in actor_names:
    # Ratings of every film this actor is flagged in.
    actor_ratings = targets[df[actor] == 1]
    films = len(actor_ratings)
    if films == 0:
        # The year filter runs after actor encoding, so an actor can be left
        # with no films; skip instead of dividing by zero.
        continue
    rating_data.append({
        'Actor': actor,
        'AvgIMDb': round(actor_ratings.sum() / films, 2),
        'Num_Films': films,
        'Spacer': "---",
        'Lowest_Rated': actor_ratings.min(),
        'Highest_Rated': actor_ratings.max()})

# Explicit columns keep the sort below valid even if no actor qualified.
df = pd.DataFrame(
    rating_data,
    columns=['Actor', 'AvgIMDb', 'Num_Films', 'Spacer',
             'Lowest_Rated', 'Highest_Rated'],
)

# Sort ascending by average rating, then reverse so the best come first.
sorted_df = df.sort_values('AvgIMDb')
sorted_df = sorted_df.iloc[::-1]
sorted_df.to_csv('Actor_Rankings.csv', index=False)

print(sorted_df.head(20))
print(sorted_df.tail(20))