In [1]:
import pandas as pd
import time
import csv
import requests
import os
import requests

Functions for getting data from the TMDB API

In [7]:
import os
import csv
import time
import pandas as pd
import requests

def fetch_movie_data(tmdb_id, i, TMDB_API_KEY, timeout=10):
    '''
    Fetches movie review data from the TMDB API for a given tmdb_id.

    Parameters:
        tmdb_id: TMDB movie identifier, placed in the request path.
        i: running counter supplied by the caller; a short pause is
            inserted whenever it is a multiple of 39.
        TMDB_API_KEY: API key sent as the ``api_key`` query parameter.
        timeout: seconds to wait for the server before ``requests`` gives
            up and raises a timeout error. Without it a stalled connection
            would block the whole run indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked here, so whatever JSON the server sent is returned as-is.
    '''
    # rate limit: 40 requests every 10 seconds
    # NOTE(review): a 0.25 s pause on every 39th call does not by itself
    # guarantee that pacing -- confirm the limit that currently applies.
    if i % 39 == 0:
        time.sleep(0.25)

    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/reviews"
    # Let requests build (and URL-encode) the query string.
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}

    response = requests.get(url, params=params, timeout=timeout)
    return response.json()


def write_to_csv(row, filename="ml-latest/reviews.csv"):
    '''
    Adds ``row`` as one CSV record at the end of ``filename``.

    The file is opened in append mode for every call, so each record is
    on disk as soon as the call returns.
    '''
    # newline='' leaves line-ending handling to the csv module
    with open(filename, mode='a', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file).writerow(row)


def get_last_row_movie(filename="ml-latest/reviews.csv"):
    '''
    Returns the movieId stored in the last record of the csv file.

    The file is parsed with ``csv.reader`` instead of splitting the last
    physical line, so a record whose quoted field contains a line break is
    still read as one row.

    Returns 0 when the file is empty or holds a single non-numeric row
    (i.e. only the header), so a caller can treat it as "nothing processed
    yet".

    Raises:
        ValueError: if the file has more than one row and the first field
            of the last row is not an integer.
    '''
    last_row = None
    rows_seen = 0
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        # Stream the rows; only the final non-empty one is kept.
        for row in csv.reader(f):
            if row:
                last_row = row
                rows_seen += 1

    if last_row is None:
        return 0

    try:
        return int(last_row[0])
    except ValueError:
        # A lone non-numeric row is the header: no data has been written yet.
        if rows_seen == 1:
            return 0
        raise


def process_movie_data(data, TMDB_API_KEY, file_path="ml-latest/reviews.csv", max_reviews=5):
    '''
    Fetches TMDB reviews for every movie in ``data`` and appends them to a csv.
    Only movieId + reviews are stored.

    Parameters:
        data: DataFrame whose rows unpack positionally into exactly three
            values: movieId, imdbId, tmdbId (imdbId is not used).
        TMDB_API_KEY: API key passed through to ``fetch_movie_data``.
        file_path: csv to create or append to. If it already exists the run
            resumes: rows whose movieId is not greater than the last movieId
            in the file are skipped, so resuming relies on ``data`` being
            ordered by ascending movieId.
        max_reviews: maximum number of reviews kept per movie (default 5).

    Each output row is ``[movieId, reviews]`` where ``reviews`` is the
    cleaned review texts joined with "|". Rows are written one at a time so
    an interrupted run loses at most the movie in flight.
    '''

    # check if csv exists
    if os.path.exists(file_path):
        last_movie_id = get_last_row_movie(file_path)
        print(f"Resuming from movieId: {last_movie_id}")

    else:
        last_movie_id = 0
        # write header
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["movieId", "reviews"])
        print("Creating new reviews.csv file")

    # Counts requests actually sent, so the throttle inside fetch_movie_data
    # is driven by API calls rather than by rows that were skipped.
    request_count = 0

    for movieId, imdbId, tmdbId in data.itertuples(index=False):

        # skip already processed movies
        if movieId <= last_movie_id:
            continue

        # skip NaN tmdb id
        if pd.isna(tmdbId):
            continue

        tmdbId = int(tmdbId)

        # get review data
        review_data = fetch_movie_data(tmdbId, request_count, TMDB_API_KEY)
        request_count += 1

        # extract reviews; a response without "results" yields no reviews
        reviews_raw = review_data.get("results") or []
        reviews_texts = []

        for r in reviews_raw[:max_reviews]:  # keep at most max_reviews reviews
            # a missing or null "content" is treated as empty text
            content = r.get("content") or ""
            # Line breaks would split the record; "|" is the separator used
            # between reviews, so it must not occur inside a review.
            for ch in ("\r", "\n", "|"):
                content = content.replace(ch, " ")
            content = content.replace(",", "").replace("'", "")
            reviews_texts.append(content)

        reviews_joined = "|".join(reviews_texts)

        # write to csv
        row = [movieId, reviews_joined]
        write_to_csv(row, file_path)

        print(f"Saved movieId {movieId}")


# Get the extra data

IMPORTANT: A TMDB API key is required. It is free for non-commercial applications.


We have included the data we pulled, so you don't need to run this part: the data is already present.

In [8]:
# Read the TMDB API key from the environment instead of embedding it in the
# notebook, so the secret is not shared or committed along with the file.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable before running this cell."
    )

# Load the movie id table and fetch/append reviews for each movie in it.
movies_df = pd.read_csv("data/links_action.csv")
process_movie_data(movies_df, TMDB_API_KEY=TMDB_API_KEY, file_path="data/reviews_final_one_AA.csv")

Creating new reviews.csv file
Saved movieId 6
Saved movieId 9
Saved movieId 10
Saved movieId 15
Saved movieId 20
Saved movieId 23
Saved movieId 42
Saved movieId 44
Saved movieId 51
Saved movieId 66
Saved movieId 70
Saved movieId 71
Saved movieId 76
Saved movieId 78
Saved movieId 86
Saved movieId 89
Saved movieId 95
Saved movieId 98
Saved movieId 110
Saved movieId 112
Saved movieId 139
Saved movieId 145
Saved movieId 151
Saved movieId 153
Saved movieId 160
Saved movieId 163
Saved movieId 165
Saved movieId 168
Saved movieId 170
Saved movieId 172
Saved movieId 173
Saved movieId 181
Saved movieId 185
Saved movieId 198
Saved movieId 204
Saved movieId 208
Saved movieId 227
Saved movieId 236
Saved movieId 251
Saved movieId 260
Saved movieId 284
Saved movieId 286
Saved movieId 288
Saved movieId 292
Saved movieId 293
Saved movieId 303
Saved movieId 315
Saved movieId 316
Saved movieId 327
Saved movieId 330
Saved movieId 338
Saved movieId 349
Saved movieId 353
Saved movieId 360
Saved movieId 367


In [9]:
# Show the column labels of movies_df (read from data/links_action.csv earlier in the notebook).
print(movies_df.columns)


Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')


In [10]:
# Percentage of rows (movies) whose reviews field has content
total_reviews = 0      # data rows seen in the file
non_empty_reviews = 0  # data rows whose reviews field is not blank
# newline='' lets the csv module handle line breaks inside quoted fields itself
with open("data/reviews_final_one_AA.csv", 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header (tolerates a completely empty file)
    for row in reader:
        if not row:
            continue  # blank line, not a record
        total_reviews += 1
        # a row without a second column counts as having no reviews
        reviews = row[1] if len(row) > 1 else ""
        if reviews.strip():
            non_empty_reviews += 1

if total_reviews:
    print(f"Percentage of reviews with content: {non_empty_reviews / total_reviews * 100:.2f}%")
else:
    # avoid dividing by zero when the file holds no data rows
    print("No rows found in the reviews file; percentage is undefined.")

Percentage of reviews with content: 27.92%


In [13]:
# Average number of reviews per movie, counting only movies that have reviews
total_reviews_count = 0  # individual reviews across all movies
movies_with_reviews = 0  # rows whose reviews field is not blank
# newline='' lets the csv module handle line breaks inside quoted fields itself
with open("data/reviews_final_one_AA.csv", 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header (tolerates a completely empty file)
    for row in reader:
        # a blank line or a row without a second column has no reviews
        reviews = row[1] if len(row) > 1 else ""
        if reviews.strip():
            # reviews of one movie are stored joined with "|"
            review_list = reviews.split("|")
            total_reviews_count += len(review_list)
            movies_with_reviews += 1

# The average is undefined when no movie has reviews; use NaN instead of
# dividing by zero.
if movies_with_reviews:
    average_reviews_per_movie = total_reviews_count / movies_with_reviews
else:
    average_reviews_per_movie = float("nan")
print(f"Average number of reviews per movie (for movies with reviews): {average_reviews_per_movie:.2f}")

Average number of reviews per movie (for movies with reviews): 2.14
