Welcome to our explainer!

In this Jupyter Notebook file, you will find all code related to the final project for group XX in Social Graphs & Interactions. Please open the sections below to examine the code.

# Scraping section

## Packages

In [None]:
import os
import csv
import time
import pandas as pd
import requests

## Functions for scraping and storing the data

In [None]:
def fetch_movie_data(tmdb_id, i, TMDB_API_KEY):
    '''
    Fetches movie review data from the TMDB API for a given tmdb_id.

    Parameters:
        tmdb_id: TMDB movie identifier, inserted into the request path.
        i: running index supplied by the caller; only used for throttling.
        TMDB_API_KEY: key sent as the ``api_key`` query parameter.

    Returns:
        The decoded JSON body of the response. The HTTP status code is not
        checked, so an error payload is returned to the caller as-is.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
        ValueError: (or a subclass) if the response body is not valid JSON.
    '''
    # Short pause on every 39th index.
    # NOTE(review): the original note here cites a limit of 40 requests per
    # 10 seconds; a 0.25 s pause once per 39 indices does not enforce that on
    # its own -- confirm the intended pacing against the current TMDB limits.
    if i % 39 == 0:
        time.sleep(0.25)

    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/reviews"
    # Let requests build the query string so the values are URL-encoded.
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}

    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, params=params, timeout=30)
    return response.json()


def write_to_csv(row, filename="ml-latest/reviews.csv"):
    '''
    Appends one record to a csv file.

    Parameters:
        row: sequence of field values forming a single csv record.
        filename: path of the csv file; it is opened in append mode, so it
            is created if it does not exist yet.
    '''
    # newline="" hands line-ending control to the csv module.
    with open(filename, mode="a", encoding="utf-8", newline="") as handle:
        csv.writer(handle).writerow(row)


def get_last_row_movie(filename="ml-latest/reviews.csv"):
    '''
    Returns the movieId of the last data row in the csv file.

    The file is parsed with the csv module rather than split on raw lines,
    so a quoted field containing a line break or a comma cannot be mistaken
    for a separate row. Rows are streamed one at a time instead of loading
    the whole file into memory.

    Parameters:
        filename: path of the csv file to inspect.

    Returns:
        The first field of the last row whose first field parses as an
        integer, or 0 when there is no such row (empty file, or a file that
        only contains the header).

    Raises:
        FileNotFoundError: if the file does not exist.
    '''
    last_movie_id = 0
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        for record in csv.reader(f):
            # Blank lines are parsed as empty records.
            if not record:
                continue
            try:
                last_movie_id = int(record[0])
            except ValueError:
                # Header row (or a malformed row): keep the previous id.
                continue
    return last_movie_id


def process_movie_data(data, TMDB_API_KEY, file_path="ml-latest/reviews.csv", max_reviews=5):
    '''
    Fetches TMDB reviews for every movie in ``data`` and appends them to a csv.

    One row ``[movieId, reviews]`` is appended per movie, where ``reviews``
    is the cleaned text of up to ``max_reviews`` reviews joined with "|".
    Each row is written as soon as it is fetched, so an interrupted run can
    be resumed by calling the function again with the same ``file_path``.

    Parameters:
        data: DataFrame iterated with ``itertuples(index=False)``; every row
            is unpacked as (movieId, imdbId, tmdbId), so it must have exactly
            these three columns in this order.
        TMDB_API_KEY: API key forwarded to ``fetch_movie_data``.
        file_path: csv file to create or append to.
        max_reviews: maximum number of reviews stored per movie.

    Notes:
        When ``file_path`` already exists, every row whose movieId is less
        than or equal to the last stored movieId is skipped. Resuming is
        therefore only complete if ``data`` is ordered by ascending movieId.
    '''
    # check if csv exists
    if os.path.exists(file_path):
        last_movie_id = get_last_row_movie(file_path)
        print(f"Resuming from movieId: {last_movie_id}")
    else:
        last_movie_id = 0
        # write header
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(["movieId", "reviews"])
        print("Creating new reviews.csv file")

    for i, (movieId, _imdb_id, tmdbId) in enumerate(data.itertuples(index=False)):

        # skip already processed movies
        if movieId <= last_movie_id:
            continue

        # skip NaN tmdb id
        if pd.isna(tmdbId):
            continue

        # get review data (i is the row index, used for throttling)
        review_data = fetch_movie_data(int(tmdbId), i, TMDB_API_KEY)

        # An error payload has no "results" key; store an empty review field.
        reviews_raw = review_data.get("results") or []
        reviews_texts = [
            _clean_review_text(review.get("content"))
            for review in reviews_raw[:max_reviews]
        ]

        # write to csv
        write_to_csv([movieId, "|".join(reviews_texts)], file_path)

        print(f"Saved movieId {movieId}")


# Line breaks and the "|" separator become spaces; commas and apostrophes
# are dropped. This keeps every stored record on one physical line and keeps
# "|" unambiguous as the separator between reviews.
_REVIEW_TRANSLATION = str.maketrans({
    "\n": " ",
    "\r": " ",
    "|": " ",
    ",": None,
    "'": None,
})


def _clean_review_text(content):
    '''
    Returns review text flattened so it can be joined with "|" on one line.

    A missing or null ``content`` is treated as an empty string.
    '''
    return (content or "").translate(_REVIEW_TRANSLATION)


In [None]:
# The API key is read from the environment so that the secret is not stored
# in the notebook. A key that was previously committed to source should be
# revoked and replaced.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable before running the scraper."
    )

# Scrape reviews for every movie listed in the links file; rerunning this
# cell resumes from the last movieId already stored in the output csv.
movies_df = pd.read_csv("data/links_action.csv")
process_movie_data(movies_df, TMDB_API_KEY=TMDB_API_KEY, file_path="data/reviews_final_one_AA.csv")

# Graph section

## Packages

In [None]:
import pandas as pd
import itertools
import networkx as nx
import matplotlib.pyplot as plt


## Initialization of data and preprocessing

In [None]:
def _split_on_pipe(text):
    '''
    Splits a "|"-separated string into its stripped, non-empty parts.
    '''
    pieces = (piece.strip() for piece in text.split("|"))
    return [piece for piece in pieces if piece != ""]


# Load the overview table and the movie table
df = pd.read_csv("data/overview.csv")
movies_df = pd.read_csv("data/movies.csv")

# Missing cast entries become empty strings so that every value can be split
cast_as_text = df["cast_names"].fillna("")
df["cast_names"] = cast_as_text.astype(str)

# Turn the "|"-separated cast string into a list of names
df["cast_list"] = df["cast_names"].apply(_split_on_pipe)

# Attach the genres of each movie to the overview rows (left join on movieId)
genre_columns = movies_df[["movieId", "genres"]]
merged_df = df.merge(genre_columns, on="movieId", how="left")

# Rows without a match get an empty genre string, i.e. an empty genre list
merged_df["genres"] = merged_df["genres"].fillna("")
merged_df["genre_list"] = merged_df["genres"].apply(_split_on_pipe)

# Keep only the movies tagged with the target genre
target_genre = "Action"
has_target_genre = merged_df["genre_list"].apply(lambda genres: target_genre in genres)
genre_df = merged_df[has_target_genre].copy()