Was previously format.py

In [1]:
import ast
import csv
import re
from collections import defaultdict

import pandas as pd

# Columns read from each CSV file; both selections contain 'id'.
movieFields = ['id', 'title', 'vote_average']
castFields = ['id', 'cast']

# movie metadata, limited to the selected columns
movieData = pd.read_csv(
    'archive/movies_metadata.csv',
    usecols=movieFields,
    skipinitialspace=True,
)
dfMovies = pd.DataFrame(movieData)

# credits, limited to the selected columns
castData = pd.read_csv(
    'archive/credits.csv',
    usecols=castFields,
    skipinitialspace=True,
)
dfCast = pd.DataFrame(castData)

# No join key is passed, so pandas joins on the columns the two frames share;
# with the selections above that is only 'id'. how='inner' keeps just the rows
# whose id is present in both frames.
df = dfMovies.merge(dfCast, how='inner')


# ---------------------------------------------------------------------------- #
# The following section formats the cast and creates a clean list of actor names.

unformattedNames = df['cast']

# main list of cast grouped by movies
cast_master_copy = []

# list of overall cast by individual names
actor_name_list = []

# Quote and bracket characters are deleted from every name. The later ID
# lookup strips brackets and apostrophes from a name before searching for it,
# so names stored here must not contain them or the lookup would miss.
# NOTE(review): this turns e.g. "O'Toole" into "OToole" - confirm whether
# apostrophes should be preserved end to end.
_NAME_STRIP_TABLE = str.maketrans("", "", "\"'[]")

# Used only when a cell cannot be parsed as a Python literal: grabs the text
# between "'name': " and the next comma.
_NAME_FALLBACK_PATTERN = re.compile(r"(?<='name': )(.*?)(?=,)")


def _extract_cast_names(raw_cast):
    """Return the cleaned actor names found in one raw 'cast' cell.

    The cell is parsed with ``ast.literal_eval`` and the 'name' entry of every
    dict in the result is collected. Parsing the literal, rather than
    converting regex matches to a string and splitting it again, keeps names
    that contain commas whole and prevents repr escape sequences (for example
    a backslash in front of an apostrophe) from leaking into the names.

    If the cell is not a valid Python literal, the names are extracted on a
    best-effort basis with ``_NAME_FALLBACK_PATTERN`` instead of failing.

    Entries without a string name, and names that are empty after cleaning,
    are skipped, so a movie with an empty cast yields an empty list.
    """
    try:
        members = ast.literal_eval(raw_cast)
    except (ValueError, SyntaxError):
        candidates = _NAME_FALLBACK_PATTERN.findall(raw_cast)
    else:
        if not isinstance(members, (list, tuple)):
            members = []
        candidates = [
            member.get('name')
            for member in members
            if isinstance(member, dict)
        ]

    names = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        cleaned = candidate.translate(_NAME_STRIP_TABLE)
        if cleaned:
            names.append(cleaned)
    return names


# reads through the cast for each movie
for unformattedNameString in unformattedNames:

    # grouped: the cleaned cast of the current movie
    grouped = _extract_cast_names(unformattedNameString)

    # adds formatted cast members to the per-movie list
    cast_master_copy.append(grouped)

    # actor_name_list: every actor of every movie in one flat list,
    # in the same order as they appear per movie
    actor_name_list.extend(grouped)

# assigns formatted cast to 'cast' column of DataFrame
df['cast'] = cast_master_copy
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section maps every entry in the list of actor names to an integer
# key (one key per distinct actor). It then builds two dictionaries that translate
# between those keys and the actor names.


# Hands out integer keys in first-seen order: looking up a name that has no
# key yet stores the current size of the table as its key. For example
# ['Tom Hanks', 'Will Smith', 'Tom Hanks', 'Jonah Hill', 'Tom Hanks', 'Will Smith', 'Jonah Hill']
# is turned into [0, 1, 0, 2, 0, 1, 2]
key_assignment = defaultdict(lambda: len(key_assignment))

# key of every entry in actor_name_list (repeated actors repeat their key)
keys = [key_assignment[actor_name] for actor_name in actor_name_list]

# lookup table from key to actor name
actor_dict = {
    actor_key: actor_name
    for actor_key, actor_name in zip(keys, actor_name_list)
}

# reverse lookup table from actor name to key
actor_dict_inv = dict(zip(actor_dict.values(), actor_dict.keys()))
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section uses the inverted actor dictionary to look up the actor names
# and put their respective dictionary values in a list for each movie.

cast_names_by_movie = df['cast']

# one list of actor keys per movie, in the same order as the rows of df
cast_ids_by_movie_master = []

for movie_cast in cast_names_by_movie:
    # Brackets and apostrophes are removed from each name before it is looked
    # up; .get() yields None for a name that has no entry in actor_dict_inv.
    movie_cast_ids = [
        actor_dict_inv.get(
            member.replace("[", "").replace("]", "").replace("\'", "")
        )
        for member in movie_cast
    ]
    cast_ids_by_movie_master.append(movie_cast_ids)

# stores the per-movie key lists next to the per-movie name lists
df['cast_ids'] = cast_ids_by_movie_master
# ---------------------------------------------------------------------------- #


# prints the DataFrame so the result can be checked by eye
print(df)

# Writes the formatted DataFrame to a new csv.
# `index` has to be the boolean False to leave the row index out of the file:
# any non-empty string, including "false", is truthy and keeps the index column.
df.to_csv("movies.csv", index=False)


# creates a csv of the actor dictionary
# w = csv.writer(open("actor_dict.csv", "w"))
# for key, val in actor_dict.items():
#     w.writerow([key, val])


           id                             title  vote_average  \
0      469172   Manuel on the Island of Wonders           0.0   
1      468707  Thick Lashes of Lauri Mäntyvaara           8.0   
2      467731       Tragedy in a Temporary Town           0.0   
3      465044                         Abduction           0.0   
4      464207         The Truth Is in the Stars           7.5   
...       ...                               ...           ...   
43014      11                         Star Wars           8.1   
43015       6                    Judgment Night           6.4   
43016       5                        Four Rooms           6.5   
43017       3               Shadows in Paradise           7.1   
43018       2                             Ariel           7.1   

                                                    cast  \
0      [Ruben de Freitas, Teresa Madruga, Fernando He...   
1      [Inka Haapamäki, Rosa Honkonen, Tiitus Rantala...   
2      [Lloyd Bridges, Jack Warden, Raf