# Create a csv to match our movie model

## movies.csv

In [1]:
# Transform and combine movies, links and ratings
# id/title/year/avg_rate/imdb_id/tmdb_id

In [2]:
import pandas as pd

In [3]:
df_movies = pd.read_csv('raw_data/movies.csv')

In [4]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [5]:
# Create new column with the year
df_movies['year'] = df_movies['title'].str.extract(r'\((\d{4})\)') 

In [6]:
# Remove year from title column
df_movies['title'] = df_movies['title'].replace(r'\((\d{4})\)', '', regex=True).str.strip()

In [7]:
# Drop genres, they wil be handled in a seperate notebook
df_movies = df_movies[['movieId','title', 'year']]

In [8]:
df_movies

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
86532,288967,State of Siege: Temple Attack,2021
86533,288971,Ouija Japan,2021
86534,288975,The Men Who Made the Movies: Howard Hawks,1973
86535,288977,Skinford: Death Sentence,2023


## links.csv to add imdb and tmdb id columns to df_movies

In [9]:
df_link = pd.read_csv('raw_data/links.csv')

In [10]:
df_link

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
86532,288967,14418234,845861.0
86533,288971,11162178,878958.0
86534,288975,70199,150392.0
86535,288977,23050520,1102551.0


In [11]:
df_link.isnull

<bound method DataFrame.isnull of        movieId    imdbId     tmdbId
0            1    114709      862.0
1            2    113497     8844.0
2            3    113228    15602.0
3            4    114885    31357.0
4            5    113041    11862.0
...        ...       ...        ...
86532   288967  14418234   845861.0
86533   288971  11162178   878958.0
86534   288975     70199   150392.0
86535   288977  23050520  1102551.0
86536   288983  11644948   940588.0

[86537 rows x 3 columns]>

In [12]:
# Fill null values with 0
df_link['tmdbId'] = df_link['tmdbId'].fillna(0)

In [13]:
# Extract IMDB column 
imdbId_col = df_link["imdbId"]

In [14]:
# Add the extracted column 
df_movies = pd.concat([df_movies, imdbId_col.rename("imdbId")], axis=1)

In [15]:
# Extract TMDB column 
tmdbId_col = df_link["tmdbId"]

In [16]:
# Add the extracted column 
df_movies = pd.concat([df_movies, tmdbId_col.rename("tmdbId")], axis=1)

In [17]:
df_movies

Unnamed: 0,movieId,title,year,imdbId,tmdbId
0,1,Toy Story,1995,114709,862.0
1,2,Jumanji,1995,113497,8844.0
2,3,Grumpier Old Men,1995,113228,15602.0
3,4,Waiting to Exhale,1995,114885,31357.0
4,5,Father of the Bride Part II,1995,113041,11862.0
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,14418234,845861.0
86533,288971,Ouija Japan,2021,11162178,878958.0
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,70199,150392.0
86535,288977,Skinford: Death Sentence,2023,23050520,1102551.0


In [18]:
# Change TMDB to integer
df_movies['tmdbId'] = df_movies['tmdbId'].astype(int)

In [19]:
df_movies

Unnamed: 0,movieId,title,year,imdbId,tmdbId
0,1,Toy Story,1995,114709,862
1,2,Jumanji,1995,113497,8844
2,3,Grumpier Old Men,1995,113228,15602
3,4,Waiting to Exhale,1995,114885,31357
4,5,Father of the Bride Part II,1995,113041,11862
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,14418234,845861
86533,288971,Ouija Japan,2021,11162178,878958
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,70199,150392
86535,288977,Skinford: Death Sentence,2023,23050520,1102551


## ratings.csv to generate average rating

In [20]:
df_ratings = pd.read_csv('raw_data/ratings.csv')

In [21]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [22]:
average_ratings = df_ratings.groupby('movieId')['rating'].mean().reset_index()

In [23]:
average_ratings

Unnamed: 0,movieId,rating
0,1,3.893508
1,2,3.278179
2,3,3.171271
3,4,2.868395
4,5,3.076957
...,...,...
83234,288967,3.500000
83235,288971,0.500000
83236,288975,4.000000
83237,288977,3.000000


In [24]:
df_movies = pd.merge(df_movies, average_ratings, how='outer',on='movieId')

In [25]:
df_movies

Unnamed: 0,movieId,title,year,imdbId,tmdbId,rating
0,1,Toy Story,1995,114709,862,3.893508
1,2,Jumanji,1995,113497,8844,3.278179
2,3,Grumpier Old Men,1995,113228,15602,3.171271
3,4,Waiting to Exhale,1995,114885,31357,2.868395
4,5,Father of the Bride Part II,1995,113041,11862,3.076957
...,...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,14418234,845861,3.500000
86533,288971,Ouija Japan,2021,11162178,878958,0.500000
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,70199,150392,4.000000
86535,288977,Skinford: Death Sentence,2023,23050520,1102551,3.000000


In [26]:
df_movies = df_movies.round({'rating': 1})

In [27]:
# Reorder columns
df_movies = df_movies[['movieId', 'title', 'year', 'rating', 'imdbId', 'tmdbId']]

In [28]:
# Rename columns to match database model
df_movies = df_movies.rename(columns={"movieId": "id", "rating": "avg_rate", "imdbId": "imdb_id", "tmdbId": "tmdb_id"})

In [29]:
df_movies

Unnamed: 0,id,title,year,avg_rate,imdb_id,tmdb_id
0,1,Toy Story,1995,3.9,114709,862
1,2,Jumanji,1995,3.3,113497,8844
2,3,Grumpier Old Men,1995,3.2,113228,15602
3,4,Waiting to Exhale,1995,2.9,114885,31357
4,5,Father of the Bride Part II,1995,3.1,113041,11862
...,...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,3.5,14418234,845861
86533,288971,Ouija Japan,2021,0.5,11162178,878958
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,4.0,70199,150392
86535,288977,Skinford: Death Sentence,2023,3.0,23050520,1102551


In [30]:
# Drop auto index and make movie id the index
df_movies.set_index('id', inplace=True)

In [31]:
df_movies

Unnamed: 0_level_0,title,year,avg_rate,imdb_id,tmdb_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,1995,3.9,114709,862
2,Jumanji,1995,3.3,113497,8844
3,Grumpier Old Men,1995,3.2,113228,15602
4,Waiting to Exhale,1995,2.9,114885,31357
5,Father of the Bride Part II,1995,3.1,113041,11862
...,...,...,...,...,...
288967,State of Siege: Temple Attack,2021,3.5,14418234,845861
288971,Ouija Japan,2021,0.5,11162178,878958
288975,The Men Who Made the Movies: Howard Hawks,1973,4.0,70199,150392
288977,Skinford: Death Sentence,2023,3.0,23050520,1102551


In [32]:
df_movies.isnull().sum()

title          0
year         618
avg_rate    3298
imdb_id        0
tmdb_id        0
dtype: int64

## Additional cleaning of titles needed

In [33]:
# To remove any parenthesized titles
df_movies['title'] = df_movies['title'].str.replace(r'\(.*\)', '', regex=True)

In [39]:
# Need to move articles and strip out the trailing commas

def move_articles(title):
    articles = ['The', 'An', 'A', 'Les', 'L\'', 'Das', 'Le', 'El', 'La', 'Da']
    words = title.split()
    
    # Check if there is more than one word in the title
    if len(words) > 1:
        # Check if the last word is a common word
        if words[-1] in articles:
            # Move the common word to the front without a comma
            return words[-1] + ' ' + ' '.join(words[:-1])
    
    return title.rstrip(', ')

In [46]:
# Apply to the 'title' column
df_movies['title'] = df_movies['title'].apply(move_articles)

In [45]:
# Examine the remaining movies with commas
df_problems = df_movies[df_movies['title'].str.contains(',')]

In [47]:
df_problems

Unnamed: 0_level_0,title,year,avg_rate,imdb_id,tmdb_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
40,"Cry, the Beloved Country",1995,3.7,112749,34615
119,"Steal Big, Steal Little",1995,2.8,114536,78406
203,"To Wong Foo, Thanks for Everything! Julie Newmar",1995,3.2,114682,9090
309,"Red Firecracker, Green Firecracker",1994,3.6,110769,159185
345,"The Adventures of Priscilla, Queen of the Desert",1994,3.6,109045,2759
...,...,...,...,...,...
287903,"New York Portrait, Chapter II",1981,2.0,1829722,259458
288007,"Film, the Living Record of Our Memory",2022,3.5,15439196,882844
288255,"God, the Universe and Everything Else",1988,3.5,1020960,32986
288289,"Ruby Gillman, Teenage Kraken",2023,2.8,27155038,1040148


In [48]:
df_problems.to_csv('DB_models/problems.csv')

In [49]:
# Uncomment if needed to write csv file
df_movies.to_csv('DB_models/DB_movies.csv')