# Create csv to match the movie model

## movies.csv

In [2]:
# id/title/year/avg_rate

In [3]:
import pandas as pd

In [5]:
# Load the raw movies table; later cells use its movieId, title and genres columns
df_movies = pd.read_csv(filepath_or_buffer='raw_data/movies.csv')

In [6]:
# commented out because setting movieId as the index causes problems when adding new columns
# df_movies.set_index('movieId', inplace=True)

In [7]:
# Create new column year: capture the four digits of the first "(YYYY)" group in each title.
# expand=False makes str.extract return a Series; titles without such a group get NaN.
df_movies['year'] = df_movies['title'].str.extract(r'\((\d{4})\)', expand=False)

In [8]:
# Remove year from title column: blank out every "(YYYY)" group, then trim the
# whitespace left behind at either end of the title.
title_without_year = df_movies['title'].replace(
    to_replace=r'\((\d{4})\)', value='', regex=True
)
df_movies['title'] = title_without_year.str.strip()

In [9]:
# Change genres to lowercase (assign replaces the existing column, keeping column order)
df_movies = df_movies.assign(genres=df_movies['genres'].str.lower())

In [10]:
# Reorder columns so the derived year sits between title and genres
column_order = ['movieId', 'title', 'year', 'genres']
df_movies = df_movies.loc[:, column_order]

In [11]:
# Inspect the movies frame after the year split, genre lowercasing and column reorder
df_movies

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,adventure|animation|children|comedy|fantasy
1,2,Jumanji,1995,adventure|children|fantasy
2,3,Grumpier Old Men,1995,comedy|romance
3,4,Waiting to Exhale,1995,comedy|drama|romance
4,5,Father of the Bride Part II,1995,comedy
...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,action|drama
86533,288971,Ouija Japan,2021,action|horror
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,documentary
86535,288977,Skinford: Death Sentence,2023,crime|thriller


## links.csv to add needed columns to the movies df
### commented out as we changed the model and this is no longer needed

In [12]:
# read in IMDB and TMDB links to add to movies_df
# df_link = pd.read_csv('links.csv')

In [13]:
# df_link

In [14]:
# df_link.isnull

In [15]:
# Fill null values with 0
# df_link['tmdbId'] = df_link['tmdbId'].fillna(0)

In [16]:
 # Extract IMDB column 
# imdbId_col = df_link["imdbId"]

In [17]:
# imdbId_col

In [18]:
# Add the extracted column 
# df_movies = pd.concat([df_movies, imdbId_col.rename("imdbId")], axis=1)

In [19]:
# df_movies

In [20]:
# Repeat for TMDB
# tmdbId_col = df_link["tmdbId"]

In [21]:
# df_movies

In [22]:
# df_movies = pd.concat([df_movies, tmdbId_col.rename("tmdbId")], axis=1)

In [23]:
# df_movies

In [24]:
# df_movies.dtypes

In [25]:
# Change TMDB to integer
# df_movies['tmdbId'] = df_movies['tmdbId'].astype(int)

In [26]:
# df_movies

In [27]:
# df_movies.dtypes

## ratings.csv to generate average rating

In [28]:
# Load the raw ratings; the per-movie average below uses its movieId and rating columns
df_ratings = pd.read_csv(filepath_or_buffer='raw_data/ratings.csv')

In [29]:
# Inspect the raw ratings as loaded from raw_data/ratings.csv
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [34]:
# Mean rating per movie. as_index=False keeps movieId as an ordinary column, giving a
# two-column frame (movieId, rating) that can be merged on movieId.
average_ratings = df_ratings.groupby('movieId', as_index=False)['rating'].mean()

In [35]:
# Inspect the per-movie mean rating (one row per movieId)
average_ratings

Unnamed: 0,movieId,rating
0,1,3.893508
1,2,3.278179
2,3,3.171271
3,4,2.868395
4,5,3.076957
...,...,...
83234,288967,3.500000
83235,288971,0.500000
83236,288975,4.000000
83237,288977,3.000000


In [36]:
# Attach the mean rating to each movie. A left join keeps exactly the rows of
# df_movies: movies without any ratings get NaN in 'rating', while movieIds that
# occur only in the ratings data can no longer add rows with an empty
# title/year/genres (which an outer join would carry into the exported CSV).
df_movies = pd.merge(df_movies, average_ratings, how='left', on='movieId')

In [37]:
# Inspect the movies frame with the mean 'rating' column merged in
df_movies

Unnamed: 0,movieId,title,year,genres,rating
0,1,Toy Story,1995,adventure|animation|children|comedy|fantasy,3.893508
1,2,Jumanji,1995,adventure|children|fantasy,3.278179
2,3,Grumpier Old Men,1995,comedy|romance,3.171271
3,4,Waiting to Exhale,1995,comedy|drama|romance,2.868395
4,5,Father of the Bride Part II,1995,comedy,3.076957
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,action|drama,3.500000
86533,288971,Ouija Japan,2021,action|horror,0.500000
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,documentary,4.000000
86535,288977,Skinford: Death Sentence,2023,crime|thriller,3.000000


In [38]:
# Round the average rating to one decimal place; every other column is left untouched
df_movies = df_movies.assign(rating=df_movies['rating'].round(1))

In [39]:
# Inspect the movies frame after rounding 'rating' to one decimal place
df_movies

Unnamed: 0,movieId,title,year,genres,rating
0,1,Toy Story,1995,adventure|animation|children|comedy|fantasy,3.9
1,2,Jumanji,1995,adventure|children|fantasy,3.3
2,3,Grumpier Old Men,1995,comedy|romance,3.2
3,4,Waiting to Exhale,1995,comedy|drama|romance,2.9
4,5,Father of the Bride Part II,1995,comedy,3.1
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,action|drama,3.5
86533,288971,Ouija Japan,2021,action|horror,0.5
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,documentary,4.0
86535,288977,Skinford: Death Sentence,2023,crime|thriller,3.0


In [40]:
# df_movies = df_movies[['movieId', 'title', 'year', 'rating', 'imdbId', 'tmdbId', 'genres']]

In [41]:
# df_movies

## Pull out genres and create a separate df/csv

In [42]:
# Keep only the movie key and its pipe-separated genre string for the genres export
genre_columns = ['movieId', 'genres']
df_genres = df_movies[genre_columns]

In [43]:
# Inspect the movieId/genres frame before it is written to CSV
df_genres

Unnamed: 0,movieId,genres
0,1,adventure|animation|children|comedy|fantasy
1,2,adventure|children|fantasy
2,3,comedy|romance
3,4,comedy|drama|romance
4,5,comedy
...,...,...
86532,288967,action|drama
86533,288971,action|horror
86534,288975,documentary
86535,288977,crime|thriller


In [44]:
# Write the genres table. index=True (the pandas default) is spelled out because the
# row index is saved as an unnamed first column ahead of movieId and genres.
# NOTE(review): confirm the consumer of this file expects that extra column.
df_genres.to_csv(path_or_buf='DB_models/DB_genres.csv', index=True)

In [45]:
# Drop genres (exported separately) and fix the column order for the movies export
df_movies = df_movies.loc[:, ['movieId', 'title', 'year', 'rating']]

In [46]:
# Inspect the final movies frame (movieId, title, year, rating)
df_movies

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.9
1,2,Jumanji,1995,3.3
2,3,Grumpier Old Men,1995,3.2
3,4,Waiting to Exhale,1995,2.9
4,5,Father of the Bride Part II,1995,3.1
...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,3.5
86533,288971,Ouija Japan,2021,0.5
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,4.0
86535,288977,Skinford: Death Sentence,2023,3.0


In [47]:
# Count missing values per column (isna performs the same check as isnull)
df_movies.isna().sum()

movieId       0
title         0
year        618
rating     3298
dtype: int64

In [48]:
# Write the movies table. index=True (the pandas default) is spelled out because the
# row index is saved as an unnamed first column ahead of movieId, title, year and rating.
# NOTE(review): confirm the consumer of this file expects that extra column.
df_movies.to_csv(path_or_buf='DB_models/DB_movies.csv', index=True)