# Explore and clean movie data

In [1]:
import pandas as pd

## movies.csv

In [2]:
# Load the raw movies table from raw_data/movies.csv
df_movies = pd.read_csv('raw_data/movies.csv')

In [3]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [4]:
df_movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [5]:
# Count the distinct movieId values; the column is unique when this equals the row count
df_movies['movieId'].nunique(dropna=False)

86537

In [6]:
# Count the missing values in each column of the movies table
df_movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
# Index the table by movieId. Reassign rather than mutate with inplace=True,
# and let verify_integrity raise if any movieId turns out to be duplicated.
df_movies = df_movies.set_index('movieId', verify_integrity=True)

In [8]:
df_movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
288967,State of Siege: Temple Attack (2021),Action|Drama
288971,Ouija Japan (2021),Action|Horror
288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
288977,Skinford: Death Sentence (2023),Crime|Thriller


### Separate the movie year from title

In [9]:
# Add a 'year' column holding the four-digit number found in parentheses in the title.
# The value is kept as a string; titles without a match get NaN.
df_movies['year'] = df_movies.title.str.extract(r'\((\d{4})\)', expand=False)

In [10]:
# Drop the parenthesised four-digit year from each title, then trim leftover whitespace
df_movies['title'] = df_movies['title'].str.replace(r'\((\d{4})\)', '', regex=True).str.strip()

In [11]:
# Normalise the genre labels to lowercase
df_movies['genres'] = df_movies.genres.str.lower()

In [12]:
# TODO: decide whether to split the pipe-delimited genres string into a list

In [13]:
df_movies

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,adventure|animation|children|comedy|fantasy,1995
2,Jumanji,adventure|children|fantasy,1995
3,Grumpier Old Men,comedy|romance,1995
4,Waiting to Exhale,comedy|drama|romance,1995
5,Father of the Bride Part II,comedy,1995
...,...,...,...
288967,State of Siege: Temple Attack,action|drama,2021
288971,Ouija Japan,action|horror,2021
288975,The Men Who Made the Movies: Howard Hawks,documentary,1973
288977,Skinford: Death Sentence,crime|thriller,2023


In [14]:
# Reorder the columns so that year sits between title and genres
df_movies = df_movies.loc[:, ['title', 'year', 'genres']]

In [15]:
df_movies

Unnamed: 0_level_0,title,year,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,1995,adventure|animation|children|comedy|fantasy
2,Jumanji,1995,adventure|children|fantasy
3,Grumpier Old Men,1995,comedy|romance
4,Waiting to Exhale,1995,comedy|drama|romance
5,Father of the Bride Part II,1995,comedy
...,...,...,...
288967,State of Siege: Temple Attack,2021,action|drama
288971,Ouija Japan,2021,action|horror
288975,The Men Who Made the Movies: Howard Hawks,1973,documentary
288977,Skinford: Death Sentence,2023,crime|thriller


## tags.csv

In [17]:
# Load the user-created tags so we can explore how many distinct tags there are
df_tags = pd.read_csv('raw_data/tags.csv')

In [18]:
df_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746
...,...,...,...,...
2328310,330923,176599,politically correct,1507547491
2328311,330933,3317,coming of age,1351279384
2328312,330933,3317,sexuality,1351279389
2328313,330947,5782,Not Luc Besson,1154110902


In [19]:
# Number of distinct tag values (missing values count as one distinct value)
df_tags['tag'].nunique(dropna=False)

153950

In [20]:
# Lowercase every tag so that case variants collapse into a single value,
# which may reduce the distinct-tag count
df_tags['tag'] = df_tags.tag.str.lower()

In [21]:
# Distinct tag values after lowercasing (missing values count as one distinct value)
df_tags['tag'].nunique(dropna=False)

143812

In [22]:
# Lowercasing reduced the distinct-tag count from 153950 to 143812 (a drop of 10138).
# Express that drop as a percentage of all tag rows. Note this is NOT the share
# of tags that are unique.
(153950 - 143812) / len(df_tags) * 100

0.4354221829949986

In [23]:
# Count the missing values in each column of the tags table
df_tags.isna().sum()

userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

In [24]:
# Very few rows are affected, so simply drop every row that contains a null
df_tags = df_tags.dropna(axis=0, how='any')

## ratings.csv

In [26]:
# Load the raw ratings table from raw_data/ratings.csv
df_ratings = pd.read_csv('raw_data/ratings.csv')

In [27]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [28]:
# Count the missing values in each column of the ratings table
df_ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

## links.csv

In [31]:
# Load the raw links table (movieId, imdbId, tmdbId) from raw_data/links.csv
df_links = pd.read_csv('raw_data/links.csv')

In [32]:
df_links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
86532,288967,14418234,845861.0
86533,288971,11162178,878958.0
86534,288975,70199,150392.0
86535,288977,23050520,1102551.0


## genome-tags.csv

In [34]:
# Load the genome tag lookup table (tagId, tag) from raw_data/genome-tags.csv
df_genometags = pd.read_csv('raw_data/genome-tags.csv')

In [35]:
df_genometags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [36]:
# Count the missing values in each column of the genome tags table
df_genometags.isna().sum()

tagId    0
tag      0
dtype: int64

In [37]:
# Normalise the genome tag labels to lowercase
df_genometags['tag'] = df_genometags.tag.str.lower()

## genome-scores.csv 

In [39]:
# Load the genome relevance scores (movieId, tagId, relevance) from raw_data/genome-scores.csv
df_genomescores = pd.read_csv('raw_data/genome-scores.csv')

In [40]:
df_genomescores

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.03200
1,1,2,0.02225
2,1,3,0.07000
3,1,4,0.05900
4,1,5,0.12300
...,...,...,...
18472123,288167,1124,0.09875
18472124,288167,1125,0.02950
18472125,288167,1126,0.02275
18472126,288167,1127,0.11225


In [41]:
# Count the missing values in each column of the genome scores table
df_genomescores.isna().sum()

movieId      0
tagId        0
relevance    0
dtype: int64