# Explore and clean movie data

In [13]:
import pandas as pd

## movies.csv

In [14]:
# Load the movie catalogue from movies.csv in the notebook's working directory.
df_movies = pd.read_csv(filepath_or_buffer='movies.csv')

In [15]:
# Display the freshly loaded frame to inspect its columns and row count.
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [16]:
# Show the dtype of each column of df_movies.
df_movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [17]:
# Verify movieId is unique: fail loudly if any id repeats instead of relying on
# the reader to compare a printed count against the row count by eye.
assert df_movies['movieId'].is_unique, 'movies.csv contains duplicate movieId values'
# Number of distinct ids (equals the row count once the assertion has passed).
df_movies['movieId'].nunique()

86537

In [18]:
# Count missing values in each column
df_movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [19]:
# Index the frame by movieId. The guard makes the cell safe to re-run: once
# movieId is the index it is no longer a column, and calling set_index again
# would raise KeyError. verify_integrity=True rejects duplicate ids.
if df_movies.index.name != 'movieId':
    df_movies = df_movies.set_index('movieId', verify_integrity=True)

In [20]:
# Display the frame again to confirm movieId is now the index.
df_movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
288967,State of Siege: Temple Attack (2021),Action|Drama
288971,Ouija Japan (2021),Action|Horror
288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
288977,Skinford: Death Sentence (2023),Crime|Thriller


### Separate the movie year from the title

In [21]:
# Create new column year from the first parenthesised 4-digit group in the title.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, and the nullable Int64 dtype stores the year as a number (sortable,
# filterable) while keeping <NA> for titles that carry no year.
df_movies['year'] = pd.to_numeric(
    df_movies['title'].str.extract(r'\((\d{4})\)', expand=False)
).astype('Int64')

In [22]:
# Remove year from title column. The pattern also consumes the whitespace in
# front of "(YYYY)" so a year that is not at the very end of the title does not
# leave a double space behind; the final strip() trims the outer edges.
df_movies['title'] = (
    df_movies['title']
    .str.replace(r'\s*\(\d{4}\)', '', regex=True)
    .str.strip()
)

In [23]:
# Normalise the genres strings to lowercase
df_movies.loc[:, 'genres'] = df_movies['genres'].str.lower()

In [24]:
# TODO: decide whether the pipe-delimited genres string should be split into a list.

In [25]:
# Display the frame to check the new year column and the cleaned title/genres values.
df_movies

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,adventure|animation|children|comedy|fantasy,1995
2,Jumanji,adventure|children|fantasy,1995
3,Grumpier Old Men,comedy|romance,1995
4,Waiting to Exhale,comedy|drama|romance,1995
5,Father of the Bride Part II,comedy,1995
...,...,...,...
288967,State of Siege: Temple Attack,action|drama,2021
288971,Ouija Japan,action|horror,2021
288975,The Men Who Made the Movies: Howard Hawks,documentary,1973
288977,Skinford: Death Sentence,crime|thriller,2023


In [30]:
# Put the columns in reading order: title, then year, then genres.
df_movies = df_movies.loc[:, ['title', 'year', 'genres']]

In [31]:
# Display the frame to confirm the column order.
df_movies

Unnamed: 0_level_0,title,year,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,1995,adventure|animation|children|comedy|fantasy
2,Jumanji,1995,adventure|children|fantasy
3,Grumpier Old Men,1995,comedy|romance
4,Waiting to Exhale,1995,comedy|drama|romance
5,Father of the Bride Part II,1995,comedy
...,...,...,...
288967,State of Siege: Temple Attack,2021,action|drama
288971,Ouija Japan,2021,action|horror
288975,The Men Who Made the Movies: Howard Hawks,1973,documentary
288977,Skinford: Death Sentence,2023,crime|thriller


## tags.csv

In [None]:
# Load the user-created tags; the cells below look at how many distinct tags there are.
df_tags = pd.read_csv(filepath_or_buffer='tags.csv')

In [None]:
# Display the tags frame to inspect its columns and row count.
df_tags

In [None]:
# Number of distinct tag values (a missing tag counts as one value).
df_tags['tag'].nunique(dropna=False)

In [None]:
# Lowercase every tag so case variants collapse, which may reduce the unique count.
df_tags.loc[:, 'tag'] = df_tags['tag'].str.lower()

In [None]:
# Number of distinct tag values (a missing tag counts as one value).
df_tags['tag'].nunique(dropna=False)

In [None]:
# What percent are unique? Computed from the frame so the figure follows the
# data instead of hand-copied counts that go stale when tags.csv changes.
# NOTE(review): the original hard-coded 10138/2328315, presumably the unique-tag
# count over the row count — confirm that is the intended ratio.
# dropna=False counts a missing tag as one value, the same way Series.unique() does.
df_tags['tag'].nunique(dropna=False) / len(df_tags) * 100

In [None]:
# Count missing values in each column
df_tags.isna().sum()

In [None]:
# Only a few values are missing, so keep just the rows that have no nulls.
df_tags = df_tags[df_tags.notna().all(axis=1)]

## ratings.csv

In [None]:
# Load the ratings from ratings.csv in the notebook's working directory.
df_ratings = pd.read_csv(filepath_or_buffer='ratings.csv')

In [None]:
# Display the ratings frame to inspect its columns and row count.
df_ratings

In [None]:
# Count missing values in each column
df_ratings.isna().sum()

## links.csv

In [None]:
# Load links.csv from the notebook's working directory.
df_links = pd.read_csv(filepath_or_buffer='links.csv')

In [None]:
# Display the links frame to inspect its columns and row count.
df_links

## genome-tags.csv

In [None]:
# Load genome-tags.csv from the notebook's working directory.
df_genometags = pd.read_csv(filepath_or_buffer='genome-tags.csv')

In [None]:
# Display the genome-tags frame to inspect its columns and row count.
df_genometags

In [None]:
# Count missing values in each column
df_genometags.isna().sum()

In [None]:
# Normalise the genome tag strings to lowercase
df_genometags.loc[:, 'tag'] = df_genometags['tag'].str.lower()

## genome-scores.csv 

In [None]:
# Load genome-scores.csv from the notebook's working directory.
df_genomescores = pd.read_csv(filepath_or_buffer='genome-scores.csv')

In [None]:
# Display the genome-scores frame to inspect its columns and row count.
df_genomescores

In [None]:
# Count missing values in each column
df_genomescores.isna().sum()