In [2]:
import pandas as pd
import re

In [8]:
# Read the raw movie catalogue from disk and report its (rows, columns) size
raw_movies_csv = '../backend/data/raw/movies.csv'
movies = pd.read_csv(raw_movies_csv)

print(f"Original dataset shape: {movies.shape}")

Original dataset shape: (9742, 3)


In [10]:
# Count the null cells in each column of the movies frame
missing_per_column = movies.isnull().sum()
print("\nMissing values:")
print(missing_per_column)


Missing values:
movieId    0
title      0
genres     0
dtype: int64


In [12]:
# Select the rows whose genres field is the '(no genres listed)' placeholder and count them
lacks_genre = movies['genres'].eq('(no genres listed)')
no_genre = movies[lacks_genre]
print(f"\nMovies with no genres: {len(no_genre)}")


Movies with no genres: 34


In [14]:
# Flag every row that shares its movieId or its title with another row.
# keep=False marks all members of a duplicate group, not only the repeats.
shares_movie_id = movies.duplicated(subset=['movieId'], keep=False)
shares_title = movies.duplicated(subset=['title'], keep=False)
duplicates = movies[shares_movie_id | shares_title]
print(f"\nDuplicate entries: {len(duplicates)}")


Duplicate entries: 10


In [18]:
# Extract the release year from a movie title
def extract_year(title):
    """Return the four-digit year in trailing parentheses of ``title``, or None.

    Args:
        title: Movie title, e.g. ``"Toy Story (1995)"``.

    Returns:
        The year as an ``int`` when the title ends with ``(YYYY)``, optionally
        followed by whitespace; ``None`` when no such suffix is present or when
        ``title`` is not a string (e.g. a missing value).
    """
    # A missing title is not a str; re.search would raise TypeError on it
    if not isinstance(title, str):
        return None
    # \s* lets titles with stray trailing whitespace after the year still match
    match = re.search(r'\((\d{4})\)\s*$', title)
    return int(match.group(1)) if match else None

# Add a 'year' column parsed from each title, then count rows whose year lies outside 1900-2023.
# A missing year compares as False against both bounds, so such rows are not counted here.
movies['year'] = movies['title'].apply(extract_year)
release_year = movies['year']
year_out_of_range = release_year.lt(1900) | release_year.gt(2023)
unusual_years = movies[year_out_of_range]
print(f"\nMovies with unusual years: {len(unusual_years)}")


Movies with unusual years: 0


In [20]:
# Check number of genres per movie
movies['genre_count'] = movies['genres'].str.count('\|') + 1
genre_stats = movies['genre_count'].describe()
print("\nGenre count statistics:")
print(genre_stats)


Genre count statistics:
count    9742.000000
mean        2.266886
std         1.123249
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        10.000000
Name: genre_count, dtype: float64


In [22]:
# Suggest filtering criteria
print("\nSuggested filtering criteria:")
print("1. Remove movies with no genres")
print("2. Remove duplicate entries")
print("3. Remove movies with years before 1900 or after 2023")
print("4. Consider removing movies with too many genres (e.g., more than 5)")


Suggested filtering criteria:
1. Remove movies with no genres
2. Remove duplicate entries
3. Remove movies with years before 1900 or after 2023
4. Consider removing movies with too many genres (e.g., more than 5)


In [26]:
# Apply suggested filters
filtered_movies = movies[
    (movies['genres'] != '(no genres listed)') &
    (~movies.duplicated(subset=['movieId'], keep='first')) &
    (~movies.duplicated(subset=['title'], keep='first')) &
    (movies['year'].between(1900, 2023)) &
    (movies['genre_count'] <= 5)
]

print(f"\nFiltered dataset shape: {filtered_movies.shape}")
print(f"Removed {len(movies) - len(filtered_movies)} entries")


Filtered dataset shape: (9611, 5)
Removed 131 entries


In [28]:
# Save filtered dataset
filtered_movies.to_csv('../backend/data/processed/filtered_movies.csv', index=False)
print("\nFiltered dataset saved to '../backend/data/processed/filtered_movies.csv'")


Filtered dataset saved to '../backend/data/processed/filtered_movies.csv'
