In [1]:
import pandas as pd
import re

# Read the raw movies table from disk into a DataFrame
movies_csv_path = 'datasets/movies.csv'
movies_df = pd.read_csv(movies_csv_path)

# Function to extract year from the title
def extract_year(title):
    """Return the four-digit year that closes a title, e.g. "Heat (1995)" -> 1995.

    The year must be the last thing in the title, wrapped in parentheses;
    trailing whitespace after the closing parenthesis is tolerated so that the
    result stays consistent with the pattern used to strip the year from the
    title column.

    Args:
        title: The raw title. Non-string values (such as a missing value read
            from CSV) are treated as having no year.

    Returns:
        The year as an int, or None when the title does not end in "(YYYY)".
    """
    # Missing titles arrive as float NaN / None; re.search would raise on them.
    if not isinstance(title, str):
        return None
    # \s*$ lets a title such as "Foo (1994) " still yield its year.
    match = re.search(r'\((\d{4})\)\s*$', title)
    return int(match.group(1)) if match else None

# Derive the release year from the trailing "(YYYY)" of each title. Titles
# without a year give a missing value, so the column uses pandas' nullable
# integer dtype; a plain float column would be written out as "1995.0".
movies_df['year'] = movies_df['title'].apply(extract_year).astype('Int64')

# Drop the trailing "(YYYY)" and any whitespace around it from the title now
# that the year has its own column (vectorised instead of a per-row lambda).
movies_df['title'] = movies_df['title'].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

# Split the pipe-separated genres into one column per position, then melt to
# long form so that each row holds a single (movieId, genre) pair.
genres_expanded = movies_df['genres'].str.split('|', expand=True)
genres_expanded['movieId'] = movies_df['movieId']
melted_genres = (
    genres_expanded
    .melt(id_vars='movieId', value_name='genre')
    .dropna()                    # movies with fewer genres leave empty slots
    .drop(columns='variable')    # the positional column index is not needed
)

# Every row whose movieId occurs more than once (all occurrences are kept).
# The frame is only built for inspection; the visible code does not use it.
duplicate_movies = movies_df[movies_df.duplicated(subset='movieId', keep=False)]

# Save the cleaned movies_df DataFrame to a new CSV file
movies_df.to_csv('datasets/cleaned_movies.csv', index=False)

# Save the long-form genres DataFrame to a new CSV file
melted_genres.to_csv('datasets/cleaned_genres.csv', index=False)


In [2]:
import pandas as pd

# Read the raw ratings table from disk
ratings_df = pd.read_csv('datasets/ratings.csv')

# Swap the epoch-second timestamps for pandas datetimes
ratings_df = ratings_df.assign(
    timestamp=pd.to_datetime(ratings_df['timestamp'], unit='s')
)

# Keep only ratings inside the closed interval [0, 5]
# (the 0-5 scale is an assumption, not something checked against the data)
rating_in_range = ratings_df['rating'].between(0, 5)
ratings_df = ratings_df[rating_in_range]

# Write the cleaned ratings out, leaving the index behind
ratings_df.to_csv('datasets/cleaned_ratings.csv', index=False)


In [3]:
import pandas as pd

# Read the raw links table from disk
links_df = pd.read_csv('datasets/links.csv')

# Missing tmdbId values get the placeholder -1; once no gaps remain the
# column can be cast to a plain integer dtype
tmdb_ids_filled = links_df['tmdbId'].fillna(-1)
links_df['tmdbId'] = tmdb_ids_filled.astype(int)

# Write the cleaned links out, leaving the index behind
links_df.to_csv('datasets/cleaned_links.csv', index=False)


In [4]:
import pandas as pd

# Read the raw tags table from disk
tags_df = pd.read_csv('datasets/tags.csv')

# Rewrite two columns in one step:
#   timestamp -> epoch seconds parsed into pandas datetimes
#   tag       -> surrounding whitespace removed, then lower-cased
tags_df = tags_df.assign(
    timestamp=pd.to_datetime(tags_df['timestamp'], unit='s'),
    tag=tags_df['tag'].str.strip().str.lower(),
)

# Write the cleaned tags out, leaving the index behind
tags_df.to_csv('datasets/cleaned_tags.csv', index=False)
