# In [1]:
import pandas as pd
import re

# Load the four raw CSV tables from the local 'datasets' directory.
links_df = pd.read_csv('datasets/links.csv')
movies_df = pd.read_csv('datasets/movies.csv')
ratings_df = pd.read_csv('datasets/ratings.csv')
tags_df = pd.read_csv('datasets/tags.csv')

# Handling missing values in the Links dataset: replace missing tmdbId with 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection `links_df['tmdbId']`: the in-place form
# mutates an intermediate object, which pandas warns about and which does not
# update links_df when Copy-on-Write is enabled.
links_df['tmdbId'] = links_df['tmdbId'].fillna(0)

# Processing the genres field in the Movies dataset: turn the '|'-separated
# string into a list of genre names. The vectorised .str accessor treats the
# single-character separator literally and leaves missing cells as missing
# instead of raising on them.
movies_df['genres'] = movies_df['genres'].str.split('|')

# Helper that pulls the parenthesised four-digit year out of a title string.
# It only reads the title; it does not modify or return the title text.
def extract_year(title):
    """Return the first parenthesised four-digit number in *title* as an int.

    Args:
        title: Title text such as "Toy Story (1995)". Non-string values
            (for example ``None`` or the float NaN used for missing cells)
            are accepted and treated as "no year".

    Returns:
        The digits of the first "(dddd)" group converted to ``int``, or
        ``None`` when *title* is not a string or contains no such group.
    """
    # Guard against missing/non-text cells so re.search does not raise TypeError.
    if not isinstance(title, str):
        return None
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

# Derive a 'year' column from each title, then strip every "(dddd)" marker
# from the title text and trim the surrounding whitespace.
movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['title'] = movies_df['title'].apply(
    lambda raw_title: re.sub(r'(\(\d{4}\))', '', raw_title).strip()
)

# Converting timestamp fields in Ratings and Tags datasets: the values are
# interpreted as a count of seconds (unit='s') and turned into datetimes.
for _frame in (ratings_df, tags_df):
    _frame['timestamp'] = pd.to_datetime(_frame['timestamp'], unit='s')

# Save the cleaned datasets, one CSV per table, without the index column.
for _frame, _out_path in (
    (links_df, 'datasets/cleaned_links.csv'),
    (movies_df, 'datasets/cleaned_movies.csv'),
    (ratings_df, 'datasets/cleaned_ratings.csv'),
    (tags_df, 'datasets/cleaned_tags.csv'),
):
    _frame.to_csv(_out_path, index=False)
