In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Clean the raw movie table and write the result to a new CSV.
# Pipeline: drop incomplete rows -> de-duplicate -> scale ratings ->
# one-hot encode genres -> save.

# Load the dataset
data = pd.read_csv('movies_data.csv')

# 1. Handle Missing Values
# Rows lacking any of the essential fields are unusable, so drop them.
data = data.dropna(subset=['title', 'release_year', 'rating'])

# A missing description is tolerable; substitute a placeholder string.
data['description'] = data['description'].fillna('No description available')

# Drop rows where poster_url is missing (optional)
data = data.dropna(subset=['poster_url'])

# 2. Remove Duplicates
# Keep the first row seen for each title. This runs BEFORE scaling so that
# rows which are about to be discarded cannot influence the min/max the
# scaler learns; the set of surviving rows is the same as when this step
# ran last, because the later steps never add, remove or retitle rows.
# NOTE(review): rows sharing a title but differing in release_year are
# treated as duplicates here - confirm that is intended.
data = data.drop_duplicates(subset='title', keep='first')

# 3. Normalize Numerical Data
# Rescale 'rating' with MinMaxScaler, fitted only on the rows that are kept.
scaler = MinMaxScaler()
data['rating'] = scaler.fit_transform(data[['rating']])

# 4. Encode Categorical Features
# Turn the ', '-separated genre string into a list of genre names.
data['genres'] = data['genres'].str.split(', ')

# One row per (movie, genre) -> indicator columns -> collapse back to one row
# per movie. max() rather than sum() keeps every indicator at 0/1 even when a
# genre is listed more than once for the same movie.
genres_one_hot = data['genres'].explode().str.get_dummies().groupby(level=0).max()
data = pd.concat([data, genres_one_hot], axis=1)

# 5. Save the Cleaned Data
data.to_csv('movies_data_cleaned.csv', index=False)
print("Cleaned data saved to movies_data_cleaned.csv")


Cleaned data saved to movies_data_cleaned.csv
