# 02 - Data Cleaning & Preprocessing

Notebook này thực hiện làm sạch và preprocessing dữ liệu phim.

## Mục Tiêu
- Xử lý missing values
- Loại bỏ duplicates
- Xử lý outliers
- Feature engineering
- Text vectorization (TF-IDF)

## 1. Import Libraries

In [None]:
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_processing.cleaner import MovieDataCleaner
from data_processing.preprocessor import MovieDataPreprocessor

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("✅ Libraries imported successfully")

## 2. Load Data

In [None]:
# Load raw data
data_dir = '../data/raw/ml-latest-small'

movies = pd.read_csv(f'{data_dir}/movies.csv')
ratings = pd.read_csv(f'{data_dir}/ratings.csv')
tags = pd.read_csv(f'{data_dir}/tags.csv')
links = pd.read_csv(f'{data_dir}/links.csv')

print(f"Loaded {len(movies)} movies, {len(ratings)} ratings, {len(tags)} tags")
print(f"\nMovies shape: {movies.shape}")
print(f"Ratings shape: {ratings.shape}")

## 3. Data Quality Assessment

In [None]:
cleaner = MovieDataCleaner(verbose=True)

print("=" * 50)
print("MISSING VALUES CHECK")
print("=" * 50)

print("\nMovies:")
missing_movies = cleaner.check_missing_values(movies)
if len(missing_movies) > 0:
    print(missing_movies)
else:
    print("No missing values found")

print("\nRatings:")
missing_ratings = cleaner.check_missing_values(ratings)
if len(missing_ratings) > 0:
    print(missing_ratings)
else:
    print("No missing values found")

In [None]:
print("=" * 50)
print("DUPLICATES CHECK")
print("=" * 50)

print(f"\nMovies duplicates (by movieId): {movies['movieId'].duplicated().sum()}")
print(f"Movies duplicates (all columns): {movies.duplicated().sum()}")

print(f"\nRatings duplicates (all columns): {ratings.duplicated().sum()}")

In [None]:
print("=" * 50)
print("DATA INFO")
print("=" * 50)

print("\nMovies Info:")
movies.info()

print("\nRatings Info:")
ratings.info()

## 4. Clean Movies Data

In [None]:
print("=" * 50)
print("CLEANING MOVIES DATA")
print("=" * 50)

movies_clean = movies.copy()

# 1. Remove duplicates nếu có
if movies_clean.duplicated(subset=['movieId']).sum() > 0:
    movies_clean = cleaner.remove_duplicates(movies_clean, subset=['movieId'])

# 2. Handle missing genres (nếu có)
if movies_clean['genres'].isnull().sum() > 0:
    movies_clean = cleaner.handle_missing_values(
        movies_clean, 
        strategy='constant', 
        columns=['genres'], 
        fill_value='(no genres listed)'
    )

print(f"\nCleaned movies: {len(movies_clean)} rows")
movies_clean.head()

## 5. Feature Engineering - Movies

In [None]:
print("=" * 50)
print("FEATURE ENGINEERING")
print("=" * 50)

preprocessor = MovieDataPreprocessor(verbose=True)

# 1. Extract year from title
movies_clean = preprocessor.extract_year_from_title(movies_clean)

# 2. Clean title (remove year)
movies_clean = preprocessor.clean_title(movies_clean)

# 3. Parse genres
movies_clean = preprocessor.parse_genres(movies_clean)

# 4. Create genre features
movies_clean = preprocessor.create_genre_features(movies_clean)

# 5. Create temporal features
movies_clean = preprocessor.create_temporal_features(movies_clean)

print("\nNew columns added:")
new_cols = [col for col in movies_clean.columns if col not in movies.columns]
print(new_cols)

movies_clean.head()

## 6. Add Rating Features

In [None]:
print("=" * 50)
print("RATING FEATURES")
print("=" * 50)

# Create rating features
movies_enriched = preprocessor.create_rating_features(movies_clean, ratings)

print("\nRating features added:")
rating_cols = ['avg_rating', 'std_rating', 'num_ratings', 'popularity', 'rating_confidence']
print(rating_cols)

print("\nSample with rating features:")
movies_enriched[['title_clean', 'year', 'avg_rating', 'num_ratings', 'popularity']].head(10)

## 7. Handle Outliers in Ratings

In [None]:
print("=" * 50)
print("OUTLIER DETECTION")
print("=" * 50)

# Check for outliers in num_ratings
outliers = cleaner.detect_outliers_iqr(movies_enriched, 'num_ratings')

print(f"\nMovies with outlier rating counts:")
print(movies_enriched[outliers][['title_clean', 'year', 'num_ratings', 'avg_rating']].sort_values('num_ratings', ascending=False).head(10))

In [None]:
# Visualize rating distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of num_ratings
axes[0].hist(movies_enriched['num_ratings'], bins=50, edgecolor='black')
axes[0].set_xlabel('Number of Ratings')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Rating Counts')
axes[0].set_yscale('log')

# Boxplot
axes[1].boxplot(movies_enriched['num_ratings'])
axes[1].set_ylabel('Number of Ratings')
axes[1].set_title('Boxplot of Rating Counts')

plt.tight_layout()
plt.show()

print("\nNote: Outliers are expected (popular movies have many more ratings)")
print("We will NOT remove these outliers as they represent real popular movies.")

## 8. Text Vectorization (TF-IDF)

In [None]:
print("=" * 50)
print("TEXT VECTORIZATION")
print("=" * 50)

# Create combined features (title + genres)
movies_enriched = preprocessor.create_combined_text_features(
    movies_enriched, 
    columns=['title_clean', 'genres']
)

print("\nSample combined features:")
print(movies_enriched[['title_clean', 'genres', 'combined_features']].head())

In [None]:
# TF-IDF vectorization
tfidf_matrix, tfidf_vectorizer = preprocessor.vectorize_text_tfidf(
    movies_enriched['combined_features'],
    max_features=200,
    ngram_range=(1, 2)
)

print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")

print("\nTop 20 TF-IDF features:")
print(tfidf_vectorizer.get_feature_names_out()[:20])

## 9. One-Hot Encode Genres

In [None]:
print("=" * 50)
print("ONE-HOT ENCODING GENRES")
print("=" * 50)

# One-hot encode genres
movies_with_genres, mlb = preprocessor.encode_genres_onehot(movies_enriched)

print(f"\nGenres one-hot encoded: {len(mlb.classes_)}")
print(f"Genre classes: {mlb.classes_}")

print("\nSample with genre encoding:")
genre_cols = [col for col in movies_with_genres.columns if col.startswith('genre_')]
movies_with_genres[['title_clean'] + genre_cols[:5]].head()

## 10. Data Summary

In [None]:
print("=" * 50)
print("FINAL DATASET SUMMARY")
print("=" * 50)

print(f"\nTotal movies: {len(movies_enriched)}")
print(f"Total columns: {len(movies_enriched.columns)}")

print("\nColumn groups:")
print(f"  Original: movieId, title, genres")
print(f"  Extracted: year, title_clean")
print(f"  Genre features: {len([c for c in movies_enriched.columns if c.startswith('is_')])}")
print(f"  Rating features: avg_rating, std_rating, num_ratings, popularity, rating_confidence")
print(f"  Temporal features: movie_age, decade, era")

print("\nMissing values after cleaning:")
missing_final = cleaner.check_missing_values(movies_enriched)
if len(missing_final) > 0:
    print(missing_final)
else:
    print("No missing values!")

print("\nData quality:")
print(f"  Duplicates: {movies_enriched.duplicated().sum()}")
print(f"  Movies with ratings: {(movies_enriched['num_ratings'] > 0).sum()}")
print(f"  Movies with year: {movies_enriched['year'].notna().sum()}")

## 11. Save Processed Data

In [None]:
# Create processed directory
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save movies enriched
movies_file = os.path.join(processed_dir, 'movies_enriched.csv')
movies_enriched.to_csv(movies_file, index=False)
print(f"✅ Saved: {movies_file}")

# Save movies with one-hot genres (for some models)
movies_genres_file = os.path.join(processed_dir, 'movies_with_genres.csv')
movies_with_genres.to_csv(movies_genres_file, index=False)
print(f"✅ Saved: {movies_genres_file}")

# Save ratings (copy to processed)
ratings_file = os.path.join(processed_dir, 'ratings.csv')
ratings.to_csv(ratings_file, index=False)
print(f"✅ Saved: {ratings_file}")

# Save TF-IDF matrix
import pickle
tfidf_file = os.path.join(processed_dir, 'tfidf_matrix.pkl')
with open(tfidf_file, 'wb') as f:
    pickle.dump({
        'matrix': tfidf_matrix,
        'vectorizer': tfidf_vectorizer
    }, f)
print(f"✅ Saved: {tfidf_file}")

print("\n" + "=" * 50)
print("✅ DATA CLEANING & PREPROCESSING COMPLETED!")
print("=" * 50)

## Summary

### Completed Tasks:
1. ✅ **Missing Values**: Checked and handled (none found in MovieLens)
2. ✅ **Duplicates**: Checked and removed if any
3. ✅ **Outliers**: Detected in rating counts (kept as they represent popular movies)
4. ✅ **Feature Engineering**:
   - Extracted year from title
   - Parsed genres
   - Created genre features (binary)
   - Created rating features (avg, std, count, popularity)
   - Created temporal features (age, decade, era)
5. ✅ **Text Vectorization**:
   - TF-IDF on combined features (title + genres)
   - One-hot encoding for genres

### Files Created:
- `data/processed/movies_enriched.csv` - Main dataset with all features
- `data/processed/movies_with_genres.csv` - Dataset with one-hot encoded genres
- `data/processed/ratings.csv` - Ratings data
- `data/processed/tfidf_matrix.pkl` - TF-IDF matrix and vectorizer

### Next Steps:
- Phase 3: EDA & Visualization
- Phase 4: Model Building