# 01 - Data Collection: MovieLens Dataset

Notebook này thực hiện thu thập dữ liệu phim từ MovieLens dataset.

## Mục Tiêu
- Download MovieLens dataset
- Explore cấu trúc dữ liệu
- Verify yêu cầu: ≥2000 items, ≥5 features
- Document findings

## 1. Import Libraries

In [None]:
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_processing.collector import MovieDataCollector

# Render every column, and up to 100 rows, when a DataFrame is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Grid-backed seaborn theme and a wider default figure for later plots
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully")

## 2. Download Dataset

In [None]:
# Raw downloads are kept under ../data/raw (relative to this notebook)
collector = MovieDataCollector(data_dir='../data/raw')

# 'small' is the development-sized variant; the other documented
# choices are '25m' and 'latest'
dataset_size = 'small'
dataset_dir = collector.download_dataset(dataset_size)

print(f"\nDataset location: {dataset_dir}")

## 3. Load Data

In [None]:
# Load every table of the downloaded dataset through the collector.
# NOTE: tags and links may be None - the exploration cells further down
# check `is not None` before touching them.
movies = collector.load_movies(dataset_dir)
ratings = collector.load_ratings(dataset_dir)
tags = collector.load_tags(dataset_dir)
links = collector.load_links(dataset_dir)

## 4. Dataset Overview

In [None]:
# Ask the collector for its summary of the dataset and print it as a
# simple "name: value" report framed by two rules
info = collector.get_dataset_info(dataset_dir)

rule = "=" * 50
print(rule)
print("DATASET INFORMATION")
print(rule)
for field, field_value in info.items():
    print(f"{field}: {field_value}")

## 5. Explore Movies Data

In [None]:
# Quick look at the movies table: dimensions, column names, first 10 rows
movie_columns = list(movies.columns)

print("\nMovies DataFrame:")
print(f"Shape: {movies.shape}")
print(f"Columns: {movie_columns}")
movies.head(10)

In [None]:
# Column dtypes and non-null counts; DataFrame.info() prints its own report
print("\nData Info:")
movies.info()

In [None]:
# Row count next to the distinct-ID count reveals duplicated movieId
# values; the per-column null counts show where data is missing
total_movies = len(movies)
unique_movie_ids = movies['movieId'].nunique()
missing_per_column = movies.isnull().sum()

print("\nBasic Statistics:")
print(f"Total movies: {total_movies}")
print(f"Unique movie IDs: {unique_movie_ids}")
print(f"Missing values:\n{missing_per_column}")

In [None]:
# Peek at 5 randomly chosen rows; no random_state is passed, so the
# selection differs from run to run
print("\nSample movies:")
movies.sample(5)

## 6. Explore Ratings Data

In [None]:
# Quick look at the ratings table: dimensions, column names, first 10 rows
rating_columns = list(ratings.columns)

print("\nRatings DataFrame:")
print(f"Shape: {ratings.shape}")
print(f"Columns: {rating_columns}")
ratings.head(10)

In [None]:
# Volume and coverage of the ratings table
rating_count = len(ratings)
user_count = ratings['userId'].nunique()
rated_movie_count = ratings['movieId'].nunique()

print("\nRatings Statistics:")
print(f"Total ratings: {rating_count}")
print(f"Unique users: {user_count}")
print(f"Unique movies rated: {rated_movie_count}")

# How often each rating value occurs, ordered by the rating value itself
rating_histogram = ratings['rating'].value_counts().sort_index()
print("\nRating distribution:")
print(rating_histogram)

In [None]:
# Descriptive summary of the rating column, rendered as the cell output
ratings['rating'].describe()

## 7. Explore Tags Data

In [None]:
# Tags are optional: report their absence first, otherwise summarise them
if tags is None:
    print("No tags data available")
else:
    print("\nTags DataFrame:")
    print(f"Shape: {tags.shape}")
    print(f"Columns: {list(tags.columns)}")
    display(tags.head(10))

    # Overall volume, vocabulary size and the ten most frequent tags
    tag_column = tags['tag']
    print(f"\nTotal tags: {len(tags)}")
    print(f"Unique tags: {tag_column.nunique()}")
    print("\nMost common tags:")
    print(tag_column.value_counts().head(10))

## 8. Explore Links Data

In [None]:
# Links are optional: report their absence first, otherwise summarise them
if links is None:
    print("No links data available")
else:
    print("\nLinks DataFrame:")
    print(f"Shape: {links.shape}")
    print(f"Columns: {list(links.columns)}")
    display(links.head(10))

    # Count how many rows carry a non-null external identifier
    imdb_present = links['imdbId'].notna().sum()
    tmdb_present = links['tmdbId'].notna().sum()
    print(f"\nIMDb IDs: {imdb_present}")
    print(f"TMDB IDs: {tmdb_present}")

## 9. Extract Features from Movies

Extract additional features như year từ title và parse genres.

In [None]:
# Titles are expected to embed the release year as "(YYYY)"
year_capture = r'\((\d{4})\)'
year_with_space = r'\s*\(\d{4}\)'

# The first "(YYYY)" group found in the title becomes the year; titles
# without one end up as NaN after the numeric conversion
extracted_year = movies['title'].str.extract(year_capture, expand=False)
movies['year'] = pd.to_numeric(extracted_year, errors='coerce')

# Drop every "(YYYY)" group together with the whitespace in front of it
movies['title_clean'] = movies['title'].str.replace(year_with_space, '', regex=True)

print("\nExtracted features:")
movies[['title', 'title_clean', 'year', 'genres']].head(10)

In [None]:
# The genres column is a '|'-separated string: split it into lists, then
# flatten to one row per (movie, genre) pair before counting
genre_lists = movies['genres'].str.split('|')
all_genres = genre_lists.explode()
genre_counts = all_genres.value_counts()

print("\nGenre distribution:")
print(genre_counts)

## 10. Merge Data for Rich Features

In [None]:
# Merge movies with average ratings
movie_stats = ratings.groupby('movieId').agg({
    'rating': ['mean', 'count']
}).reset_index()

movie_stats.columns = ['movieId', 'avg_rating', 'num_ratings']

# Merge with movies
movies_enriched = movies.merge(movie_stats, on='movieId', how='left')

# Fill NaN ratings (movies with no ratings yet)
movies_enriched['avg_rating'] = movies_enriched['avg_rating'].fillna(0)
movies_enriched['num_ratings'] = movies_enriched['num_ratings'].fillna(0)

print("\nEnriched Movies DataFrame:")
print(f"Shape: {movies_enriched.shape}")
movies_enriched.head(10)

## 11. Verify Requirements

Kiểm tra xem dataset có đáp ứng yêu cầu không:
- Dataset ≥ 2,000 items
- Có ít nhất 5 features

In [None]:
# Verify the two project requirements and report an overall verdict that
# reflects the individual checks (rather than always claiming success).
print("="*50)
print("REQUIREMENTS VERIFICATION")
print("="*50)

# Requirement 1: at least 2,000 items
num_movies = len(movies_enriched)
items_ok = num_movies >= 2000
print(f"\n1. Number of items: {num_movies}")
if items_ok:
    print("   PASS: Dataset has ≥ 2,000 items")
else:
    print("   FAIL: Dataset has < 2,000 items")

# Requirement 2: at least 5 features. Only columns that really exist in
# the enriched DataFrame are counted, so a failed earlier step cannot be
# hidden by the hard-coded list.
expected_features = ['movieId', 'title_clean', 'genres', 'year', 'avg_rating', 'num_ratings']
features = [feat for feat in expected_features if feat in movies_enriched.columns]
missing_features = [feat for feat in expected_features if feat not in movies_enriched.columns]

print(f"\n2. Available features ({len(features)}):")
for i, feat in enumerate(features, 1):
    print(f"   {i}. {feat}")
if missing_features:
    print(f"   Missing expected columns: {missing_features}")

features_ok = len(features) >= 5
if features_ok:
    print("\n   PASS: Dataset has ≥ 5 features")
else:
    print("\n   FAIL: Dataset has < 5 features")

# Overall verdict: success is only announced when every check passed
print("\n" + "="*50)
if items_ok and features_ok:
    print("ALL REQUIREMENTS MET!")
else:
    print("REQUIREMENTS NOT MET - see FAIL lines above")
print("="*50)

## 12. Save Processed Data

In [None]:
# Create processed data directory
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save enriched movies data
output_file = os.path.join(processed_dir, 'movies_enriched.csv')
movies_enriched.to_csv(output_file, index=False)
print(f"\nSaved enriched movies to: {output_file}")

# Save ratings
ratings_file = os.path.join(processed_dir, 'ratings.csv')
ratings.to_csv(ratings_file, index=False)
print(f"Saved ratings to: {ratings_file}")

print("\nData collection completed successfully!")

## 13. Summary

### Dataset Statistics
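- **Total Movies:** `len(movies)` — printed in section 5 ("Total movies")
- **Total Ratings:** `len(ratings)` — printed in section 6 ("Total ratings")
- **Total Users:** `ratings['userId'].nunique()` — printed in section 6 ("Unique users")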
- **Features:** 6+ (movieId, title, genres, year, avg_rating, num_ratings)

### Next Steps
1. Data collection done
2. Data cleaning (notebook 02)
3. EDA & visualization (notebook 03)
4. Model building (notebook 04)
5. Model evaluation (notebook 05)