In [2]:
# ============================================================
# Movie Recommendation System (Content-Based)
# Phase 1: Dataset Loading & Initial Inspection
# ============================================================
#
# Objective of this phase:
# - Load the datasets correctly
# - Understand what each dataset contains
# - Inspect data structure, types, and basic statistics
# - Create a merged dataset for further analysis
#
# IMPORTANT:
# No modeling or similarity computation is done here.
# This phase is ONLY about understanding the data.
# ============================================================


# -----------------------------
# Import Required Libraries
# -----------------------------

import pandas as pd              # Pandas for data manipulation and analysis
import numpy as np               # NumPy for numerical operations
import matplotlib.pyplot as plt  # Matplotlib for plotting
import seaborn as sns            # Seaborn for statistical visualization
from scipy import stats          # Scipy for statistical operations

# Apply Seaborn's "whitegrid" theme. sns.set() changes matplotlib's global
# defaults, so the style applies to every figure drawn after this line.
sns.set(style="whitegrid")

# Ensure plots render inside the notebook
%matplotlib inline


# -----------------------------
# Load the Datasets
# -----------------------------

# Source files, relative to the notebook's working directory.
movies_path = "../data/movies.csv"
ratings_path = "../data/ratings.csv"

# Movie metadata. Columns:
#   movieId - unique identifier for each movie
#   title   - movie title with release year
#   genres  - pipe-separated genres (e.g., Action|Comedy)
movies_df = pd.read_csv(movies_path)

# User ratings. Columns:
#   userId    - unique identifier for each user
#   movieId   - movie being rated
#   rating    - rating given by the user (0.5 to 5.0)
#   timestamp - UNIX timestamp of the rating
ratings_df = pd.read_csv(ratings_path)


# -----------------------------
# Preview the Datasets
# -----------------------------

# Show the first rows of each dataset.
# A notebook cell only auto-renders its LAST expression, so a bare
# `df.head()` in the middle of a cell is evaluated and then thrown away.
# `display` (provided by the IPython kernel) renders each preview explicitly.
display(movies_df.head())
display(ratings_df.head())


# -----------------------------
# Dataset Structure & Data Types
# -----------------------------

# Print the schema summary (columns, non-null counts, dtypes, memory usage)
# for the movies frame first, then the ratings frame.
for frame in (movies_df, ratings_df):
    frame.info()


# -----------------------------
# Basic Statistical Summary
# -----------------------------

# Descriptive statistics for the ratings dataset.
# Helps understand:
# - Mean rating
# - Spread of ratings
# - Minimum and maximum values
#
# Wrapped in display(): this is not the last expression of the cell, so a
# bare `ratings_df.describe()` would compute the table and never show it.
display(ratings_df.describe())


# -----------------------------
# Inspect Genre Information
# -----------------------------

# Show the first 10 distinct values of the `genres` column.
# Genres are stored as a single string with '|' as the separator, so each
# distinct value is a genre *combination*, not an individual genre.
# display() is required because this is not the last expression in the cell.
display(movies_df['genres'].unique()[:10])


# -----------------------------
# Merge Movies and Ratings
# -----------------------------

# Merge datasets on movieId.
# This creates a unified dataset combining:
# - Movie metadata
# - User rating behavior
#
# how="inner" keeps only ratings whose movieId also appears in movies_df.
# validate="many_to_one" makes pandas raise a MergeError if movieId is not
# unique in movies_df; without it, a duplicated movie row would silently
# duplicate every rating for that movie in the merged (and saved) dataset.
merged_df = pd.merge(
    ratings_df,
    movies_df,
    on="movieId",
    how="inner",
    validate="many_to_one",
)

# Preview merged dataset (explicit display: not the cell's last expression).
display(merged_df.head())


# -----------------------------
# Basic Checks on Merged Dataset
# -----------------------------

# Basic sanity checks on the merged dataset. Each result is passed to
# display() because only a cell's final expression is rendered
# automatically, and the final statement here is to_csv(), which returns None.

# (rows, columns) of the merged dataset
display(merged_df.shape)

# Number of missing values per column
display(merged_df.isnull().sum())

# Persist the merged dataset for later phases. index=False omits the
# DataFrame index column; an existing file at this path is overwritten.
merged_df.to_csv("../data/merged_movies_ratings.csv", index=False)



# ============================================================
# End of Phase 1
# What we achieved:
# - Loaded datasets correctly
# - Understood structure and data types
# - Created analysis-ready merged dataset
# - Prepared ground for Exploratory Data Analysis (EDA)
# ============================================================


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
