## **Movies Data Ingestion**

### **Exploring**

In [1]:
import pandas as pd
# Location of the raw movies CSV (absolute path on the author's machine).
MOVIES_CSV_PATH = r"C:\Users\Shahe\movie-recommender\data\movie.csv"

# Load the raw movies table and preview its first rows.
movies_df = pd.read_csv(MOVIES_CSV_PATH)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# (row count, column count) of the raw movies table.
movies_df.shape

(27278, 3)

In [3]:
# Per-column dtype and non-null count, plus the frame's memory usage.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [4]:
# Number of missing values in each column of the raw movies table.
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

### **Processing**

In [5]:
import pandas as pd


def remove_duplicates(movies_df: pd.DataFrame) -> pd.DataFrame:
    """Return the movies table with repeated ``movieId`` rows dropped.

    The first row seen for each ``movieId`` is kept; the input frame is not
    modified and the surviving rows keep their original index labels.
    """
    key_columns = ["movieId"]
    deduplicated = movies_df.drop_duplicates(subset=key_columns, keep="first")
    return deduplicated


def parse_genres(genres_str: str) -> list:
    """Convert a pipe-delimited genre string into a list of genre names.

    Args:
        genres_str: Genres joined by ``|`` (e.g. ``"Comedy|Romance"``).
            Non-string values such as ``None`` or ``NaN`` are accepted.

    Returns:
        The genre names with surrounding whitespace removed and empty
        tokens dropped. An empty list is returned for non-string input,
        for an empty string, and for the ``"(no genres listed)"``
        placeholder (matched case-insensitively, ignoring surrounding
        whitespace).
    """
    if not isinstance(genres_str, str):
        return []

    # Strip before comparing so a padded placeholder such as
    # " (no genres listed) " is not mistaken for a real genre.
    cleaned = genres_str.strip()
    if cleaned.lower() == "(no genres listed)":
        return []

    tokens = (token.strip() for token in cleaned.split("|"))
    return [token for token in tokens if token]


def extract_release_year(movies_df: pd.DataFrame) -> pd.DataFrame:
    """Extract the release year from each movie title.

    The year is taken from the LAST four-digit group in parentheses, so a
    numeric alternate title that precedes the year (for example
    ``"Remake (1933) (2005)"``) does not shadow it.

    Args:
        movies_df: Frame with a string ``title`` column.

    Returns:
        A copy of ``movies_df`` with an added nullable-integer (``Int64``)
        ``release_year`` column; titles without a parenthesised four-digit
        group get ``<NA>``. The input frame is not modified.
    """
    movies_df = movies_df.copy()
    # The negative lookahead rejects any "(dddd)" that is followed later in
    # the title by another "(dddd)", leaving only the last one to match.
    year_text = movies_df["title"].str.extract(
        r"\((\d{4})\)(?!.*\(\d{4}\))", expand=False
    )
    # Go through to_numeric so missing matches become NaN before the cast to
    # the nullable integer dtype.
    movies_df["release_year"] = pd.to_numeric(
        year_text, errors="coerce"
    ).astype("Int64")
    return movies_df


def clean_movie_titles(movies_df: pd.DataFrame) -> pd.DataFrame:
    """Remove the release-year token from each movie title.

    Only the LAST four-digit group in parentheses (and any whitespace
    directly before it) is removed, so an earlier numeric alternate title
    such as ``"(1984)"`` is preserved. Leading and trailing whitespace is
    stripped from the result.

    Args:
        movies_df: Frame with a string ``title`` column.

    Returns:
        A copy of ``movies_df`` with the cleaned ``title`` column. The input
        frame is not modified.
    """
    movies_df = movies_df.copy()
    # The negative lookahead limits the match to the final "(dddd)" group;
    # any earlier "(dddd)" has another one after it and is left in place.
    without_year = movies_df["title"].str.replace(
        r"\s*\(\d{4}\)(?!.*\(\d{4}\))", "", regex=True
    )
    movies_df["title"] = without_year.str.strip()
    return movies_df


def validate_movie_ids(
    ratings_df: pd.DataFrame,
    movies_df: pd.DataFrame
) -> None:
    """Check that every rated ``movieId`` is present in the movies table.

    Raises:
        ValueError: If at least one ``movieId`` in ``ratings_df`` has no
            matching row in ``movies_df``; the message reports how many
            distinct ids are unmatched.
    """
    rated_ids = set(ratings_df["movieId"])
    known_ids = set(movies_df["movieId"])
    unmatched = rated_ids.difference(known_ids)

    # Nothing unmatched: validation passes silently.
    if not unmatched:
        return

    raise ValueError(
        f"{len(unmatched)} movieIds in ratings not found in movies"
    )


def enrich_movies_with_ratings(
    ratings_df: pd.DataFrame,
    movies_df: pd.DataFrame
) -> pd.DataFrame:
    """Add rating statistics (mean and count) to movies.

    Args:
        ratings_df: Frame with ``movieId`` and ``rating`` columns.
        movies_df: Frame with a ``movieId`` column.

    Returns:
        A new frame containing every row of ``movies_df`` plus two columns:
        ``avg_rating`` (mean rating, ``NaN`` for movies with no ratings) and
        ``rating_count`` (integer number of ratings, ``0`` for movies with
        no ratings).
    """
    ratings_agg = (
        ratings_df
        .groupby("movieId")
        .agg(
            avg_rating=("rating", "mean"),
            rating_count=("rating", "count")
        )
        .reset_index()
    )

    # Left join keeps movies that have no ratings at all.
    enriched = movies_df.merge(
        ratings_agg,
        on="movieId",
        how="left"
    )

    # The left join leaves NaN (and therefore a float column) for unrated
    # movies; a count of "no ratings" is 0, and counts are integers.
    enriched["rating_count"] = (
        enriched["rating_count"].fillna(0).astype("int64")
    )
    return enriched

def run_preprocessing(movies_df: pd.DataFrame, ratings_df: pd.DataFrame) -> pd.DataFrame:
    """Run the full movie preprocessing pipeline.

    Steps, in order: drop duplicate ``movieId`` rows, parse ``genres`` into
    a ``genres_array`` list column, extract ``release_year`` from the title,
    remove the year from the title, check that every rated ``movieId``
    exists in the movies table, attach rating statistics, and drop the raw
    ``genres`` column.

    Args:
        movies_df: Raw movies frame with ``movieId``, ``title`` and
            ``genres`` columns.
        ratings_df: Ratings frame with ``movieId`` and ``rating`` columns.

    Returns:
        The processed movies frame. The input frames are not modified.

    Raises:
        ValueError: Propagated from ``validate_movie_ids`` when
            ``ratings_df`` references a ``movieId`` absent from the movies.
    """
    movies_df = remove_duplicates(movies_df)
    # Use assign() rather than item assignment: writing a column directly
    # into the filtered frame returned by drop_duplicates can trigger
    # SettingWithCopyWarning on pandas versions without Copy-on-Write.
    movies_df = movies_df.assign(
        genres_array=movies_df["genres"].apply(parse_genres)
    )
    movies_df = extract_release_year(movies_df)
    movies_df = clean_movie_titles(movies_df)
    # Fail before enrichment if ratings reference unknown movies.
    validate_movie_ids(ratings_df, movies_df)
    movies_df = enrich_movies_with_ratings(ratings_df, movies_df)
    # The raw string column is superseded by genres_array.
    movies_df = movies_df.drop(columns=["genres"])

    return movies_df

In [6]:
# Location of the raw ratings CSV (absolute path on the author's machine).
RATINGS_CSV_PATH = r"C:\Users\Shahe\movie-recommender\data\rating.csv"

# Load the ratings, run the preprocessing pipeline, and preview the result.
ratings_df = pd.read_csv(RATINGS_CSV_PATH)
movies_df_clean = run_preprocessing(movies_df=movies_df, ratings_df=ratings_df)
movies_df_clean.head()

Unnamed: 0,movieId,title,genres_array,release_year,avg_rating,rating_count
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,3.92124,49695.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,3.211977,22243.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,3.15104,12735.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,2.861393,2756.0
4,5,Father of the Bride Part II,[Comedy],1995,3.064592,12161.0


In [7]:
# Destination for the cleaned movies table (absolute path on the author's machine).
CLEANED_MOVIES_PATH = r"C:\Users\Shahe\movie-recommender\src\data\movies_cleaned.parquet"

# Write the cleaned table to Parquet; index=False leaves the row index out of the file.
movies_df_clean.to_parquet(CLEANED_MOVIES_PATH, index=False)