## **Tags Data Ingestion**

### **Exploring Data**

In [16]:
import pandas as pd
# Location of the raw tags export on the local machine.
tag_csv_path = r"C:\Users\Shahe\movie-recommender\data\tag.csv"

# Load the raw tags and preview the first rows as the cell output.
tag_df = pd.read_csv(tag_csv_path)
tag_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [17]:
# Dimensions of the raw tags frame as a (rows, columns) tuple.
tag_df.shape

(465564, 4)

In [18]:
# Per-column dtypes and non-null counts; a non-null count below the row total
# means that column has missing values.
tag_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     465564 non-null  int64 
 1   movieId    465564 non-null  int64 
 2   tag        465548 non-null  object
 3   timestamp  465564 non-null  object
dtypes: int64(2), object(2)
memory usage: 14.2+ MB


In [19]:
# Count of missing values in each column of the raw tags frame.
tag_df.isna().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

### **Preprocessing Data**

In [20]:
import re

def run_preprocessing(tag_df: pd.DataFrame, min_freq: int = 5) -> pd.DataFrame:
    """
    Clean and normalize movie tags.

    Steps:
    - drop rows whose tag is missing
    - lowercase
    - trim spaces
    - replace whitespace runs with _
    - remove punctuation (anything that is not a letter, digit or underscore)
    - collapse repeated underscores and strip leading/trailing underscores
    - drop tags that end up empty
    - drop very rare tags (frequency < min_freq)

    Args:
        tag_df: Frame with a "tag" column. It is not modified.
        min_freq: Minimum number of rows a cleaned tag must appear in to be kept.

    Returns:
        A new frame holding the surviving rows, with all original columns plus
        a "tag_clean" column, re-indexed from 0.

    Raises:
        KeyError: If ``tag_df`` has no "tag" column.
    """
    # Work on a copy of the non-null rows so the caller's frame stays untouched.
    df = tag_df.dropna(subset=["tag"]).copy()

    df["tag_clean"] = (
        df["tag"]
        # Non-string values (e.g. a year parsed as a number) would turn into NaN
        # under the .str accessor and silently vanish in the frequency filter.
        .astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        # \w already covers the underscore, so only punctuation is removed here.
        .str.replace(r"[^\w]", "", regex=True)
        # Removing punctuation that sat between spaces leaves "__" behind
        # ("dark - hero" -> "dark__hero"); collapse it so the tag merges with
        # "dark_hero" instead of being counted as a separate tag.
        .str.replace(r"_+", "_", regex=True)
        # Punctuation at the edges leaves stray underscores ("- x" -> "_x"), and
        # punctuation-only tags such as "- -" reduce to "_"; stripping turns the
        # latter into "" so the empty-tag filter below catches it.
        .str.strip("_")
    )

    # Drop tags that contained nothing but whitespace/punctuation.
    df = df[df["tag_clean"] != ""]

    # Keep only tags that occur at least min_freq times after normalization.
    tag_counts = df["tag_clean"].value_counts()
    valid_tags = tag_counts[tag_counts >= min_freq].index
    df = df[df["tag_clean"].isin(valid_tags)]

    return df.reset_index(drop=True)



In [21]:
# Normalize the raw tags, keeping only tags that occur at least 10 times.
tags_clean = run_preprocessing(tag_df, min_freq=10)

# Persist the cleaned tags for downstream steps.
tags_clean.to_parquet(
    r"C:\Users\Shahe\movie-recommender\src\data\tags_clean.parquet",
    index=False
)

# Preview goes last: a notebook cell only renders its final expression, so a
# head() call placed before the write is evaluated and then discarded.
tags_clean.head()
