# Data Pipeline & Feature Engineering — MovieLens 1M

This notebook builds the detection-oriented user feature representation used in the paper:
**"Unsupervised Anomaly Detection in Recommender Systems"**.

It performs:
- Loading MovieLens 1M raw `.dat` files
- Minimal filtering (users/items with < 20 interactions)
- User-level feature engineering (statistical, temporal, and item-popularity features)
- Export of the final feature matrix for downstream unsupervised detection models

---

## Expected Directory Structure

Place the official MovieLens 1M folder as:

- `data/ml-1m/ratings.dat`
- `data/ml-1m/users.dat`
- `data/ml-1m/movies.dat`

No dataset is redistributed in this repository. Download it from:
https://grouplens.org/datasets/movielens/1m/

---

## Outputs

This notebook generates the following artifact(s):

- `outputs/movielens1m_user_features.csv`

These features are consumed by:
- `02 notebook_detection_models.ipynb`
- `03 experimental_setup_evaluation_unsupervised.ipynb`
- `04 results_visuals_tables.ipynb`


In [1]:
import os
import numpy as np
import pandas as pd

ROOT = os.getcwd()                 
ML1M_DIR = os.path.join(ROOT, "ml-1m")

RATINGS_PATH = os.path.join(ML1M_DIR, "ratings.dat")
MOVIES_PATH  = os.path.join(ML1M_DIR, "movies.dat")
USERS_PATH   = os.path.join(ML1M_DIR, "users.dat")

pd.set_option("display.max_columns", 100)


## Load Data

In [2]:
def read_dat(path, cols):
    
    return pd.read_csv(
        path,
        sep=r"::",
        engine="python",
        names=cols,
        encoding="latin-1"
    )

ratings = read_dat(RATINGS_PATH, ["user_id", "movie_id", "rating", "timestamp"])
movies  = read_dat(MOVIES_PATH,  ["movie_id", "title", "genres"])
users   = read_dat(USERS_PATH,   ["user_id", "gender", "age", "occupation", "zip"])

ratings.head(), movies.head(), users.head()


(   user_id  movie_id  rating  timestamp
 0        1      1193       5  978300760
 1        1       661       3  978302109
 2        1       914       3  978301968
 3        1      3408       4  978300275
 4        1      2355       5  978824291,
    movie_id                               title                        genres
 0         1                    Toy Story (1995)   Animation|Children's|Comedy
 1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
 2         3             Grumpier Old Men (1995)                Comedy|Romance
 3         4            Waiting to Exhale (1995)                  Comedy|Drama
 4         5  Father of the Bride Part II (1995)                        Comedy,
    user_id gender  age  occupation    zip
 0        1      F    1          10  48067
 1        2      M   56          16  70072
 2        3      M   25          15  55117
 3        4      M   45           7  02460
 4        5      M   25          20  55455)

## Data cleaning

In [3]:
# Tipos
ratings["user_id"]  = ratings["user_id"].astype(int)
ratings["movie_id"] = ratings["movie_id"].astype(int)
ratings["rating"]   = ratings["rating"].astype(float)

# Timestamp to datetime
ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit="s", utc=True)

# Recommended order for temporal features
ratings = ratings.sort_values(["user_id", "ts"]).reset_index(drop=True)

print("ratings:", ratings.shape)
print("movies :", movies.shape)
print("users  :", users.shape)


ratings: (1000209, 5)
movies : (3883, 3)
users  : (6040, 5)


In [4]:
MIN_USER_RATINGS = 20
MIN_ITEM_RATINGS = 20

user_counts = ratings["user_id"].value_counts()
item_counts = ratings["movie_id"].value_counts()

keep_users = user_counts[user_counts >= MIN_USER_RATINGS].index
keep_items = item_counts[item_counts >= MIN_ITEM_RATINGS].index

ratings_f = ratings[ratings["user_id"].isin(keep_users) & ratings["movie_id"].isin(keep_items)].copy()
ratings_f = ratings_f.sort_values(["user_id", "ts"]).reset_index(drop=True)

print("Filtered ratings:", ratings_f.shape)


Filtered ratings: (995492, 5)


## Feature engineering

In [5]:
def rating_entropy(x: pd.Series) -> float:
    # Entropía sobre la distribución de ratings 1..5
    counts = x.value_counts().reindex([1,2,3,4,5], fill_value=0).values.astype(float)
    p = counts / counts.sum() if counts.sum() > 0 else counts
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum()) if len(p) else 0.0

def build_user_features(r: pd.DataFrame) -> pd.DataFrame:
    g = r.groupby("user_id")

    feats = pd.DataFrame({
        "num_ratings": g["rating"].size(),
        "mean_rating": g["rating"].mean(),
        "std_rating":  g["rating"].std(ddof=0).fillna(0.0),
        "min_rating":  g["rating"].min(),
        "max_rating":  g["rating"].max(),
        "entropy_rating": g["rating"].apply(rating_entropy),
        "ratio_1": g["rating"].apply(lambda s: (s==1).mean()),
        "ratio_5": g["rating"].apply(lambda s: (s==5).mean()),
    }).reset_index()

    feats["extreme_ratio"] = feats["ratio_1"] + feats["ratio_5"]
    feats["mean_abs_dev"]  = g["rating"].apply(lambda s: float(np.mean(np.abs(s - s.mean())))).values

    return feats

user_feats = build_user_features(ratings_f)
user_feats.head()


Unnamed: 0,user_id,num_ratings,mean_rating,std_rating,min_rating,max_rating,entropy_rating,ratio_1,ratio_5,extreme_ratio,mean_abs_dev
0,1,53,4.188679,0.674512,3.0,5.0,1.436588,0.0,0.339623,0.339623,0.551086
1,2,129,3.713178,0.997624,1.0,5.0,1.953184,0.015504,0.263566,0.27907,0.851752
2,3,51,3.901961,0.975281,1.0,5.0,1.883402,0.019608,0.294118,0.313725,0.730488
3,4,21,4.190476,1.051939,1.0,5.0,1.66759,0.047619,0.47619,0.52381,0.770975
4,5,196,3.147959,1.130986,1.0,5.0,2.16235,0.096939,0.107143,0.204082,0.918524


In [6]:
def build_temporal_features(r: pd.DataFrame) -> pd.DataFrame:
    r = r.sort_values(["user_id", "ts"]).copy()
    r["delta_s"] = r.groupby("user_id")["ts"].diff().dt.total_seconds()

    g = r.groupby("user_id")

    # Inter-arrival stats
    delta_mean = g["delta_s"].mean().fillna(0.0)
    delta_std  = g["delta_s"].std(ddof=0).fillna(0.0)

    # Temporal profile window
    first_ts = g["ts"].min()
    last_ts  = g["ts"].max()
    span_s   = (last_ts - first_ts).dt.total_seconds().fillna(0.0)

    feats = pd.DataFrame({
        "user_id": delta_mean.index.astype(int),
        "delta_mean_s": delta_mean.values,
        "delta_std_s":  delta_std.values,
        "profile_span_s": span_s.values,
    })

    # Approximate frequency (ratings per day)
    span_days = feats["profile_span_s"] / 86400.0
    feats["ratings_per_day"] = feats["profile_span_s"].where(span_days > 0, 0)
    feats.loc[span_days > 0, "ratings_per_day"] = (
        r.groupby("user_id")["rating"].size().reindex(feats["user_id"]).values / span_days[span_days > 0].values
    )

    # “Burst ratio”: percentage of ratings with delta < 10 minutes (600 s), indicating burst activity
    burst_ratio = g["delta_s"].apply(lambda s: float((s < 600).mean()) if s.notna().any() else 0.0)
    feats["burst_ratio_10min"] = burst_ratio.reindex(feats["user_id"]).values

    return feats

temp_feats = build_temporal_features(ratings_f)
temp_feats.head()


Unnamed: 0,user_id,delta_mean_s,delta_std_s,profile_span_s,ratings_per_day,burst_ratio_10min
0,1,10083.307692,71662.866914,524332.0,8.733398,0.943396
1,2,16.015625,20.706273,2050.0,5436.878049,0.992248
2,3,29.72,52.37711,1486.0,2965.275908,0.980392
3,4,17.9,42.399175,358.0,5068.156425,0.952381
4,5,28.271795,70.647648,5513.0,3071.721386,0.989796


In [7]:
def build_item_popularity_features(r: pd.DataFrame) -> pd.DataFrame:
    item_pop = r["movie_id"].value_counts().rename("item_popularity")

    r2 = r.merge(item_pop, left_on="movie_id", right_index=True, how="left")

    g = r2.groupby("user_id")["item_popularity"]
    feats = pd.DataFrame({
        "user_id": g.mean().index.astype(int),
        "mean_item_pop": g.mean().values,
        "std_item_pop":  g.std(ddof=0).fillna(0.0).values,
        "min_item_pop":  g.min().values,
        "max_item_pop":  g.max().values,
    }).reset_index()

    return feats

pop_feats = build_item_popularity_features(ratings_f)
pop_feats.head()


Unnamed: 0,index,user_id,mean_item_pop,std_item_pop,min_item_pop,max_item_pop
0,0,1,1135.830189,750.967118,73,2991
1,1,2,1012.790698,718.464056,47,3428
2,2,3,1376.784314,838.632625,92,3428
3,3,4,1708.809524,829.642127,450,2991
4,4,5,760.755102,671.144006,21,3428


## Final Feature Matrix

In [8]:
features = (
    user_feats
    .merge(temp_feats, on="user_id", how="left")
    .merge(pop_feats, on="user_id", how="left")
)

# Set NaNs to 0 (for safety)
features = features.fillna(0.0)

features.describe(include="all").T.head(20)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,6040.0,3020.5,1743.742,1.0,1510.75,3020.5,4530.25,6040.0
num_ratings,6040.0,164.8166,191.1177,16.0,44.0,95.0,206.0,2150.0
mean_rating,6040.0,3.704632,0.4288444,1.016667,3.448114,3.737448,4.0,4.962963
std_rating,6040.0,1.001632,0.2014292,0.128019,0.862644,0.989405,1.125364,1.860344
min_rating,6040.0,1.274503,0.5421272,1.0,1.0,1.0,1.0,4.0
max_rating,6040.0,4.995199,0.07812673,2.0,5.0,5.0,5.0,5.0
entropy_rating,6040.0,1.851679,0.2614875,0.122292,1.692623,1.880491,2.044558,2.318369
ratio_1,6040.0,0.04904325,0.06960368,0.0,0.005797,0.02726,0.06346917,0.9833333
ratio_5,6040.0,0.2709575,0.1512341,0.0,0.156055,0.252508,0.362069,0.962963
extreme_ratio,6040.0,0.3200008,0.1503168,0.0,0.208333,0.30303,0.4129197,1.0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = features.drop(columns=["user_id"]).copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)
X_train.shape, X_test.shape


((4832, 20), (1208, 20))

## Save outputs

In [10]:
OUT_PATH = os.path.join(ROOT, "movielens1m_user_features.csv")
features.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Saved: C:\Users\USUARIO\Desktop\app\movielens1m_user_features.csv
