In [2]:
# convert_movielens100k.py
import pandas as pd

# -------------------------
# Input locations (MovieLens 100K)
# -------------------------
# u.data is tab-separated: userId, movieId, rating, timestamp
ratings_file = "../data/ml-100k/u.data"
# u.item is pipe-separated: movieId, title, release_date, ..., then 19 binary genre flags
movies_file = "../data/ml-100k/u.item"

# -------------------------
# Ratings table
# -------------------------
ratings_cols = ["userId", "movieId", "rating", "timestamp"]
# The timestamp is read (the file has no header, so all four names are
# needed to label the columns) and then dropped because nothing below uses it.
ratings = pd.read_csv(
    ratings_file,
    sep="\t",
    names=ratings_cols,
    encoding="latin-1",
).drop(columns=["timestamp"])

# -------------------------
# Movie metadata table
# -------------------------
# Five descriptive columns followed by the 19 genre indicator columns.
movie_cols = [
    "movieId", "title", "release_date", "video_release_date", "IMDb_URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies = pd.read_csv(
    movies_file,
    sep="|",
    names=movie_cols,
    encoding="latin-1",
)

# -------------------------
# Map genres to simplified categories
# -------------------------
def map_genre_to_category(row):
    """Map a movie's genre flags to a simplified content category.

    Genre flags are looked up by column label rather than by position, so
    the result does not depend on how many metadata columns precede the
    genre columns or on the order in which the columns appear.

    Tiers are checked in order and the first match wins:

    * Comedy, Animation, Children's, Musical -> "neutral"
    * Action, Adventure, Thriller, Crime     -> "mildly_political"
    * War, Film-Noir, Horror                 -> "extreme"

    Because "neutral" is checked first, a film flagged as both Comedy and
    War is "neutral". Films matching none of the listed genres (for example
    only Drama or Documentary) also fall back to "neutral".

    Args:
        row: A mapping-like row (e.g. the ``pandas.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) whose genre entries equal 1
            when the genre applies. Genre labels absent from the row are
            treated as not set.

    Returns:
        One of "neutral", "mildly_political" or "extreme".
    """
    # Ordered (category, genres) tiers; the order encodes the priority.
    tiers = (
        ("neutral", ("Comedy", "Animation", "Children's", "Musical")),
        ("mildly_political", ("Action", "Adventure", "Thriller", "Crime")),
        ("extreme", ("War", "Film-Noir", "Horror")),
    )
    for category, genre_names in tiers:
        # .get() with a 0 default tolerates rows that lack some genre columns.
        if any(row.get(name, 0) == 1 for name in genre_names):
            return category
    # No listed genre is set: default bucket.
    return "neutral"

# Derive the simplified category for every movie, one row at a time.
movies["category"] = movies.apply(map_genre_to_category, axis=1)

# Only the identifier, title and derived category are carried forward.
movies_clean = movies.loc[:, ["movieId", "title", "category"]]

# -------------------------
# Attach movie info to each rating (joined on movieId)
# -------------------------
merged = pd.merge(ratings, movies_clean, on="movieId")

# -------------------------
# Write the demo CSV (path is relative to the current working directory)
# -------------------------
merged.to_csv("movielens_100k_categories.csv", index=False)
print("Saved converted CSV: movielens_100k_categories.csv")


Saved converted CSV: movielens_100k_categories.csv
