# MovieLens 20M

In [65]:
import pandas as pd

# Load the raw rating events and the movie metadata from the local data folder.
ratings = pd.read_csv("data/ratings.csv")
movies = pd.read_csv("data/movies.csv")

# Print (rows, columns) of each frame as a quick check that loading worked.
for frame_label, frame in (("ratings:", ratings), ("movies:", movies)):
    print(frame_label, frame.shape)

ratings: (20000263, 4)
movies: (27278, 3)


In [66]:
def filter_interactions(df, min_user_inter=20, min_item_inter=20):
    """Repeatedly drop sparse users and items until both thresholds hold.

    Removing items can push a user back under the user threshold (and the
    other way round), so the two filters are re-applied in turn until every
    remaining user and every remaining item meets its minimum.

    Args:
        df: DataFrame with at least the columns 'userId' and 'movieId'.
        min_user_inter: Minimum number of rows a user must have to be kept.
        min_item_inter: Minimum number of rows an item must have to be kept.

    Returns:
        The filtered DataFrame; surviving rows keep their original index
        labels and order.
    """
    converged = False
    while not converged:
        # Pass 1: keep only rows of users that are active enough.
        rows_per_user = df['userId'].value_counts()
        active_users = rows_per_user[rows_per_user >= min_user_inter].index
        df = df[df['userId'].isin(active_users)]

        # Pass 2: keep only rows of items that are popular enough.
        rows_per_item = df['movieId'].value_counts()
        popular_items = rows_per_item[rows_per_item >= min_item_inter].index
        df = df[df['movieId'].isin(popular_items)]

        # Stop once a full pass leaves both constraints satisfied.
        users_ok = (df['userId'].value_counts() >= min_user_inter).all()
        items_ok = (df['movieId'].value_counts() >= min_item_inter).all()
        converged = bool(users_ok and items_ok)
    return df


In [67]:
# Rating -> binary label: 1 when the rating is at least 4.0, otherwise 0
ratings['label'] = (ratings['rating'] >= 4.0).astype(int)
interactions = ratings[['userId', 'movieId', 'label']]

# Filtering according to the paper's criteria (minimum interactions per user/item).
# The filtered frame is a boolean-mask selection; taking an explicit copy makes
# it an independent frame, so the column assignments below are applied to
# `interactions` itself and do not raise SettingWithCopyWarning.
interactions = filter_interactions(interactions).copy()

# Indexing: map raw ids to contiguous 0-based ids in order of first appearance
user2id = {uid: idx for idx, uid in enumerate(interactions['userId'].unique())}
item2id = {iid: idx for idx, iid in enumerate(interactions['movieId'].unique())}

interactions['user'] = interactions['userId'].map(user2id)
interactions['item'] = interactions['movieId'].map(item2id)

# Check
print("Number of users:", len(user2id))
print("Number of items:", len(item2id))
print("Interactions:", len(interactions))
print("Min user interactions:", interactions['user'].value_counts().min())
print("Min item interactions:", interactions['item'].value_counts().min())


Number of users: 138408
Number of items: 13130
Interactions: 19931488
Min user interactions: 20
Min item interactions: 20


In [69]:
def build_kg(movies_df, item2id):
    """Build knowledge-graph triples (genre and release year) for kept movies.

    Every movie whose 'movieId' is a key of `item2id` contributes one
    `has_genre` triple per '|'-separated genre and, when the title ends in a
    parenthesised all-digit group, one `released_in` triple.

    Args:
        movies_df: DataFrame with the columns 'movieId', 'title' and 'genres'.
        item2id: Mapping from raw movie id to item index; movies that are not
            keys of this mapping are skipped.

    Returns:
        A tuple `(kg, entity2id, relation2id)` where
        - kg is a list of `(head, relation, tail)` integer triples; `head` is
          the item index from `item2id`, `tail` is an index from `entity2id`.
          Note that both index spaces start at 0 and are not offset against
          each other.
        - entity2id maps each genre / year string to its index, numbered in
          order of first appearance.
        - relation2id is `{"has_genre": 0, "released_in": 1}`.
    """
    kg = []
    entity2id = {}
    relation2id = {"has_genre": 0, "released_in": 1}

    for row in movies_df.itertuples(index=False):
        if row.movieId not in item2id:
            continue  # only items that survived filtering are used

        h = item2id[row.movieId]

        # Genre information; a missing (NaN/None) genres cell yields no triples.
        genres = row.genres.split('|') if isinstance(row.genres, str) else []
        for g in genres:
            kg.append((h, relation2id['has_genre'], _entity_index(entity2id, g)))

        # Release year taken from the title, when present.
        year = _extract_year(row.title)
        if year is not None:
            kg.append((h, relation2id['released_in'], _entity_index(entity2id, year)))

    return kg, entity2id, relation2id


def _entity_index(entity2id, key):
    """Return the index of `key`, assigning the next free index if it is new."""
    # len(entity2id) is the next unused index because indices are dense from 0.
    return entity2id.setdefault(key, len(entity2id))


def _extract_year(title):
    """Return the all-digit text of the last '(...)' group in `title`, else None.

    Non-string titles (e.g. NaN from a missing CSV cell) return None instead
    of raising.
    """
    if not isinstance(title, str) or '(' not in title or ')' not in title:
        return None
    candidate = title.split('(')[-1].split(')')[0]
    return candidate if candidate.isdigit() else None


In [70]:
# Build the knowledge graph from the movie metadata; item2id decides which
# movies are included.
kg_triples, entity2id, relation2id = build_kg(movies_df=movies, item2id=item2id)

# Report the size of the graph and the relation vocabulary.
for caption, value in (
    ("Number of KG triples:", len(kg_triples)),
    ("Number of entities:", len(entity2id)),
    ("Relations:", relation2id),
):
    print(caption, value)

Number of KG triples: 41327
Number of entities: 123
Relations: {'has_genre': 0, 'released_in': 1}


In [71]:
import os
import pickle

# Create the output folder (no error if it already exists)
os.makedirs("data/processed", exist_ok=True)

# Save the interactions, keeping only the re-indexed columns and the label
interactions[['user', 'item', 'label']].to_csv("data/processed/interactions.csv", index=False)

# Save the KG triples and every id mapping, one pickle file per object
pickle_targets = (
    ("data/processed/kg_triples.pkl", kg_triples),
    ("data/processed/user2id.pkl", user2id),
    ("data/processed/item2id.pkl", item2id),
    ("data/processed/entity2id.pkl", entity2id),
    ("data/processed/relation2id.pkl", relation2id),
)
for target_path, payload in pickle_targets:
    with open(target_path, "wb") as f:
        pickle.dump(payload, f)

print("저장 완료")

저장 완료
