# import

In [1]:
import numpy as np
import pandas as pd
from collections import Counter


In [14]:
import torch
import torch.nn as nn
from collections import Counter
import ast
from torch.utils.data import Dataset, DataLoader
import torch
from torch.utils.data import DataLoader, TensorDataset
import gc


In [3]:
device = torch.device('mps')

In [4]:
df = pd.read_csv('./clean_df.csv', index_col=0)

In [7]:
# 데이터 처리를 위한 코드 (이미 제공됨)
df['solvedProblemList'] = df['solvedProblemList'].apply(ast.literal_eval)
df['triedNotsolvedList'] = df['triedNotsolvedList'].apply(ast.literal_eval)

# 행렬 생성

In [8]:
# 문제 번호의 빈도를 계산하기 위한 Counter 객체 초기화
problem_counts = Counter()

# solvedProblemList 칼럼에서 문제 번호의 빈도를 계산
for problem_list in df['solvedProblemList']:
    problem_counts.update(problem_list)


# triedNotsolvedList 칼럼에서 문제 번호의 빈도를 계산
for problem_list in df['triedNotsolvedList']:
    problem_counts.update(problem_list)


# problem_counts에서 값이 5000 이상인 항목만 필터링
filtered_problems = {problem: count for problem, count in problem_counts.items() if count >= 3000}


In [10]:
# 필터링된 문제 번호 집합 생성
filtered_problems_set = set(filtered_problems.keys())

# solvedProblemList와 triedNotsolvedList에서 필터링된 문제 번호만 유지
df['solvedProblemList'] = df['solvedProblemList'].apply(lambda x: [problem for problem in x if problem in filtered_problems_set])
df['triedNotsolvedList'] = df['triedNotsolvedList'].apply(lambda x: [problem for problem in x if problem in filtered_problems_set])


# 모든 문제의 고유 목록 생성
all_problems = set()
for row in df.itertuples():
    all_problems.update(row.solvedProblemList)
    all_problems.update(row.triedNotsolvedList)
all_problems = sorted(all_problems)

# 이진 상호작용 행렬 생성
interaction_matrix = pd.DataFrame(0, index=df['id'], columns=all_problems)


In [11]:
for row in df.itertuples():
    interaction_matrix.loc[row.id, row.solvedProblemList] = 1

# 모델 학습

In [12]:
# 데이터프레임을 PyTorch 텐서로 변환
train_tensor = torch.tensor(interaction_matrix.values).float().to(device)

In [13]:
train_tensor.shape[1]

983

In [15]:

gc.collect()
torch.mps.empty_cache()

In [16]:
class EASE(nn.Module):
    def __init__(self, num_items, reg_lambda=500):
        super(EASE, self).__init__()
        self.reg_lambda = reg_lambda
        self.num_items = num_items
        self.item_weights = nn.Parameter(torch.zeros(num_items, num_items))
        self.device = torch.device('mps')

    def forward(self, x):
        # x: 사용자-아이템 상호작용 행렬
        return torch.matmul(x, self.item_weights)
    def clear_memory(self):
        gc.collect()
        torch.cuda.empty_cache()
    
    def fit(self, data_loader):
        self.item_weights.data.zero_()  # 가중치 초기화

        for batch in data_loader:
            # 데이터 텐서 추출
            data_tensor = batch[0].to(self.device)

            # 배치에 대한 Gram Matrix 계산
            G_batch = torch.matmul(data_tensor.T, data_tensor) + self.reg_lambda * torch.eye(self.num_items, device=self.device)

            # Batch Gram Matrix의 역행렬 계산
            P_batch = torch.inverse(G_batch)

            # Batch Item Weights 계산
            B_batch = P_batch / -torch.diag(P_batch)
            B_batch.fill_diagonal_(0)

            # 각 배치의 결과를 누적
            self.item_weights.data += B_batch

        # 평균 가중치 계산
        self.item_weights.data /= len(data_loader)



In [18]:

# TensorDataset 및 DataLoader 생성
dataset = TensorDataset(train_tensor)
data_loader = DataLoader(dataset, batch_size=5000, shuffle=True)


In [19]:
model = EASE(num_items=train_tensor.shape[1], reg_lambda=100).to(device)

model.fit(data_loader)

In [26]:
interaction_matrix

Unnamed: 0_level_0,1000,1001,1002,10026,1003,10039,1004,1005,1008,1009,...,9625,9653,9654,9655,9656,9657,9663,9934,9935,9996
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
jh5154,1,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
baefr,1,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
nacean,1,1,0,1,1,1,0,0,1,1,...,1,0,1,0,0,0,1,0,0,0
klxex,0,0,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
qortndud97,1,1,0,0,0,1,0,1,1,1,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
kjsmel,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
limy0213,1,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
palzmnbvcx,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
planet4869,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 모델 추론

In [25]:
solved_problems = df[df['id'] == 'jh5154']['solvedProblemList'].to_list()[0]

In [29]:
# 문제 번호와 칼럼 인덱스의 매핑 생성
problem_to_index = {problem: idx for idx, problem in enumerate(interaction_matrix.columns)}


In [31]:
# 사용자가 풀었던 문제 번호를 인덱스로 변환
solved_problems_indices = [problem_to_index[problem] for problem in solved_problems if problem in problem_to_index]


In [33]:
# 전체 문제 수
num_problems = train_tensor.shape[1]

# 사용자 상호작용 텐서 생성
user_interaction_tensor = torch.zeros(num_problems)
user_interaction_tensor[solved_problems_indices] = 1

In [34]:
# 텐서를 모델이 있는 디바이스로 이동
user_interaction_tensor = user_interaction_tensor.to(device)

In [60]:
# 모델을 사용하여 추천 점수 계산
predicted_scores = model(user_interaction_tensor)

# 이미 풀었던 문제의 점수를 제외
predicted_scores[solved_problems_indices] = -np.inf

# 추천 점수 기반으로 추천 문제 선택 (예: 상위 N개 문제)
recommended_problems = torch.topk(predicted_scores, k=10).indices


In [61]:
recommended_problems

tensor([161, 674, 168, 319, 664,  99, 531, 217, 840, 812], device='mps:0')

In [62]:
# 칼럼 인덱스에서 문제 번호로의 역매핑 생성 -> 위에있는 값이 value라서 key로 찾을 수 없음 그래서 역매핑해야됨
index_to_problem = {idx: problem for problem, idx in problem_to_index.items()}


In [63]:
# 추천된 인덱스를 문제 번호로 변환
recommended_problem_numbers = [index_to_problem[idx.item()] for idx in recommended_problems]


In [64]:
recommended_problem_numbers

['11279',
 '2455',
 '11399',
 '14499',
 '2443',
 '10833',
 '1916',
 '1181',
 '3085',
 '2914']

# 문제에서 확인해보기

In [45]:
problem = pd.read_excel('./data/problemList.xlsx')

In [65]:
recommended_problem_numbers =list(map(int, recommended_problem_numbers))

In [66]:
problem[problem['problemId'].isin(recommended_problem_numbers)]

Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
181,1181,단어 정렬,52580,6,"['sorting', 'string']","[97, 158]"
909,1916,최소비용 구하기,18154,11,"['dijkstra', 'graphs', 'shortest_path']","[22, 7, 215]"
1447,2443,별 찍기 - 6,24267,3,['implementation'],[102]
1459,2455,지능형 기차,18316,3,"['arithmetic', 'implementation', 'math']","[121, 102, 124]"
1915,2914,저작권,17612,3,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
2081,3085,사탕 게임,10439,9,"['bruteforcing', 'implementation']","[125, 102]"
9581,10833,사과,8828,3,"['arithmetic', 'math']","[121, 124]"
10017,11279,최대 힙,26938,9,"['data_structures', 'priority_queue']","[175, 59]"
13099,14499,주사위 굴리기,15783,12,"['implementation', 'simulation']","[102, 141]"
