In [26]:
from sklearn.metrics import jaccard_score

import pandas as pd
import ast

In [2]:
df = pd.read_excel('./data/problemList.xlsx')

In [27]:
# 문자열을 리스트로 변환하는 함수 정의
def string_to_list(s):
    try:
        return ast.literal_eval(s)
    except ValueError:
        return []  # 또는 적절한 기본값 반환
    

df['key'] = df['key'].apply(string_to_list)


In [44]:
df[df['problemId'] == 1000]

Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
0,1000,A+B,272273,1,"[implementation, arithmetic, math]","[102, 121, 124]"


In [47]:
li = [19604, 1252, 4107, 24294, 6840]

In [49]:
df[df['problemId'].isin(li)]


Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
252,1252,이진수 덧셈,3593,4,"[arithmetic, implementation, math]","[121, 102, 124]"
3068,4107,Knitting,86,4,"[arithmetic, implementation, math, simulation]","[121, 102, 124, 141]"
3072,4107,Knitting,86,4,"[arithmetic, implementation, math, simulation]","[121, 102, 124, 141]"
4954,6840,Who is in the middle?,2650,1,[implementation],[102]
5803,6840,Who is in the middle?,2650,1,[implementation],[102]
17778,19604,Art,150,4,"[arithmetic, implementation, math]","[121, 102, 124]"
18316,19604,Art,150,4,"[arithmetic, implementation, math]","[121, 102, 124]"
22772,24294,ГРАДИНА,619,2,"[arithmetic, math]","[121, 124]"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29166 entries, 0 to 29165
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   problemId          29166 non-null  int64 
 1   title              29158 non-null  object
 2   acceptedUserCount  29166 non-null  int64 
 3   level              29166 non-null  int64 
 4   key                29166 non-null  object
 5   bojTagId           29166 non-null  object
dtypes: int64(3), object(3)
memory usage: 1.3+ MB


In [6]:

def recommend_problems(problem_id, df):
    # 입력된 문제의 정보를 가져오기
    problem_info = df[df['problemId'] == problem_id].iloc[0]

    # 1단계: 같은 레벨의 문제를 필터링
    level_filtered = df[df['level'] == problem_info['level']]

    # 2단계: bojTagId가 유사한 문제를 찾기
    # 여기서는 간단히 bojTagId가 같은 문제를 필터링
    tag_filtered = level_filtered[level_filtered['bojTagId'] == problem_info['bojTagId']]

    # 3단계: acceptedUserCount가 높은 순으로 정렬
    recommended = tag_filtered.sort_values(by='acceptedUserCount', ascending=False)

    return recommended



In [7]:

# 예시로 문제 번호 1에 대한 추천을 받아봅니다.
recommend_problems(1000, df)

Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
0,1000,A+B,272273,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
1,1001,A-B,232650,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9744,10998,A×B,191317,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9616,10869,사칙연산,188856,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
8,1008,A/B,188802,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9197,10430,나머지,165021,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
1757,2753,윤년,149772,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9696,10950,A+B - 3,138421,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9698,10952,A+B - 5,120613,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"
9767,11021,A+B - 7,115445,1,"['implementation', 'arithmetic', 'math']","[102, 121, 124]"


In [41]:
df[df['problemId'] == 25238]

Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
23690,25238,가희와 방어율 무시,3847,2,"[arithmetic, math]","[121, 124]"


In [6]:
test=df[df['level'] == 1]

In [35]:
test_2 = df[df['level'] == 2]

In [36]:
test_2

Unnamed: 0,problemId,title,acceptedUserCount,level,key,bojTagId
264,1264,모음의 개수,10038,2,"[implementation, string]","[102, 158]"
1072,2083,럭비 클럽,5285,2,[implementation],[102]
1443,2439,별 찍기 - 2,142675,2,[implementation],[102]
1444,2440,별 찍기 - 3,56262,2,[implementation],[102]
1484,2480,주사위 세개,78903,2,"[arithmetic, case_work, implementation, math]","[121, 137, 102, 124]"
...,...,...,...,...,...,...
28929,30793,gahui and sousenkyo 3,240,2,"[arithmetic, implementation, math]","[121, 102, 124]"
28930,30794,가희와 클럽 오디션 1,290,2,"[arithmetic, implementation, math, string]","[121, 102, 124, 158]"
29003,30868,개표,586,2,[implementation],[102]
29011,30876,Tren del Fin del Mundo,394,2,[math],[124]


In [32]:
exploded_df = test_2.explode('key')


In [33]:
exploded_df.value_counts()

key
implementation    156
math              154
arithmetic        132
string             38
geometry           12
case_work           5
simulation          4
ad_hoc              2
number_theory       2
pythagoras          2
combinatorics       2
bruteforcing        2
sorting             1
parsing             1
precomputation      1
Name: count, dtype: int64

In [13]:
test.acceptedUserCount.describe()

count       141.000000
mean      37639.276596
std       63422.085355
min         412.000000
25%        1858.000000
50%        5461.000000
75%       39316.000000
max      302639.000000
Name: acceptedUserCount, dtype: float64

In [19]:


def calculate_jaccard_similarity(tag1, tag2):
    intersection = len(set(tag1).intersection(tag2))
    union = len(set(tag1).union(tag2))
    return intersection / union if union != 0 else 0

def recommend_problems(problem_id, df):
    problem_info = df[df['problemId'] == problem_id].iloc[0]

    level_filtered = df[df['level'] == problem_info['level']] # 레벨 앞뒤로 한개씩

    level_filtered['similarity'] = level_filtered['bojTagId'].apply(
        lambda x: calculate_jaccard_similarity(problem_info['bojTagId'], x))
    similar_tags = level_filtered[level_filtered['similarity'] > 0]  # 유사도가 0보다 큰 문제들을 선택

    recommended = similar_tags.sort_values(by=['similarity', 'acceptedUserCount'], ascending=[False, False])

    return recommended[['problemId', 'title', 'level', 'bojTagId', 'similarity', 'acceptedUserCount']]

In [37]:
check = recommend_problems(25238, df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  level_filtered['similarity'] = level_filtered['bojTagId'].apply(


In [40]:
check.sample(10)

Unnamed: 0,problemId,title,level,bojTagId,similarity,acceptedUserCount
6509,7595,Triangles,2,[102],0.5,1437
9518,10768,특별한 날,2,[102],0.5,6333
9995,11257,IT Passport Examination,2,"[121, 102, 124]",0.875,821
17871,19698,헛간 청약,2,"[121, 124]",1.0,3028
25104,26731,Zagubiona litera,2,"[102, 158]",0.6,405
24209,25784,Easy-to-Solve Expressions,2,"[121, 102, 124]",0.875,633
24283,25858,Divide the Cash,2,"[121, 124]",1.0,522
1443,2439,별 찍기 - 2,2,[102],0.5,142675
24753,26340,Fold the Paper Nicely,2,"[121, 102, 124]",0.875,329
24129,25704,출석 이벤트,2,"[121, 102, 124]",0.875,1344
