In [1]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds
from sklearn.metrics import accuracy_score, roc_auc_score

### Data Load

In [3]:
train_data = pd.read_csv('/opt/ml/input/data/train_data.csv')
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

### 데이터 구성

- 데이터는 학습 데이터셋과 테스트 데이터셋으로 구분되어 있다.
- 각 데이터에는 userID, assessmentItemID, testId, answerCode, Timestamp, KnowledgeTag의 정보가 있다.
- 여기서 assessmentItemID는 문제의 고유 ID이며, answerCode는 사용자가 해당 문제의 정답을 맞췄는지 여부로, 맞췄으면 1, 틀렸으면 0으로 표기된다.
- 기본적인 협업 필터링 적용을 위해 본 실습에서는 userID, assessmentItemID, answerCode만을 사용한다.

In [4]:
userid, itemid = list(set(train_data.userID)), list(set(train_data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

print(f"Train dataset")
display(train_data.head(5))
print(f" Num. Users    : {n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}")
print(f" Num. Records  : {len(train_data)}")

userid, itemid = list(set(test_data.userID)), list(set(test_data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

print(f"Test dataset")
display(test_data.head(5))
print(f" Num. Users    : {n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}")
print(f" Num. Records  : {len(test_data)}")

Train dataset


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


 Num. Users    : 6698
 Max. UserID   : 7441
 Num. Items    : 9454
 Num. Records  : 2266586
Test dataset


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623


 Num. Users    : 744
 Max. UserID   : 7439
 Num. Items    : 9454
 Num. Records  : 260114


In [5]:
display(test_data.tail(3))

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244
260113,7439,A040130005,A040000130,-1,2020-10-14 23:10:03,8832


위와 같이 테스트 데이터셋에서는 answerCode가 -1인 경우가 나타난다. 이는 평가를 위한 것으로 해당 레코드는 제외하고 실습을 수행한다.

### Data Preprocessing

중복 레코드 제거
 - RS 모델에서는 시간에 따른 변화를 고려하지 않기 때문에 최종 성적만을 바탕으로 평가한다.
 - 사용자+문제항목을 Unique key로 하여 최종 레코드만을 보존하고 나머지 제거한다.

In [6]:
train_data.drop_duplicates(subset = ["userID", "assessmentItemID"],
                     keep = "last", inplace = True)
test_data.drop_duplicates(subset = ["userID", "assessmentItemID"],
                     keep = "last", inplace = True)

In [8]:
test_data[test_data.answerCode == -1]


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422
...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402


불필요한 column 제거
- 다음과 같이 pandas에서는 불필요한 column을 제거할 수 있다.

In [9]:
train_data.drop(['Timestamp','testId','KnowledgeTag'],
                axis=1, inplace=True, errors='ignore')
train_data.head(10)

Unnamed: 0,userID,assessmentItemID,answerCode
0,0,A060001001,1
1,0,A060001002,1
2,0,A060001003,1
3,0,A060001004,1
4,0,A060001005,1
5,0,A060001007,1
6,0,A060003001,0
7,0,A060003002,1
8,0,A060003003,1
9,0,A060003004,1


평가 항목 제거
- 테스트 데이터셋에서 answerCode가 -1인 항목은 최종 평가시 사용되는 항목으로 여기에선 사용할 수 없다.
- 아래 결과에서와 같이 User, Item 수는 변화 없이 총 레코드 수만 변한다.

In [10]:
test_data_old = test_data.copy()
n_user_old, n_item_old = n_user, n_item

test_data  = test_data[test_data.answerCode>=0].copy()

userid, itemid = list(set(test_data.userID)), list(set(test_data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

display(test_data.tail(5))
print(f" Num. Users    : {n_user}->{n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}->{n_item}")
print(f" Num. Records  : {len(test_data_old)}->{len(test_data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
260108,7439,A040197006,A040000197,1,2020-08-21 07:39:45,2132
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244


 Num. Users    : 744->744
 Max. UserID   : 7439
 Num. Items    : 9454->9454
 Num. Records  : 256073->255329


평가 항목 신규 생성
- 남은 테스트 항목 중, 각 사용자별 최종 레코드를 새로운 평가 항목으로 정한다.

In [11]:
eval_data = test_data.copy()
eval_data.drop_duplicates(subset = ["userID"],
                     keep = "last", inplace = True)
display(eval_data.head(5))
display(eval_data.tail(5))
print(f" Num. Records  : {len(eval_data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1034,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289
1705,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080
3022,13,A070111007,A070000111,1,2020-12-27 04:35:01,9660
4282,17,A090064005,A090000064,1,2020-10-30 05:47:22,2611
4669,26,A060135006,A060000135,0,2020-10-23 11:44:01,1422


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
260051,7395,A040122004,A040000122,0,2020-09-08 02:05:18,2102
260066,7404,A030111004,A030000111,1,2020-10-13 09:47:31,7636
260081,7416,A050193003,A050000193,0,2020-10-04 02:44:17,10402
260096,7417,A050193003,A050000193,0,2020-09-06 13:08:54,10402
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244


 Num. Records  : 744


평가 항목을 테스트 항목에서 제거한다.

In [12]:
test_data.drop(index=eval_data.index, inplace=True, errors='ignore')
display(test_data.tail(5))
print(f" Num. Records  : {len(test_data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
260107,7439,A040197005,A040000197,0,2020-08-21 07:39:40,2132
260108,7439,A040197006,A040000197,1,2020-08-21 07:39:45,2132
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244


 Num. Records  : 254585


사용자 - 문제항목 관계를 pivot 테이블로 변경
 - 각 사용자별로 해당 문제를 맞췄는지 여부를 matrix 형태로 변경
 - 해당 문제를 푼 적이 없는 경우 0.5(예시)으로 설정

In [13]:
matrix_train = train_data.pivot_table('answerCode', index='userID', columns='assessmentItemID')
matrix_train.fillna(0.5, inplace=True)
display(matrix_train.head(5))
print(f"Result Shape is {matrix_train.shape}")

assessmentItemID,A010001001,A010001002,A010001003,A010001004,A010001005,A010002001,A010002002,A010002003,A010002004,A010002005,...,A090073003,A090073004,A090073005,A090073006,A090074001,A090074002,A090074003,A090074004,A090074005,A090074006
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
6,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


Result Shape is (6698, 9454)


## 🎯 SVD 분석

### 데이터 인덱스 매핑 생성

사용자/문제항목 ID와 table상에서의 index를 매칭시키기 위한 lookup table을 dictionary 형태로 생성

In [14]:
user_id2idx = {v:i for i,v in enumerate(matrix_train.index)}
user_idx2id = {i:v for i,v in enumerate(matrix_train.index)}

item_id2idx = {v:i for i,v in enumerate(matrix_train.columns)}
item_idx2id = {i:v for i,v in enumerate(matrix_train.columns)}

### S

사용자 - 문제항목의 pivot table을 normalize된 matrix로 변경

$A = ${User - Item 간의 value를 저장하는 matrix}

$A = R^{n_{user} \times n_{item}}$

In [15]:
A = matrix_train.values
a_mean = np.mean(A, axis=1)
Am = A - a_mean.reshape(-1,1)
display(pd.DataFrame(Am, columns=matrix_train.columns).head())

assessmentItemID,A010001001,A010001002,A010001003,A010001004,A010001005,A010002001,A010002002,A010002003,A010002004,A010002005,...,A090073003,A090073004,A090073005,A090073006,A090074001,A090074002,A090074003,A090074004,A090074005,A090074006
0,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,...,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313,-0.010313
1,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,-0.033584,...,0.466416,0.466416,0.466416,0.466416,-0.533584,0.466416,0.466416,0.466416,0.466416,0.466416
2,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,...,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279,-0.003279
3,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,...,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074,-0.026074
4,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,...,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717,0.006717


위 matrix를 바탕으로 SVD 분석 수행

SVD
 - Target matrix $A$에 대해 $A = U \Sigma V$ 인 $U, \Sigma, V$ 를 구함
 - 여기서 $U, \Sigma, V$ 는 아래와 같음
   - $U = R^{ n_{user} \times n_{factor} }$
   - $\Sigma = R^{ n_{factor} \times n_{factor} }$ 인 대각행렬
   - $V = R^{ n_{factor} \times n_{item} }$

In [16]:
U, sigma, V = svds(Am, k=12)
print(f"U={U.shape}, sigma={sigma.shape}, V={V.shape}")
print(f"Singular Vlaues : {sigma}")

U=(6698, 12), sigma=(12,), V=(12, 9454)
Singular Vlaues : [ 55.84676263  58.08435018  58.50497257  61.71803668  69.91786616
  73.61657956  76.9598095   78.04681812  82.45156947  85.45001842
  91.51120585 101.4153135 ]


추론을 위해 predict matrix 복원
 - 처음 pivot table의 값을 SVD로 구한 matrix를 통해 복원했을 때, 두 행렬 사이의 오차 (restore error) 는 0에 가까울수록 SVD가 올바르게 구해짐

In [17]:
Sigma = np.diag(sigma)
A_pred = U @ Sigma @ V + a_mean.reshape(-1,1)
restore_error = np.sum(np.square(A_pred - A)) /A_pred.size
print(f"Restore Error : {restore_error}")

Restore Error : 0.007568071517679344


In [25]:
A_pred.shape

(6698, 9454)

## 🎯 예측 및 평가

### 학습 데이터 재현 평가

예측 함수 정의

In [21]:
def predict(userid, itemid):
    useridx = user_id2idx[userid]
    itemidx = item_id2idx[itemid]
    
    return A_pred[useridx, itemidx]

학습에 사용한 데이터를 얼마나 잘 예측하는지 평가

In [26]:
a_prob = [predict(u,i) for u,i in zip(train_data.userID, train_data.assessmentItemID)]
a_pred = [round(v) for v in a_prob] 
a_true = train_data.answerCode

print("Train data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")


Train data prediction
 - Accuracy = 75.92%
 - ROC-AUC  = 78.02%


In [37]:
def svd_apply(k_value):
    U, sigma, V = svds(Am, k=k_value)
    print(f"U={U.shape}, sigma={sigma.shape}, V={V.shape}")
    #print(f"Singular Vlaues : {sigma}")

    Sigma = np.diag(sigma)
    A_pred = U @ Sigma @ V + a_mean.reshape(-1,1)
    restore_error = np.sum(np.square(A_pred - A)) /A_pred.size
    print(f"Restore Error : {restore_error}")

    a_prob = [predict(u,i) for u,i in zip(train_data.userID, train_data.assessmentItemID)]
    a_pred = [round(v) for v in a_prob] 
    a_true = train_data.answerCode

    print("Train data prediction")
    print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.4f}%")
    print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.4f}%")


In [47]:
svd_apply(500)

U=(6698, 500), sigma=(500,), V=(500, 9454)
Restore Error : 0.00407137324373639
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [46]:
svd_apply(300)

U=(6698, 300), sigma=(300,), V=(300, 9454)
Restore Error : 0.00484295924655304
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [45]:
svd_apply(100)

U=(6698, 100), sigma=(100,), V=(100, 9454)
Restore Error : 0.005976612449808684
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [38]:
svd_apply(70)

U=(6698, 70), sigma=(70,), V=(70, 9454)
Restore Error : 0.006270176965598044
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [39]:
svd_apply(50)

U=(6698, 50), sigma=(50,), V=(50, 9454)
Restore Error : 0.006544577822317343
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [40]:
svd_apply(30)

U=(6698, 30), sigma=(30,), V=(30, 9454)
Restore Error : 0.006950405075311236
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [41]:
svd_apply(20)

U=(6698, 20), sigma=(20,), V=(20, 9454)
Restore Error : 0.007246007776531348
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [42]:
svd_apply(12)

U=(6698, 12), sigma=(12,), V=(12, 9454)
Restore Error : 0.007568071517679345
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [43]:
svd_apply(10)

U=(6698, 10), sigma=(10,), V=(10, 9454)
Restore Error : 0.00767060398938558
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


In [44]:
svd_apply(5)

U=(6698, 5), sigma=(5,), V=(5, 9454)
Restore Error : 0.008041128268997298
Train data prediction
 - Accuracy = 75.9245%
 - ROC-AUC  = 78.0223%


위 코드에서는 이미 학습된 사용자에 대해서만 추론값을 계산 가능하다.

- 테스트 데이터의 사용자는 학습 데이터셋에 존재하지 않는다.
- 따라서 해당 사용자의 값을 가져올 수 없기에 키 에러가 발생한다.

In [17]:
try:
    a_prob = [predict(u,i) for u,i in zip(test_data.userID, test_data.assessmentItemID)]
    a_pred = [round(v) for v in a_prob]
    a_true = test_data.answerCode

    print("Test data prediction")
    print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
    print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")
except:
    print("Error Occurs!!")

Error Occurs!!


### 테스트 데이터 재현 평가

학습되지 않은 사용자에 대해서도 문제를 푼 데이터가 존재할 경우 이를 바탕으로 추론 가능하다.

$B = $ {학습되지 않은 사용자에 대한 User - Item 간 value 행렬}

$ A = U \Sigma V$ 일때 factor matrx $A_{factor} = R^{n_{user} \times n_{factor}}$ 인 $A_{factor}$ 는
- $A_{factor} \Sigma V = A$
- $A_{factor} = A {(\Sigma V)}^+ = A V^T \Sigma^+  $ ($U, V$ 는 직교행렬)

$ B_{pred} \approx B_{factor} \Sigma V  = B V^T \Sigma^+ \Sigma V$

In [51]:
sigma

array([ 55.84676263,  58.08435018,  58.50497257,  61.71803668,
        69.91786616,  73.61657956,  76.9598095 ,  78.04681812,
        82.45156947,  85.45001842,  91.51120585, 101.4153135 ])

In [50]:
def predict2(matrix, userid, itemid, user_id2idx, item_id2idx):
    
    Sigma_i = np.diag(1/sigma)
    pred_matrix = V.T @ Sigma_i @ Sigma @ V
    
    B = matrix
    B_mean = np.mean(B, axis=1)
    Bm = B - B_mean.reshape(-1,1)

    B_pred =  B @ pred_matrix + B_mean.reshape(-1,1)

    ret = [B_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

학습 데이터 재현 성공률

In [19]:
a_prob = predict2(matrix_train.values, train_data.userID, train_data.assessmentItemID, user_id2idx, item_id2idx)
a_true = train_data.answerCode
a_pred = [round(v) for v in a_prob]

print("Train data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")

Train data prediction
 - Accuracy = 75.92%
 - ROC-AUC  = 78.02%


In [55]:
def predict3(V, sigma, matrix, userid, itemid, user_id2idx, item_id2idx):
    
    Sigma_i = np.diag(1/sigma)
    Sigma = np.diag(sigma)
    pred_matrix = V.T @ Sigma_i @ Sigma @ V
    
    B = matrix
    B_mean = np.mean(B, axis=1)
    Bm = B - B_mean.reshape(-1,1)

    B_pred =  B @ pred_matrix + B_mean.reshape(-1,1)

    ret = [B_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret
    
def train_svd_for_test(k_value):
    U, sigma, V = svds(Am, k=k_value)
    print(f"U={U.shape}, sigma={sigma.shape}, V={V.shape}")
    #print(f"Singular Vlaues : {sigma}")

    a_prob = predict3(V, sigma, matrix_train.values, train_data.userID, train_data.assessmentItemID, user_id2idx, item_id2idx)
    a_true = train_data.answerCode
    a_pred = [round(v) for v in a_prob]


    print("Train data prediction")
    print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
    print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")


In [56]:
train_svd_for_test(300)

U=(6698, 300), sigma=(300,), V=(300, 9454)
Train data prediction
 - Accuracy = 86.92%
 - ROC-AUC  = 93.31%


In [57]:
train_svd_for_test(12)

U=(6698, 12), sigma=(12,), V=(12, 9454)
Train data prediction
 - Accuracy = 75.92%
 - ROC-AUC  = 78.02%


In [58]:
train_svd_for_test(3)

U=(6698, 3), sigma=(3,), V=(3, 9454)
Train data prediction
 - Accuracy = 72.76%
 - ROC-AUC  = 73.50%


테스트 데이터 재현 성공률

In [64]:
# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

B = matrix_test
b_mean = np.mean(B, axis=1)
Bm = B - b_mean.reshape(-1,1)

# 성능 측정
U, sigma, V = svds(Bm, k=300)
a_prob = predict3(V, sigma, matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = test_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")

Test data prediction
 - Accuracy = 98.33%
 - ROC-AUC  = 99.86%


### 테스트 평가 데이터 재현 평가

테스트 데이터 기반 선별된 평가항목 추론

In [65]:
U, sigma, V = svds(Bm, k=300)
a_prob = predict3(V, sigma, matrix_test, eval_data.userID, eval_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = eval_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")

Test data prediction
 - Accuracy = 67.61%
 - ROC-AUC  = 73.25%


In [48]:
len(a_pred)

2220633

In [66]:
fin_test_data = test_data_old[test_data_old.answerCode == -1]
fin_test_data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422
...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402


In [67]:
eval_data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1034,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289
1705,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080
3022,13,A070111007,A070000111,1,2020-12-27 04:35:01,9660
4282,17,A090064005,A090000064,1,2020-10-30 05:47:22,2611
4669,26,A060135006,A060000135,0,2020-10-23 11:44:01,1422
...,...,...,...,...,...,...
260051,7395,A040122004,A040000122,0,2020-09-08 02:05:18,2102
260066,7404,A030111004,A030000111,1,2020-10-13 09:47:31,7636
260081,7416,A050193003,A050000193,0,2020-10-04 02:44:17,10402
260096,7417,A050193003,A050000193,0,2020-09-06 13:08:54,10402


In [68]:
a_prob = predict3(V, sigma, matrix_test, fin_test_data.userID, fin_test_data.assessmentItemID, user_id2idx_test, item_id2idx)
#a_true = eval_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
#print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
#print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")

Test data prediction


In [71]:
len(a_prob)


744

In [74]:
submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit

Unnamed: 0,id,prediction
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
739,739,0.5
740,740,0.5
741,741,0.5
742,742,0.5


In [75]:
submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit.prediction = a_prob
submit

Unnamed: 0,id,prediction
0,0,0.491703
1,1,0.514357
2,2,0.495379
3,3,0.522067
4,4,0.523577
...,...,...
739,739,0.501367
740,740,0.490454
741,741,0.515057
742,742,0.512841


In [76]:
submit.to_csv('mf.csv')

In [84]:
from sklearn.decomposition import NMF

def predict_nmf(comp_value, matrix, userid, itemid, user_id2idx, item_id2idx):
    X = matrix_train.values
    nmf = NMF(n_components=comp_value)
    nmf.fit(X)

    X = matrix
    X_pred = nmf.inverse_transform(nmf.transform(X))

    ret = [X_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [85]:
comp_value = 12

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

# 성능 측정
a_prob = predict_nmf(comp_value, matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = test_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")



Test data prediction
 - Accuracy = 75.62%
 - ROC-AUC  = 77.29%


In [86]:
comp_value = 3

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

# 성능 측정
a_prob = predict_nmf(comp_value, matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = test_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")



Test data prediction
 - Accuracy = 73.09%
 - ROC-AUC  = 74.48%


In [87]:
comp_value = 50

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

# 성능 측정
a_prob = predict_nmf(comp_value, matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = test_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")



Test data prediction
 - Accuracy = 77.78%
 - ROC-AUC  = 81.36%


In [88]:
comp_value = 100

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

# 성능 측정
a_prob = predict_nmf(comp_value, matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_true = test_data.answerCode
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")



Test data prediction
 - Accuracy = 78.82%
 - ROC-AUC  = 83.11%


In [91]:
comp_value = 100

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
userid = sorted(list(set([u for u in fin_test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(fin_test_data.userID, fin_test_data.assessmentItemID, fin_test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = 0.5

# 성능 측정
a_prob = predict_nmf(comp_value, matrix_test, fin_test_data.userID, fin_test_data.assessmentItemID, user_id2idx_test, item_id2idx)
a_pred = [round(v) for v in a_prob] 

print("Test data prediction")
#print(f" - Accuracy = {100*accuracy_score(a_true, a_pred):.2f}%")
#print(f" - ROC-AUC  = {100*roc_auc_score(a_true, a_prob):.2f}%")



Test data prediction


In [94]:
len(a_prob)

744

In [95]:
submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit.prediction = a_prob
submit

Unnamed: 0,id,prediction
0,0,0.499035
1,1,0.500940
2,2,0.501699
3,3,0.501253
4,4,0.500417
...,...,...
739,739,0.495091
740,740,0.501149
741,741,0.505622
742,742,0.505622


In [96]:
submit.to_csv('nmf.csv')

In [97]:
svd = pd.read_csv('mf.csv')
svd

Unnamed: 0.1,Unnamed: 0,id,prediction
0,0,0,0.491703
1,1,1,0.514357
2,2,2,0.495379
3,3,3,0.522067
4,4,4,0.523577
...,...,...,...
739,739,739,0.501367
740,740,740,0.490454
741,741,741,0.515057
742,742,742,0.512841
