In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
os.chdir("C:/Users/dlgus/Desktop/PSAT/KartRider")

In [None]:
newdata = pd.read_csv('user_to_map.csv', encoding = "cp949")

In [None]:
newdata.shape

In [None]:
newdata

Unnamed: 0,accountNo,공동묘지 유령의 계곡 R,공동묘지 해골 손가락 R,광산 보석채굴장 R,광산 뽀글뽀글 용암굴 R,노르테유 부스터존 점령작전 R,노르테유 붕붕 점프 R,노르테유 우주 정거장 R,노르테유 우주선 침투 R,노르테유 위험한 지름길 R,...,해적 폭풍 속의 질주,해적 해적선 보물탈취,황금문명 CSO 아즈텍,황금문명 고디우스의 분노,황금문명 비밀장치의 위협,황금문명 신비의 황금 콘도르,황금문명 오르에트 황금 좌표,황금문명 전설의 황금용광로,황금문명 황금신전을 찾아서,황금문명 흔들리는 위협
0,1006645812,0,0,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1006663416,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1006666250,0,0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1006675051,0,0,1,0,0,0,0,0,0,...,0,0,1,2,0,1,0,0,0,1
4,1006675682,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31885,990700055,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31886,990700279,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,3,0,0
31887,990701335,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
31888,990701775,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
data = newdata.drop('accountNo', axis = 1)
for i in range(0, len(data.columns)):
    data.iloc[:,i] = pd.to_numeric(data.iloc[:,i], errors='ignore')

In [None]:
data.dtypes

공동묘지 유령의 계곡 R       int64
공동묘지 해골 손가락 R       int64
광산 보석채굴장 R          int64
광산 뽀글뽀글 용암굴 R       int64
노르테유 부스터존 점령작전 R    int64
                    ...  
황금문명 신비의 황금 콘도르     int64
황금문명 오르에트 황금 좌표     int64
황금문명 전설의 황금용광로      int64
황금문명 황금신전을 찾아서      int64
황금문명 흔들리는 위협        int64
Length: 285, dtype: object

data = newdata.drop('accountNo', axis = 1)

In [None]:
# M명의 사용자가 N개 맵에 대해 평가한 데이터 포함한 2차원 행렬

R = np.array(data, dtype = float)

In [None]:
R = np.nan_to_num(R)

In [None]:
data.iloc[4].unique()

array([0, 1, 5, 3, 2, 4, 6], dtype=int64)

In [None]:
num_users, num_items = R.shape
k = 3

In [None]:
np.random.seed(6012)

In [None]:
# P : 사용자 잠재요인 행렬 (각 사용자의 특성을 나타냄)
# Q : 아이템 잠재요인 행렬 (각 아이템의 특성을 나타냄)

P = np.random.normal(scale = 1./k, size = (num_users, k))
Q = np.random.normal(scale = 1./k, size = (num_items, k))

In [None]:
P = np.nan_to_num(P)
Q = np.nan_to_num(Q)

In [None]:
non_zeros = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j]>0]

In [None]:
non_zeros

[(0, 2, 2.0),
 (0, 18, 1.0),
 (0, 26, 1.0),
 (0, 31, 1.0),
 (0, 57, 1.0),
 (0, 66, 1.0),
 (0, 71, 1.0),
 (0, 92, 4.0),
 (0, 99, 1.0),
 (0, 113, 1.0),
 (0, 121, 2.0),
 (0, 141, 1.0),
 (0, 147, 1.0),
 (0, 153, 3.0),
 (0, 164, 2.0),
 (0, 171, 1.0),
 (0, 174, 3.0),
 (0, 183, 1.0),
 (0, 187, 2.0),
 (0, 188, 1.0),
 (0, 208, 1.0),
 (0, 224, 1.0),
 (0, 226, 1.0),
 (0, 239, 1.0),
 (0, 241, 1.0),
 (0, 242, 1.0),
 (0, 244, 1.0),
 (0, 251, 1.0),
 (0, 266, 1.0),
 (0, 275, 1.0),
 (1, 49, 1.0),
 (1, 50, 1.0),
 (1, 54, 2.0),
 (1, 89, 1.0),
 (1, 120, 1.0),
 (1, 135, 1.0),
 (1, 143, 1.0),
 (1, 144, 2.0),
 (1, 146, 1.0),
 (1, 156, 1.0),
 (1, 157, 21.0),
 (1, 158, 1.0),
 (1, 162, 1.0),
 (1, 175, 1.0),
 (1, 219, 1.0),
 (1, 227, 1.0),
 (1, 238, 1.0),
 (1, 247, 1.0),
 (1, 269, 1.0),
 (1, 281, 1.0),
 (2, 2, 2.0),
 (2, 3, 2.0),
 (2, 26, 1.0),
 (2, 29, 1.0),
 (2, 60, 1.0),
 (2, 71, 2.0),
 (2, 72, 2.0),
 (2, 76, 1.0),
 (2, 92, 1.0),
 (2, 102, 1.0),
 (2, 121, 3.0),
 (2, 125, 1.0),
 (2, 129, 1.0),
 (2, 139, 1.0),


In [None]:
np.isnan(non_zeros).any()

False

In [None]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    full_pred_matrix = np.dot(P, Q.T)
    
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]

    # 원래 방식
    
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    
    # 내가 새로 짜는 코드 방식
    
    mse = np.subtract(R_non_zeros, full_pred_matrix_non_zeros)
    mse2 = np.multiply(mse, mse)
    lengtharray = np.repeat(len(mse), 1)
    mse3 = np.divide(mse2, lengtharray)
    
    
    rmse = np.sqrt(mse3)
    
    return rmse

In [None]:
steps = 1000
learning_rate = 0.01
r_lambda = 0.01

for step in range(steps):
    for i, j, r in non_zeros:
        
        eij = r - np.dot(P[i,:], Q[j,:].T)
        
        P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda * P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda * Q[j,:])
        
    rmse = get_rmse(R, P, Q, non_zeros)
    if (step % 50) == 0:
        print("###iteration steps: ", step, "rmse: ", rmse)

###iteration steps:  0 rmse:  [nan nan nan ... nan nan nan]


In [None]:
pred_matrix = np.dot(P, Q.T)
print("예측 행렬:\n", np.round(pred_matrix, 3))

In [None]:
# train, test 분리

from sklearn.utils import shuffle
TRAIN_SIZE = 0.8
cutoff = int(TRAIN_SIZE * len(newdata))
newdata_train = newdata.iloc[:cutoff]
newdata_test = newdata.iloc[cutoff:]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, np.ravel(data.target), test_size = 0.3, random_state = 2728)

In [None]:
clf = SGDClassfier(loss = 'hinge', penalty = 'elasticnet', fit_intercept = True, random_state = 6012, shuffle = False, verbose = 0)

In [None]:
# Randomized Search CV

param_dist = {'learning_rate' = [0.001, 0.01, 0.1],
             'alpha' : [0.0001, 0.001, 0.01, 0.1]}

In [None]:
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = n_iter_search)
random_search.fit(X,y)

In [None]:
start = time()
print("RandomizedSearchCV took %.2f sconds for %d candidates"
     " parameter settings." % ((time() - start), n_iter_search))

In [None]:
# Grid Search CV

param_grid = {'learning_rate' = [0.001, 0.01, 0.1],
             'alpha' : [0.0001, 0.001, 0.01, 0.1]}

In [None]:
P = np.nan_to_num(P)
Q = np.nan_to_num(Q)