Same as User Based Matrix

In [3]:
import pandas as pd
# reading csv file created and exception handling
try:
    df = pd.read_csv('final_user_data.csv')
    # print dataset
    print("========================== Dataset ========================== ")
    print(df)
except FileNotFoundError as e:
    print("Error: Cannot find the File(CSV), Check the Path again")
    exit(1)

          userId                    gameName isPlay  playTime
0      151603712  The Elder Scrolls V Skyrim   play     273.0
1      151603712                   Fallout 4   play      87.0
2      151603712                       Spore   play      14.9
3      151603712           Fallout New Vegas   play      12.1
4      151603712               Left 4 Dead 2   play       8.9
...          ...                         ...    ...       ...
69546  128470551                Fallen Earth   play       2.4
69547  128470551                 Magic Duels   play       2.2
69548  128470551                 Titan Souls   play       1.5
69549  128470551  Grand Theft Auto Vice City   play       1.5
69550  128470551                        RUSH   play       1.4

[69551 rows x 4 columns]


In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume df is your original DataFrame

# pivot_table 생성 (fill_value를 사용하지 않고 NaN 유지)
pivot_table = df.pivot_table(index='userId', columns='gameName', values='playTime', aggfunc='sum', fill_value=0)

# 각 userId의 row 별로 Min-Max Scaling 수행
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pivot_table.T).T

# 스케일링된 값을 다시 pivot_table에 적용
pivot_table.iloc[:, :] = scaled_values

# Categorical encoding (0 to 5)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]  # Define your bins as needed
labels = [0, 1, 2, 3, 4]  # Corresponding labels for each bin

# Mapping scaled values to categorical labels
categorical_values = pd.cut(pivot_table.values.flatten(), bins=bins, labels=labels, include_lowest=True)

# Reshape back to the original shape
pivot_table = pd.DataFrame(categorical_values.reshape(pivot_table.shape), index=pivot_table.index, columns=pivot_table.columns)

# Convert values to int
pivot_table = pivot_table.astype(int)

# 결과 출력
print(pivot_table)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

In [5]:
# userId 기준으로 각 row의 모든 값이 0인 행 삭제
pivot_table = pivot_table[(pivot_table.T != 0).any()]

# 결과 출력
print(pivot_table)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

For Item Based Change the position(userId, gameName)

In [6]:
# pivot_table을 Transpose하여 userId와 gameName의 위치를 바꿈
pivot_table_item_based = pivot_table.T

# 결과 출력
print(pivot_table_item_based)


userId                                              5250       76767       
gameName                                                                   
007 Legends                                                 0          0  \
0RBITALIS                                                   0          0   
1... 2... 3... KICK IT! (Drop That Beat Like an...          0          0   
10 Second Ninja                                             0          0   
10,000,000                                                  0          0   
...                                                       ...        ...   
rymdkapsel                                                  0          0   
sZone-Online                                                0          0   
the static speaks my name                                   0          0   
theHunter                                                   0          0   
theHunter Primal                                            0          0   

userId     

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# ITEM 간의 코사인 유사도 계산
item_similarity = cosine_similarity(pivot_table_item_based)

# 유사도 행렬을 DataFrame으로 변환
item_similarity = pd.DataFrame(item_similarity, index=pivot_table_item_based.index, columns=pivot_table_item_based.index)

# 결과 출력
print(item_similarity)


gameName                                            007 Legends  0RBITALIS   
gameName                                                                     
007 Legends                                                 0.0        0.0  \
0RBITALIS                                                   0.0        0.0   
1... 2... 3... KICK IT! (Drop That Beat Like an...          0.0        0.0   
10 Second Ninja                                             0.0        0.0   
10,000,000                                                  0.0        0.0   
...                                                         ...        ...   
rymdkapsel                                                  0.0        0.0   
sZone-Online                                                0.0        0.0   
the static speaks my name                                   0.0        0.0   
theHunter                                                   0.0        0.0   
theHunter Primal                                            0.0 

In [8]:
def find_n_neighbors(similarity_matrix, n=5):
    """
    Parameters:
    - similarity_matrix: 사용자 간의 유사도 행렬
    - n: 반환할 이웃의 수

    Returns:
    - top_n_neighbors: 각 사용자에 대해 가장 유사한 상위 n 이웃
    """
    top_n_neighbors = pd.DataFrame(index=similarity_matrix.index, columns=range(1, n + 1))

    for user in similarity_matrix.index:
        # 각 사용자에 대해 가장 유사한 상위 n 이웃 선택
        top_n_neighbors.loc[user, :] = similarity_matrix.loc[user].sort_values(ascending=False).iloc[1:n + 1].index

    return top_n_neighbors

# 예를 들어, top 30 이웃을 찾는 경우
top_30_neighbors = find_n_neighbors(item_similarity, n=30)

# 결과 출력
print(top_30_neighbors.head())


                                                              1    
gameName                                                           
007 Legends                                         Risk of Rain  \
0RBITALIS                                           Risk of Rain   
1... 2... 3... KICK IT! (Drop That Beat Like an...  Risk of Rain   
10 Second Ninja                                     Risk of Rain   
10,000,000                                          Risk of Rain   

                                                                2    
gameName                                                             
007 Legends                                         Rayman Origins  \
0RBITALIS                                           Rayman Origins   
1... 2... 3... KICK IT! (Drop That Beat Like an...  Rayman Origins   
10 Second Ninja                                     Rayman Origins   
10,000,000                                          Rayman Origins   

                                

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume df is your original DataFrame

# pivot_table 생성 (fill_value를 사용하지 않고 NaN 유지)
pivot_table = df.pivot_table(index='userId', columns='gameName', values='playTime', aggfunc='sum', fill_value=0)

# 각 userId의 row 별로 Min-Max Scaling 수행
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pivot_table.T).T

# 스케일링된 값을 다시 pivot_table에 적용
pivot_table.iloc[:, :] = scaled_values

# pivot_table을 Transpose하여 userId와 gameName의 위치를 바꿈
pivot_table_item_based1 = pivot_table.T

# Categorical encoding (0 to 4)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]  
labels = [0, 1, 2, 3, 4]  # Corresponding labels for each bin

# Mapping scaled values to categorical labels
categorical_values = pd.cut(pivot_table_item_based1.values.flatten(), bins=bins, labels=labels, include_lowest=True)

# Reshape back to the original shape
pivot_table_item_based1 = pd.DataFrame(categorical_values.reshape(pivot_table_item_based1.shape), index=pivot_table_item_based1.index, columns=pivot_table_item_based1.columns)

# Convert values to int
pivot_table_item_based1 = pivot_table_item_based1.astype(int)


print(pivot_table_item_based1)


userId                                              5250       76767       
gameName                                                                   
007 Legends                                                 0          0  \
0RBITALIS                                                   0          0   
1... 2... 3... KICK IT! (Drop That Beat Like an...          0          0   
10 Second Ninja                                             0          0   
10,000,000                                                  0          0   
...                                                       ...        ...   
rymdkapsel                                                  0          0   
sZone-Online                                                0          0   
the static speaks my name                                   0          0   
theHunter                                                   0          0   
theHunter Primal                                            0          0   

userId     

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume df is your original DataFrame

# pivot_table 생성 (fill_value를 사용하지 않고 NaN 유지)
pivot_table = df.pivot_table(index='userId', columns='gameName', values='playTime', aggfunc='sum', fill_value=0)

# 각 userId의 row 별로 Min-Max Scaling 수행
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pivot_table.T).T

# 스케일링된 값을 다시 pivot_table에 적용
pivot_table.iloc[:, :] = scaled_values

# Categorical encoding (0 to 5)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]  # Define your bins as needed
labels = [0, 1, 2, 3, 4]  # Corresponding labels for each bin

# Mapping scaled values to categorical labels
categorical_values = pd.cut(pivot_table.values.flatten(), bins=bins, labels=labels, include_lowest=True)

# Reshape back to the original shape
pivot_table = pd.DataFrame(categorical_values.reshape(pivot_table.shape), index=pivot_table.index, columns=pivot_table.columns)

pivot_table = pivot_table.astype(int)


# 결과 출력
print(pivot_table)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

AVERAGED WEIGHTED 

In [11]:
import pandas as pd
import numpy as np

# Assuming 'item_similarity' is a DataFrame containing item-item similarity scores
# Assuming 'pivot_table' is a DataFrame containing user-item ratings
target_user_id = 5250

# Step 1: Extract games rated by the target user
rated_games = pivot_table.columns[pivot_table.loc[target_user_id] != 0]

# Convert rated_games to a list
rated_games_list = list(rated_games)

# Step 2: Extract games with zero ratings for the target user
unrated_games = pivot_table.columns[pivot_table.loc[target_user_id] == 0]

# Step 3: Predict ratings for unrated games using average weighted similarity
predicted_ratings = pd.Series(index=unrated_games)
for game in unrated_games:
    # 현재 미평가 게임에 대한 가중 평균 계산
    relevant_ratings = pivot_table.loc[pivot_table[game] != 0, rated_games_list]
    relevant_weights = item_similarity.loc[game, rated_games_list]

    # 0이 아닌 가중치만 사용하여 가중 평균 계산
    non_zero_weights = relevant_weights[relevant_weights != 0]
    if not non_zero_weights.empty and len(non_zero_weights) == len(relevant_ratings.columns):
        weighted_means = np.average(relevant_ratings, weights=non_zero_weights, axis=1)
        
        # 가중 평균의 평균값 사용
        if not np.isnan(weighted_means).all():
            predicted_ratings.loc[game] = np.mean(weighted_means)

# Step 4: Extract top 3 recommendations
top_n_recommendations = predicted_ratings.nlargest(5)

print(f"Top 5 Recommendations for UserId {target_user_id}:")
print(top_n_recommendations)


Top 5 Recommendations for UserId 5250:
gameName
Adventurer Manager                                    1.000000
Spacebase DF-9                                        1.000000
Vampire The Masquerade - Bloodlines                   0.819872
Banished                                              0.572400
DYNASTY WARRIORS 8 Xtreme Legends Complete Edition    0.500000
dtype: float64


EVALUATION: MSE, MAE

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 'pivot_table'은 사용자-아이템 평점을 포함하는 DataFrame이라고 가정합니다.
# 'item_similarity'은 아이템-아이템 유사도 점수를 포함하는 DataFrame이라고 가정합니다.

# 대상 사용자 ID를 설정합니다.
target_user_id = 86540

# Step 1: 예측 평점을 저장할 빈 DataFrame을 만듭니다.
predicted_ratings = pd.DataFrame(index=pivot_table.index, columns=pivot_table.columns)

# Step 2: 아이템 기반 협업 필터링 코드를 사용하여 예측 평점을 채웁니다.
rated_games = pivot_table.columns[pivot_table.loc[target_user_id] != 0]

for game in rated_games:
    unrated_games = pivot_table.columns[pivot_table.loc[target_user_id] == 0]
    relevant_ratings = pivot_table.loc[pivot_table[game] != 0, rated_games]
    relevant_weights = item_similarity.loc[game, rated_games]

    non_zero_weights = relevant_weights[relevant_weights != 0]
    if not non_zero_weights.empty and len(non_zero_weights) == len(relevant_ratings.columns):
        # 0이 아닌 가중치가 있는 경우에만 가중 평균을 계산합니다.
        weighted_means = np.average(relevant_ratings, weights=non_zero_weights, axis=1)

        if not np.isnan(weighted_means).all():
            # 가중 평균을 예측 평점으로 사용합니다.
            predicted_ratings.loc[target_user_id, game] = np.mean(weighted_means)

# Step 3: MSE 및 MAE 계산을 위해 데이터프레임을 펼칩니다.
actual_ratings_flat = pivot_table.loc[target_user_id, rated_games].values

predicted_ratings_flat = predicted_ratings.loc[target_user_id, rated_games].values

predicted_ratings_series = pd.Series(predicted_ratings_flat, index=rated_games)
actual_ratings_series = pd.Series(actual_ratings_flat, index=rated_games)


# Step 4: MSE 및 MAE 계산
mse = mean_squared_error(actual_ratings_flat, predicted_ratings_flat)
mae = mean_absolute_error(actual_ratings_flat, predicted_ratings_flat)

print("===== PREDICTED RATINGS(PLAYTIME) =====")
print(predicted_ratings_series)

print("\n===== ACTUAL RATINGS(PLAYTIME) =====")
print(actual_ratings_series)

print("\n===== PERFORMANCE SCORE =====")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")



===== PREDICTED RATINGS(PLAYTIME) =====
gameName
Audiosurf                     1.955702
The Elder Scrolls V Skyrim    2.686008
XCOM Enemy Unknown            2.334799
dtype: object

===== ACTUAL RATINGS(PLAYTIME) =====
gameName
Audiosurf                     2
The Elder Scrolls V Skyrim    4
XCOM Enemy Unknown            1
dtype: int64

===== PERFORMANCE SCORE =====
Mean Squared Error (MSE): 1.1700753962375936
Mean Absolute Error (MAE): 0.8976962709765098


In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 'pivot_table'은 사용자-아이템 평점을 포함하는 DataFrame이라고 가정합니다.
# 'item_similarity'은 아이템-아이템 유사도 점수를 포함하는 DataFrame이라고 가정합니다.

# 전체 사용자를 포함한 인덱스 리스트
all_users = list(pivot_table.index)

# k-fold 교차 검증을 위해 데이터를 섞음
random.shuffle(all_users)

# k-fold 교차 검증 설정 (여기서는 k=5로 설정)
kfold = KFold(n_splits=5)

total_mse = 0
total_mae = 0

for train_indices, test_indices in kfold.split(all_users):
    train_users = [all_users[i] for i in train_indices]
    test_users = [all_users[i] for i in test_indices]

    for target_user_id in test_users:
        # Step 1: 예측 평점을 저장할 빈 DataFrame을 만듭니다.
        predicted_ratings = pd.DataFrame(index=pivot_table.index, columns=pivot_table.columns)

        # Step 2: 아이템 기반 협업 필터링 코드를 사용하여 예측 평점을 채웁니다.
        rated_games = pivot_table.columns[pivot_table.loc[target_user_id] != 0]

        for game in rated_games:
            unrated_games = pivot_table.columns[pivot_table.loc[target_user_id] == 0]
            relevant_ratings = pivot_table.loc[pivot_table[game] != 0, rated_games]
            relevant_weights = item_similarity.loc[game, rated_games]

            non_zero_weights = relevant_weights[relevant_weights != 0]
            if not non_zero_weights.empty and len(non_zero_weights) == len(relevant_ratings.columns):
                # 0이 아닌 가중치가 있는 경우에만 가중 평균을 계산합니다.
                weighted_means = np.average(relevant_ratings, weights=non_zero_weights, axis=1)

                if not np.isnan(weighted_means).all():
                    # 가중 평균을 예측 평점으로 사용합니다.
                    predicted_ratings.loc[target_user_id, game] = np.mean(weighted_means)

        # Step 3: MSE 및 MAE 계산을 위해 데이터프레임을 펼칩니다.
        actual_ratings_flat = pivot_table.loc[target_user_id, rated_games].values
        predicted_ratings_flat = predicted_ratings.loc[target_user_id, rated_games].values

        # Step 4: MSE 및 MAE 계산
        mse = mean_squared_error(actual_ratings_flat, predicted_ratings_flat)
        mae = mean_absolute_error(actual_ratings_flat, predicted_ratings_flat)

        total_mse += mse
        total_mae += mae

# 전체 교차 검증 세트에 대한 평균 MSE와 MAE 계산
average_mse = total_mse / kfold.get_n_splits()
average_mae = total_mae / kfold.get_n_splits()

print("===== AVERAGE PERFORMANCE SCORE =====")
print(f"Average Mean Squared Error (MSE): {average_mse}")
print(f"Average Mean Absolute Error (MAE): {average_mae}")
