Load preprocessed USER Dataset 

In [32]:
import pandas as pd
# reading csv file created and exception handling
try:
    df = pd.read_csv('final_user_data.csv')
    # print dataset
    print("========================== Dataset ========================== ")
    print(df)

    user_row = df[df['userId'] == 309824202]
    
    # Print the rows for user ID 309554670
    print("========================== User ID 309554670 ========================== ")
    print(user_row)

except FileNotFoundError as e:
    print("Error: Cannot find the File(CSV), Check the Path again")
    exit(1)

          userId                    gameName isPlay  playTime
0      151603712  The Elder Scrolls V Skyrim   play     273.0
1      151603712                   Fallout 4   play      87.0
2      151603712                       Spore   play      14.9
3      151603712           Fallout New Vegas   play      12.1
4      151603712               Left 4 Dead 2   play       8.9
...          ...                         ...    ...       ...
69546  128470551                Fallen Earth   play       2.4
69547  128470551                 Magic Duels   play       2.2
69548  128470551                 Titan Souls   play       1.5
69549  128470551  Grand Theft Auto Vice City   play       1.5
69550  128470551                        RUSH   play       1.4

[69551 rows x 4 columns]
         userId gameName isPlay  playTime
3588  309824202   Dota 2   play       0.7


행렬 변환 
1. User based filtering을 위해 행렬변환
2. Row(USER) 별로 MINMAX Scaling => 유저마다 플레이하는 시간의 기준이 다르기때문
3. 행렬 출력

In [33]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume df is your original DataFrame

# pivot_table 생성 (fill_value를 사용하지 않고 NaN 유지)
pivot_table = df.pivot_table(index='userId', columns='gameName', values='playTime', aggfunc='sum', fill_value=0)

# 각 userId의 row 별로 Min-Max Scaling 수행
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pivot_table.T).T

# 스케일링된 값을 다시 pivot_table에 적용
pivot_table.iloc[:, :] = scaled_values

# Categorical encoding (0 to 5)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]  # Define your bins as needed
labels = [0, 1, 2, 3, 4]  # Corresponding labels for each bin

# Mapping scaled values to categorical labels
categorical_values = pd.cut(pivot_table.values.flatten(), bins=bins, labels=labels, include_lowest=True)

# Reshape back to the original shape
pivot_table = pd.DataFrame(categorical_values.reshape(pivot_table.shape), index=pivot_table.index, columns=pivot_table.columns)

pivot_table = pivot_table.astype(int)


# 결과 출력
print(pivot_table)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

In [34]:
# userId 기준으로 각 row의 모든 값이 0인 행 삭제
pivot_table = pivot_table[(pivot_table.T != 0).any()]

# 결과 출력
print(pivot_table)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

User Similarity Matrix 생성

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# pivot_table에서 NaN 값을 0으로 채우기 (코사인 유사도 계산을 위해)
pivot_table_filled = pivot_table.fillna(0)

# 사용자 간의 코사인 유사도 계산
user_similarity = cosine_similarity(pivot_table_filled)

# 유사도 행렬을 DataFrame으로 변환
user_similarity_df = pd.DataFrame(user_similarity, index=pivot_table.index, columns=pivot_table.index)

# 결과 출력
print(user_similarity_df)


userId     5250       76767      86540      144736     181212     229911      
userId                                                                        
5250             1.0   0.000000        0.0   0.000000   0.000000   0.000000  \
76767            0.0   1.000000        0.0   0.696311   0.675521   0.335013   
86540            0.0   0.000000        1.0   0.000000   0.000000   0.000000   
144736           0.0   0.696311        0.0   1.000000   0.970143   0.192450   
181212           0.0   0.675521        0.0   0.970143   1.000000   0.186704   
...              ...        ...        ...        ...        ...        ...   
309434439        0.0   0.000000        0.0   0.000000   0.000000   0.000000   
309554670        0.0   0.000000        0.0   0.000000   0.000000   0.000000   
309626088        0.0   0.000000        0.0   0.000000   0.000000   0.000000   
309824202        0.0   0.000000        0.0   0.000000   0.000000   0.000000   
309903146        0.0   0.000000        0.0   0.00000

User Similarity Matrix Top 30 생성
(Computation 감소)

In [36]:
def find_n_neighbors(similarity_matrix, n=5):
    """
    Parameters:
    - similarity_matrix: 사용자 간의 유사도 행렬
    - n: 반환할 이웃의 수

    Returns:
    - top_n_neighbors: 각 사용자에 대해 가장 유사한 상위 n 이웃
    """
    top_n_neighbors = pd.DataFrame(index=similarity_matrix.index, columns=range(1, n + 1))

    for user in similarity_matrix.index:
        # 각 사용자에 대해 가장 유사한 상위 n 이웃 선택
        top_n_neighbors.loc[user, :] = similarity_matrix.loc[user].sort_values(ascending=False).iloc[1:n + 1].index

    return top_n_neighbors

# 예를 들어, top 30 이웃을 찾는 경우
top_30_neighbors = find_n_neighbors(user_similarity_df, n=30)

# 결과 출력
print(top_30_neighbors.head())


               1          2          3          4          5          6    
userId                                                                     
5250    261857176  257528104  275437638  263936784  224844255  135012938  \
76767    77542968   71809133   83955783   47164966    9544834   27579751   
86540   116325720   94858657   53245953  153841049   60760816   97571329   
144736   62923086   40576653   40529387   12563913   33587126   12660489   
181212   38885272   49629862   71809133   28041586   28040133   13227098   

               7          8          9          10  ...         21         22   
userId                                              ...                         
5250    142475478  298446224  298516674  224751217  ...  106746754   33121288  \
76767     9740704    9759887   40529387   62923086  ...   25452454   25411448   
86540    94239910   94231294   94186745  297821985  ...   93326304  259798464   
144736   27689253  200759485   13227098   13227113  ...   1470

추천시스템 입력값으로 사용할 예정: n user에게 추천하고 싶으면 여기서 n userId row 가져와서 사용

In [37]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume df is your original DataFrame

# pivot_table 생성 (fill_value를 사용하지 않고 NaN 유지)
pivot_table1 = df.pivot_table(index='userId', columns='gameName', values='playTime', aggfunc='sum', fill_value=0)

# 각 userId의 row 별로 Min-Max Scaling 수행
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pivot_table1.T).T

# 스케일링된 값을 다시 pivot_table에 적용
pivot_table1.iloc[:, :] = scaled_values

# Categorical encoding (0 to 5)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]  # Define your bins as needed
labels = [0, 1, 2, 3, 4]  # Corresponding labels for each bin

# Mapping scaled values to categorical labels
categorical_values = pd.cut(pivot_table1.values.flatten(), bins=bins, labels=labels, include_lowest=True)

# Reshape back to the original shape
pivot_table1 = pd.DataFrame(categorical_values.reshape(pivot_table.shape), index=pivot_table.index, columns=pivot_table.columns)

pivot_table1 = pivot_table1.astype(int)

# 결과 출력
print(pivot_table1)


gameName   007 Legends  0RBITALIS   
userId                              
5250                 0          0  \
76767                0          0   
86540                0          0   
144736               0          0   
181212               0          0   
...                ...        ...   
309434439            0          0   
309554670            0          0   
309626088            0          0   
309824202            0          0   
309903146            0          0   

gameName   1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)   
userId                                                                  
5250                                                       0           \
76767                                                      0            
86540                                                      0            
144736                                                     0            
181212                                                     0            
...    

User based 추천시스템 최종

Reccomendation Based on Majority Vote of neighbors(not preferred)

In [38]:
def recommend_games(user_id, top_30_neighbors, pivot_table, df1):
    """
    Parameters:
    - user_id: 추천을 받을 사용자의 ID
    - top_n_neighbors: 각 사용자에 대한 가장 유사한 이웃 정보가 있는 DataFrame
    - pivot_table: 사용자별 게임 플레이 시간이 있는 DataFrame
    - df1: 원본데이터(사용자가 플레이하지 않은 게임은 0), 추천할 사용자 데이터는 여기서

    Returns:
    - game_counts: 추천된 게임 목록, 횟수
    """

    # 주어진 사용자의 가장 유사한 이웃 찾기
    user_neighbors = top_30_neighbors.loc[user_id, :]
 
    # 사용자가 이미 플레이한 게임 목록
    user_played_games = df1.loc[user_id, :].index[df1.loc[user_id, :] > 0]

    # 추천된 게임 목록 초기화
    recommended_games = []
    game_counts = {}
    
    for neighbor_id in user_neighbors:
        max_playtime_game = None
        max_playtime = 0
        # 이웃이 플레이한 게임 목록
        neighbor_played_games = pivot_table.loc[neighbor_id, :].index[pivot_table.loc[neighbor_id, :] > 0]
        # 이웃이 플레이한 게임 중 사용자가 아직 플레이하지 않은 게임 찾기
        new_games = set(neighbor_played_games) - set(user_played_games)

        # 이웃이 가장 많이 플레이한 게임 찾기
        for game in new_games:
            playtime = pivot_table.loc[neighbor_id, game]
            if playtime > max_playtime:
                max_playtime_game = game
                max_playtime = playtime
        # 가장 많이 플레이한 게임을 추천 목록에 추가
        if max_playtime_game:
            recommended_games.append(max_playtime_game)
            # 중복된 게임 카운트
            if max_playtime_game in game_counts:
                game_counts[max_playtime_game] += 1
            else:
                game_counts[max_playtime_game] = 1
    
    return recommended_games, game_counts

# userId가 5250 사용자에게 추천을
user_id_to_recommend = 5250
recommended_games, game_counts = recommend_games(user_id_to_recommend, top_30_neighbors, pivot_table, pivot_table1)

# 결과 출력
print(f"UserId {user_id_to_recommend}에게 추천된 게임 목록: {game_counts}")


UserId 5250에게 추천된 게임 목록: {"Sid Meier's Civilization V": 1, 'Team Fortress 2': 1, 'Half-Life 2': 2, 'Never Alone (Kisima Ingitchuna)': 1, 'NBA 2K16': 1, "Garry's Mod": 1, 'Prison Architect': 1, 'AdVenture Capitalist': 1, 'ARK Survival Evolved': 1, 'Two Worlds II': 1, 'Total War ROME II - Emperor Edition': 1, 'Euro Truck Simulator 2': 1, 'Starbound': 1}


Averaged Weight Predicted Rate & Reccomendations(preferred)

In [46]:
import pandas as pd
import numpy as np

# Assuming 'user_similarity_df' is a DataFrame containing similarity scores between users
# For example, user_similarity_df = calculate_similarity_function()

# 특정 사용자
target_user_id = 5250

# 특정 사용자에 대한 상위 30명 이웃 선택
neighbors = top_30_neighbors.loc[target_user_id]

# 상위 30명 이웃의 게임 평점 평균 계산
neighbor_ratings = pivot_table1[pivot_table1.index.isin(neighbors.values)]

# Weighted mean calculation using similarity scores as weights
weights = user_similarity_df.loc[target_user_id, neighbors]
weighted_mean = np.average(neighbor_ratings, axis=0, weights=weights)

# 특정 사용자가 아직 평가하지 않은 게임 필터링
unrated_games = pivot_table1.columns[pivot_table1.loc[target_user_id] == 0].tolist()

# 예측 평점 계산
predicted_ratings = pd.Series(data=weighted_mean, index=neighbor_ratings.columns)[unrated_games]

# 예측 평점이 가장 높은 상위 N개의 게임 추출 (여기서는 5개로 가정)
top_n_recommendations = predicted_ratings.nlargest(5)

print(f"UserId {target_user_id}에게 추천된 게임 목록:")
print(top_n_recommendations)


UserId 5250에게 추천된 게임 목록:
gameName
Half-Life 2                            0.359929
Starbound                              0.305276
Prison Architect                       0.269947
Total War ROME II - Emperor Edition    0.269947
Terraria                               0.185423
dtype: float64


Evaluation: MSE, MAE for Single User

In [47]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 특정 사용자에 대한 상위 30명 이웃 선택
neighbors = top_30_neighbors.loc[target_user_id]

# 상위 30명 이웃의 게임 평점 평균 계산
neighbor_ratings = pivot_table1[pivot_table1.index.isin(neighbors.values)]
neighbor_ratings_mean = neighbor_ratings.mean()

# 특정 사용자가 평가한 게임에 대한 예측 평점 계산
predicted_ratings = neighbor_ratings_mean[pivot_table1.loc[target_user_id] != 0]
print()
print("===== PREDICTED RATINGS(PLAYTIME) =====")
print(predicted_ratings)
# 실제 평점 가져오기
actual_ratings = pivot_table1.loc[target_user_id][pivot_table1.loc[target_user_id] != 0]
print()
print("===== ACTUAL RATINGS(PLAYTIME) =====")
print(actual_ratings)
# MSE와 MAE 계산
mse = mean_squared_error(actual_ratings, predicted_ratings)
mae = mean_absolute_error(actual_ratings, predicted_ratings)

print()
print("===== PERFORMANCE SCORE =====")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")



===== PREDICTED RATINGS(PLAYTIME) =====
gameName
Cities Skylines             3.133333
Deus Ex Human Revolution    0.533333
dtype: float64

===== ACTUAL RATINGS(PLAYTIME) =====
gameName
Cities Skylines             4
Deus Ex Human Revolution    2
Name: 5250, dtype: int64

===== PERFORMANCE SCORE =====
Mean Squared Error (MSE): 1.4511111111111115
Mean Absolute Error (MAE): 1.1666666666666667


Evaluation: K-Fold Cross Validation (AVERGAGE)

In [41]:
import random
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 전체 사용자를 포함한 인덱스 리스트
all_users = list(pivot_table1.index)

# k-fold 교차 검증을 위해 데이터를 섞음
random.shuffle(all_users)

# k-fold 교차 검증 설정 (여기서는 k=5로 설정)
kfold = KFold(n_splits=5)

total_mse = 0
total_mae = 0

for train_indices, test_indices in kfold.split(all_users):
    train_users = [all_users[i] for i in train_indices]
    test_users = [all_users[i] for i in test_indices]

    for target_user_id in test_users:
        neighbors = top_30_neighbors.loc[target_user_id]
        neighbor_ratings = pivot_table1[pivot_table1.index.isin(neighbors.values)]
        neighbor_ratings_mean = neighbor_ratings.mean()

        predicted_ratings = neighbor_ratings_mean[pivot_table1.loc[target_user_id] != 0]
        actual_ratings = pivot_table1.loc[target_user_id][pivot_table1.loc[target_user_id] != 0]

        mse = mean_squared_error(actual_ratings, predicted_ratings)
        mae = mean_absolute_error(actual_ratings, predicted_ratings)

        total_mse += mse
        total_mae += mae

# 전체 교차 검증 세트에 대한 평균 MSE와 MAE 계산
average_mse = total_mse / len(all_users)
average_mae = total_mae / len(all_users)

print("===== AVERAGE PERFORMANCE SCORE =====")
print(f"Average Mean Squared Error (MSE): {average_mse}")
print(f"Average Mean Absolute Error (MAE): {average_mae}")


===== AVERAGE PERFORMANCE SCORE =====
Average Mean Squared Error (MSE): 1.9470796130025285
Average Mean Absolute Error (MAE): 0.7225092184038385
