In [65]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime

class CategoriesGenresTagsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        
    def fit(self, X, y=None):
        combined_text = X['categories'] + " " + X['genres'] + " " + X['steamspy_tags']
        self.vectorizer.fit(combined_text)
        return self
    
    def transform(self, X):
        combined_text = X['categories'] + " " + X['genres'] + " " + X['steamspy_tags']
        return self.vectorizer.transform(combined_text).toarray()

class RatingsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        total_ratings = X['positive_ratings'] + X['negative_ratings']
        rating_ratio = X['positive_ratings'] / total_ratings
        return rating_ratio.values.reshape(-1, 1)

class PlaytimeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.average_playtime_max = X['average_playtime'].max()
        self.median_playtime_max = X['median_playtime'].max()
        return self
    
    def transform(self, X):
        normalized_average_playtime = X['average_playtime'] / self.average_playtime_max
        normalized_median_playtime = X['median_playtime'] / self.median_playtime_max
        return pd.concat([normalized_average_playtime, normalized_median_playtime], axis=1).values

class OwnersTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        owners_processed = X['owners'].str.split('-').apply(lambda x: (int(x[0]) + int(x[1])) / 2)
        return owners_processed.values.reshape(-1, 1)

class DaysSinceReleaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        now = datetime.now()
        days_since_release = X['release_date'].apply(lambda x: (now - datetime.strptime(x, '%Y-%m-%d')).days)
        return days_since_release.values.reshape(-1, 1)

# Combined Feature Transformer
feature_transformer = FeatureUnion([
    ('categories_genres_tags', CategoriesGenresTagsTransformer()),
    ('ratings', RatingsTransformer()),
    ('playtime', PlaytimeTransformer()),
    ('owners', OwnersTransformer()),
    ('days_since_release', DaysSinceReleaseTransformer())
])


In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import NearestNeighbors

# Load Data
games_filtered_df = pd.read_csv("../data/games_filtered.csv")
users_df = pd.read_csv("../data/users_filtered.csv")

# featureTransformer with pca
gameFeatureTransformer_pca = Pipeline([
    ('features', feature_transformer),
    ('pca', PCA(n_components=100))  # Reducing dimensions to 100
])

# Fit and transform game data
gameFeatureTransformer_pca.fit(games_filtered_df)
transformed_games_pca = gameFeatureTransformer_pca.transform(games_filtered_df)

# Train NearestNeighbors model
K = 10
game_model_pca = NearestNeighbors(algorithm="brute", n_neighbors=K+1)
game_model_pca.fit(transformed_games_pca)

# Recommendation function
def recommend_games_pca(model, gamesTaken, k=5):
    aggregate_vector = np.zeros((1, 100))
    for game in gamesTaken:
        game_data = games_filtered_df[games_filtered_df['cleaned_title'] == game]
        if not game_data.empty:
            game_vector = gameFeatureTransformer_pca.transform(game_data)
            aggregate_vector += game_vector

    distances, indices = model.kneighbors(aggregate_vector, n_neighbors=k+1)
    recommendations = [games_filtered_df.iloc[index]['cleaned_title'] for index in indices[0]][1:]
    return recommendations

# Example usage:
# sample_games = ["Counter-Strike", "Team Fortress 2"]
sample_games = ["call of duty"]
recommended_games = recommend_games_pca(game_model_pca, sample_games, 5)
print(recommended_games)


['call of duty 2', 'call of duty united offensive', 'x2 the threat', 'x3 reunion', 'flatout']


In [66]:
from sklearn.metrics.pairwise import cosine_similarity

# 加载数据
games_df = pd.read_csv("../data/games_filtered.csv")
users_df = pd.read_csv("../data/users_filtered.csv")

# 处理数据
users_df['Hours'] = np.where(users_df['Action'] == 'purchase', 0, users_df['Hours'])

# 使用feature_transformer计算每款游戏的特征向量

feature_transformer.fit(games_df)
game_features = feature_transformer.transform(games_df)

# 将游戏标题与其对应的特征向量一一对应起来，形成一个字典
game_features_dict = {game: features for game, features in zip(games_df['cleaned_title'], game_features)}

# 基于用户玩的游戏和游玩时长计算用户表示
def compute_user_representation(user_id, users_df, game_features_dict):
    user_games = users_df[users_df['User_ID'] == user_id]
    representation = np.zeros(next(iter(game_features_dict.values())).shape)
    for _, row in user_games.iterrows():
        game_title = row['cleaned_game_title']
        hours = row['Hours']
        representation += game_features_dict[game_title] * hours
    return representation / len(user_games)

# 找到与目标用户最接近的游戏
def recommend_games_based_on_user_representation(user_id, game_features, top_n=5):
    user_representation = compute_user_representation(user_id, users_df, game_features_dict)
    similarities = cosine_similarity([user_representation], game_features)
    recommended_indices = similarities[0].argsort()[-top_n:][::-1]
    recommended_games = games_df.iloc[recommended_indices]['cleaned_title'].tolist()
    return recommended_games

# 示例
user_id_example = users_df['User_ID'].iloc[0]
print(recommend_games_based_on_user_representation(user_id_example, game_features))

['left 4 dead 2', 'portal 2', 'ark survival evolved', 'the elder scrolls v skyrim', 'tom clancys rainbow six siege']


In [50]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# 创建用户-游戏交互矩阵
interaction_matrix = users_df.pivot_table(index='User_ID', columns='cleaned_game_title', values='Hours', aggfunc='sum').fillna(0)

# 标准化交互矩阵
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(interaction_matrix)

# 使用KMeans进行聚类
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_matrix)

# 将聚类标签添加到用户数据中
interaction_matrix['cluster_label'] = cluster_labels

# 修改推荐函数来考虑聚类标签
def recommend_games_based_on_user_representation(user_id, game_features, top_n=5):
    user_representation = compute_user_representation(user_id, users_df, game_features_dict)
    
    # 获取用户的聚类标签
    user_cluster = interaction_matrix.loc[user_id, 'cluster_label']
    
    # 在此处考虑聚类标签来提高推荐的精确性...
    # 例如，您可以优先推荐与用户同一聚类中的其他用户喜欢的游戏
    
    similarities = cosine_similarity([user_representation], game_features)
    recommended_indices = similarities[0].argsort()[-top_n:][::-1]
    recommended_games = games_df.iloc[recommended_indices]['cleaned_title'].tolist()
    return recommended_games


  super()._check_params_vs_input(X, default_n_init=10)


In [67]:
user_id_example = users_df['User_ID'].iloc[0]
print(recommend_games_based_on_user_representation(user_id_example, game_features))

['left 4 dead 2', 'portal 2', 'ark survival evolved', 'the elder scrolls v skyrim', 'tom clancys rainbow six siege']


In [78]:
def hit_rate_recommendation(user_id, game_features, users_df, threshold_hours=5, top_n=5):

    recommended_games = recommend_games_based_on_user_representation(user_id, game_features, top_n=top_n)
    
    # Get games that user truly likes based on threshold_hours
    liked_games = users_df[(users_df['User_ID'] == user_id) & (users_df['Hours'] > threshold_hours)]['cleaned_game_title'].tolist()
    
    # Check for hits
    hits = set(recommended_games).intersection(set(liked_games))
    
    return len(hits) > 0

# Calculate hit rate for all users
total_users = users_df['User_ID'].nunique()
hits = sum(hit_rate_recommendation(user, game_features, users_df) for user in users_df['User_ID'].unique())

hit_rate = hits / total_users
hit_rate


0.45

In [90]:
def combined_recommendation(user_id, game_features, top_n=5):
    # 1. 使用游戏的特征向量计算与目标用户表示的余弦相似性
    user_representation = compute_user_representation(user_id, users_df, game_features_dict)
    similarities = cosine_similarity([user_representation], game_features)
    
    # 2. 基于与目标用户在同一聚类中的其他用户喜欢的游戏进行推荐
    user_cluster = interaction_matrix.loc[user_id, 'cluster_label']
    users_in_same_cluster = interaction_matrix[interaction_matrix['cluster_label'] == user_cluster].index
#     games_liked_by_cluster = (
#         users_df[users_df['User_ID'].isin(users_in_same_cluster)]
#         .groupby('cleaned_game_title')['Hours']
#         .sum()
#         .sort_values(ascending=False)
#     )
    games_liked_by_cluster = games_liked_by_cluster_function(users_in_same_cluster)
    
    # 综合这两种策略的结果
    recommended_indices_similarity = similarities[0].argsort()[-top_n:][::-1]
    recommended_games_similarity = games_df.iloc[recommended_indices_similarity]['cleaned_title'].tolist()
    
    recommended_games_cluster = games_liked_by_cluster.head(top_n).index.tolist()
    
    # 将两个推荐列表结合起来，并删除重复的游戏
    combined_recommendations = list(dict.fromkeys(recommended_games_similarity + recommended_games_cluster))
    
    return combined_recommendations[:top_n]

# 使用示例进行测试
user_id_example = users_df['User_ID'].iloc[0]
combined_recommendation(user_id_example, game_features)


['left 4 dead 2',
 'portal 2',
 'ark survival evolved',
 'the elder scrolls v skyrim',
 'tom clancys rainbow six siege']

In [93]:
def recommendation(user_id, game_features, users_df, threshold_hours=5, top_n=5):

    recommended_games = combined_recommendation(user_id, game_features, top_n=top_n)
    
    # Get games that user truly likes based on threshold_hours
    liked_games = users_df[(users_df['User_ID'] == user_id) & (users_df['Hours'] > threshold_hours)]['cleaned_game_title'].tolist()
    
    # Check for hits
    hits = set(recommended_games).intersection(set(liked_games))
    
    return len(hits) > 0

num = 100
# Calculate hit rate for 100 users
total_users = users_df['User_ID'].nunique()
hits = sum(recommendation(user, game_features, users_df) for user in users_df['User_ID'].unique()[:num])



hit_rate = hits / num
hit_rate

0.5

In [89]:
def games_liked_by_cluster_function(cluster_users):
    # Filter out the user_game_hours dataframe for users in the given cluster
    cluster_game_hours = user_game_hours.loc[cluster_users]
    
    # Sum the hours for each game across all users in the cluster
    cluster_game_sum = cluster_game_hours.sum().sort_values(ascending=False)
    
    return cluster_game_sum

In [85]:
user_game_hours = users_df.groupby(['User_ID', 'cleaned_game_title'])['Hours'].sum().unstack(fill_value=0)