111

In [6]:
import pandas as pd

# Load the listening history: one (user, song, play count) triplet per row.
# The file is tab-separated and read with header=None, so column names are supplied here.
triplets = pd.read_csv('train_triplets.txt', sep='\t', header=None, names=['user_id', 'song_id', 'plays'])
print(triplets.head())

                                    user_id             song_id  plays
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995      1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9      1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B      2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22      1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494      1


In [7]:
import pandas as pd
import h5py

def load_metadata(filename):
    """Read song ids and titles from the ``metadata`` / ``songs`` table of an HDF5 file.

    Args:
        filename: Path of the HDF5 file to open read-only.

    Returns:
        pandas.DataFrame with string columns ``song_id`` and ``title``. Rows whose
        decoded ``song_id`` is empty are dropped; the original row index is kept.
    """
    def _to_text(raw):
        # Undecodable bytes are dropped rather than raising; surrounding whitespace is trimmed.
        return raw.decode('utf-8', errors='ignore').strip()

    with h5py.File(filename, "r") as h5_file:
        songs_table = h5_file['metadata']['songs']

        # Pull both columns fully into memory while the file is still open.
        raw_ids = songs_table['song_id'][()]
        raw_titles = songs_table['title'][()]

        frame = pd.DataFrame({
            'song_id': list(map(_to_text, raw_ids)),
            'title': list(map(_to_text, raw_titles)),
        })

    # Keep only rows that carry a non-empty song id.
    has_song_id = frame['song_id'].str.len() > 0
    return frame[has_song_id]

metadata = load_metadata('msd_summary_file.h5')

# Attach song titles to the play records.
# A left join emits one output row per matching right-hand row, so a song_id that
# occurs more than once in the metadata would silently duplicate play records.
# NOTE(review): the summary table may list a song_id several times — TODO confirm;
# the de-duplication below is a no-op if it does not.
song_titles = metadata.drop_duplicates(subset='song_id', keep='first')

merged_data = pd.merge(
    triplets,
    song_titles,
    on='song_id',
    how='left',
    validate='many_to_one'  # raise instead of multiplying rows if the lookup is not unique
)

# With how='left' and a unique right-hand key every triplet appears exactly once.
assert len(merged_data) == len(triplets), "merge changed the number of play records"

# Sanity check: share of play records without a title, plus a preview
print("标题缺失比例:", merged_data['title'].isnull().mean())
print("示例数据:")
print(merged_data.head())

标题缺失比例: 0.0
示例数据:
                                    user_id             song_id  plays  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995      1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9      1   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B      2   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22      1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494      1   

                             title  
0                         The Cove  
1             Nothing from Nothing  
2                  Entre Dos Aguas  
3            Under Cold Blue Stars  
4  Riot Radio (Soundtrack Version)  


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Multiply, Dense
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
import joblib
from pathlib import Path

# ----------------------
# Data preprocessing
# ----------------------

# 1. Encode user and song IDs as consecutive integers (used as embedding row indices)
user_encoder = LabelEncoder()
song_encoder = LabelEncoder()

# Fit each encoder on the full column and store the integer codes next to the raw IDs.
# Note: this adds columns to merged_data in place.
merged_data['user_id_encoded'] = user_encoder.fit_transform(merged_data['user_id'])
merged_data['song_id_encoded'] = song_encoder.fit_transform(merged_data['song_id'])

# 2. Scale play counts by the largest observed count, so the maximum becomes 1
max_play = merged_data['plays'].max()
merged_data['plays_normalized'] = merged_data['plays'] / max_play

# 3. Extract the training arrays
user_ids = merged_data['user_id_encoded'].values
item_ids = merged_data['song_id_encoded'].values
labels = merged_data['plays_normalized'].values  # normalized play counts (regression target)

# 4. Random 80/20 split into training and test sets (fixed seed for reproducibility)
train_user, test_user, train_item, test_item, train_label, test_label = train_test_split(
    user_ids, item_ids, labels, test_size=0.2, random_state=42
)

# ----------------------
# Model construction
# ----------------------

# Number of distinct users and items (= number of embedding rows)
# NOTE(review): the model summary recorded in this notebook shows user_embed with
# 16,309,088 parameters at width 16, i.e. 1,019,318 users; the 105,283 figure that
# used to be noted here looks stale — verify.
num_users = len(user_encoder.classes_)  # presumably 1,019,318 — see note above
num_items = len(song_encoder.classes_)  # 384,546 (matches item_embed: 6,152,736 params / 16)
embedding_size = 32  # embedding width; NOTE(review): build_gmf_model does not read this value

# Make mixed_float16 the global Keras dtype policy; this must run before any layer is created.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)




# Build the GMF model (regression on normalized play counts)
def build_gmf_model(pretrained_path="best_gmf_model.keras", embedding_dim=16,
                    learning_rate=0.001, resume_learning_rate=0.0001):
    """Load the saved GMF model if one exists, otherwise build a new one.

    Args:
        pretrained_path: File checked for a previously saved model.
        embedding_dim: Width of the user and item embeddings of a newly built model
            (ignored when a saved model is loaded).
        learning_rate: Adam learning rate for a newly built model.
        resume_learning_rate: Adam learning rate used when a saved model is loaded
            for continued training.

    Returns:
        A compiled Keras model with inputs ``user_input`` / ``item_input``, MSE loss
        and an MAE metric. Reads the module-level ``num_users`` and ``num_items``
        when it has to build a new model.
    """
    if Path(pretrained_path).exists():
        print("⏳ 检测到预训练模型，加载中...")
        model = tf.keras.models.load_model(pretrained_path)
        print("✅ 成功加载预训练模型")

        # Continue training with a lower learning rate. Compiling with a new optimizer
        # instance replaces whatever optimizer (and optimizer state) came with the file.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=resume_learning_rate),
            loss='mse',
            metrics=['mae']
        )
        print(f"🔄 已调整学习率至 {resume_learning_rate}")
        return model

    print("🆕 未找到预训练模型，创建新模型")
    user_input = Input(shape=(1,), dtype=tf.int32, name='user_input')
    item_input = Input(shape=(1,), dtype=tf.int32, name='item_input')

    # `input_length` is deprecated on Embedding; the sequence length already follows
    # from the (1,) input shape.
    user_embed = Embedding(num_users, embedding_dim, name='user_embed')(user_input)
    item_embed = Embedding(num_items, embedding_dim, name='item_embed')(item_input)

    # Drop the length-1 sequence axis: (None, 1, dim) -> (None, dim)
    user_flatten = tf.keras.layers.Reshape((embedding_dim,))(user_embed)
    item_flatten = tf.keras.layers.Reshape((embedding_dim,))(item_embed)

    # Plain dot product instead of element-wise multiply + Dense.
    # The output layer is pinned to float32 so that, under a mixed_float16 policy,
    # predictions and the loss are not computed in half precision.
    dot_product = tf.keras.layers.Dot(axes=1, dtype='float32')([user_flatten, item_flatten])

    model = Model(inputs=[user_input, item_input], outputs=dot_product)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return model

model = build_gmf_model()
model.summary()

# ----------------------
# Model training
# ----------------------

# Wrap the arrays in tf.data pipelines.
# The training pipeline deliberately has no .cache(): a cache placed after
# shuffle/batch replays the first epoch's batches verbatim, which would cancel
# reshuffle_each_iteration. The source arrays are already in memory.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        ({"user_input": train_user, "item_input": train_item}, train_label)
    )
    .shuffle(100000, reshuffle_each_iteration=True)
    .batch(16384)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training
)

# The evaluation pipeline is deterministic, so caching the batches is safe.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        ({"user_input": test_user, "item_input": test_item}, test_label)
    )
    .batch(16384)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

# Stop after 3 epochs without val_loss improvement and roll the in-memory model back
# to its best epoch, so the evaluation after fit() describes the same weights that
# the best-model checkpoint holds.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

# Checkpoint callback: saves the best model and, when training ends, the optimizer state
class CustomCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that additionally dumps the optimizer state when training ends.

    Args:
        *args, **kwargs: Forwarded unchanged to ``ModelCheckpoint``.
        optimizer_path: Keyword-only; file the optimizer state is written to with joblib.
    """

    def __init__(self, *args, optimizer_path="optimizer_state_gmf.pkl", **kwargs):
        super().__init__(*args, **kwargs)
        # Destination of the optimizer state dump
        self.optimizer_path = optimizer_path

    def on_train_end(self, logs=None):
        """Run the parent hook, then write the optimizer variables to ``optimizer_path``."""
        # Keep whatever end-of-training handling the parent class implements.
        super().on_train_end(logs)

        optimizer = self.model.optimizer
        if hasattr(optimizer, "get_weights"):
            state = optimizer.get_weights()
        else:
            # Some Keras versions do not offer get_weights() on optimizers; read the
            # variables directly instead of failing at the very end of training.
            variables = optimizer.variables
            if callable(variables):
                variables = variables()
            state = [variable.numpy() for variable in variables]

        joblib.dump(state, self.optimizer_path)
        print(f"💾 已保存优化器状态至 {self.optimizer_path}")

# Checkpoint configuration: keep only the model with the lowest val_loss
checkpoint = CustomCheckpoint(
    "best_gmf_model.keras",
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)


# Restore a previously saved optimizer state, if there is one.
# This is best-effort: the optimizer attached to `model` may not have created its
# variables yet, or may not support set_weights() at all, and either case raises.
# Training then simply continues with a fresh optimizer state instead of aborting.
if Path("optimizer_state_gmf.pkl").exists():
    print("⏳ 加载优化器状态...")
    optimizer_weights = joblib.load("optimizer_state_gmf.pkl")
    try:
        model.optimizer.set_weights(optimizer_weights)
    except (AttributeError, ValueError) as err:
        print(f"⚠️ 无法恢复优化器状态，将使用新的优化器状态继续训练: {err}")
    else:
        print("✅ 优化器状态已恢复")

# Train the model (the epoch count is an upper bound; early stopping ends the run)
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=100,  # deliberately large, early stopping decides when to stop
    callbacks=[early_stop, checkpoint]
)

# ----------------------
# Evaluation and persistence
# ----------------------

# Evaluate on the held-out split
test_loss, test_mae = model.evaluate(test_dataset)
print(f"testsets MSE: {test_loss:.4f}, MAE: {test_mae:.4f}")

# Persist the scaling constant and both encoders; they are needed to map raw IDs to
# embedding rows and to turn predictions back into play counts at inference time.
joblib.dump(max_play, 'max_play_gmf.pkl')
joblib.dump(user_encoder, 'user_encoder_gmf.pkl')
joblib.dump(song_encoder, 'song_encoder_gmf.pkl')




⏳ 检测到预训练模型，加载中...
✅ 成功加载预训练模型
🔄 已调整学习率至 0.0001
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embed (Embedding)          (None, 1, 16)        16309088    user_input[0][0]                 
__________________________________________________________________________________________________
item_embed (Embedding)          (None, 1, 16)        6152736     item_input[0][0]                 
_____________________________________________

['song_encoder.pkl']

In [None]:


import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from sklearn.preprocessing import LabelEncoder
import h5py
# ----------------------
# Load metadata 
# ----------------------
def load_metadata(filename):
    """Read song ids, titles and artist names from the ``metadata/songs`` table.

    Args:
        filename: Path of the HDF5 file to open read-only.

    Returns:
        pandas.DataFrame with string columns ``song_id``, ``title`` and
        ``artist_name``. Rows whose decoded ``song_id`` is empty are dropped;
        the original row index is kept.
    """
    def _decode_column(raw_values):
        # Undecodable bytes are dropped rather than raising; whitespace is trimmed.
        return [value.decode('utf-8', errors='ignore').strip() for value in raw_values]

    with h5py.File(filename, "r") as h5_file:
        songs_table = h5_file['metadata/songs']  # '/'-separated path into the file hierarchy

        # Read every required column fully into memory before decoding.
        raw_ids = songs_table['song_id'][()]
        raw_titles = songs_table['title'][()]
        raw_artists = songs_table['artist_name'][()]

        frame = pd.DataFrame({
            'song_id': _decode_column(raw_ids),
            'title': _decode_column(raw_titles),
            'artist_name': _decode_column(raw_artists),
        })

        # Keep only rows that carry a non-empty song id.
        has_song_id = frame['song_id'].str.len() > 0
        return frame[has_song_id]

# ----------------------
# Enhanced Recommendation System Class
# ----------------------
class AdvancedMusicRecommender:
    """Interactive recommender built on the item embeddings of the trained GMF model.

    A "virtual user" is formed by averaging the embeddings of the songs the user
    picks; every song is then scored by the dot product of its embedding with that
    vector, and the highest-scoring songs are returned together with their metadata.
    """

    def __init__(self):
        """Load metadata, the trained model, the song encoder and the play-count scale."""
        # Song metadata used for searching and for labelling results
        self.metadata = load_metadata('msd_summary_file.h5')

        # Trained model and the encoder that maps song ids to embedding rows
        self.model = tf.keras.models.load_model('best_gmf_model.keras')
        self.song_encoder = joblib.load('song_encoder_gmf.pkl')

        # Scale factor that turns model scores back into play counts
        try:
            self.max_play = joblib.load('max_play_gmf.pkl')
        except FileNotFoundError:
            print("Warning: Estimating max_play using metadata")
            # Falls back to 1 (scores reported unscaled) when the metadata has no 'plays' column.
            self.max_play = self.metadata['plays'].max() if 'plays' in self.metadata else 1

        # song id -> row index of the embedding matrix
        self.song_id_to_idx = {
            song_id: idx
            for idx, song_id in enumerate(self.song_encoder.classes_)
        }

        # Weight matrix of the 'item_embed' layer, one row per encoded song
        self.song_embeddings = self.model.get_layer('item_embed').get_weights()[0]

    def search_songs(self, query, top_k=5):
        """Return up to ``top_k`` songs whose title or artist contains ``query``.

        The match is a case-insensitive literal substring test: ``query`` is user
        input, so characters such as ``(`` or ``+`` are not interpreted as a regular
        expression.

        Returns:
            DataFrame with columns ``song_id``, ``title`` and ``artist_name``.
        """
        title_hit = self.metadata['title'].str.contains(
            query, case=False, regex=False, na=False
        )
        artist_hit = self.metadata['artist_name'].str.contains(
            query, case=False, regex=False, na=False
        )
        matches = self.metadata[title_hit | artist_hit]
        return matches.head(top_k)[['song_id', 'title', 'artist_name']]

    def create_virtual_user(self, song_ids):
        """Average the embeddings of the given songs into one user vector.

        Song ids unknown to the encoder are ignored.

        Raises:
            ValueError: If none of ``song_ids`` is known to the encoder.
        """
        valid_ids = [song_id for song_id in song_ids if song_id in self.song_id_to_idx]

        if not valid_ids:
            raise ValueError("No valid song IDs found")

        indices = [self.song_id_to_idx[song_id] for song_id in valid_ids]
        return np.mean(self.song_embeddings[indices], axis=0)

    def _select_songs_interactively(self, matched_songs):
        """Let the user pick rows of ``matched_songs`` by number.

        Returns:
            List of selected song ids; all ids when the user just presses enter;
            ``None`` when the user enters 0 to search again.
        """
        print("\n🔍 Found matching songs:")
        print("0. Search again (unsatisfied with results)")
        for idx, (_, row) in enumerate(matched_songs.iterrows(), 1):
            print(f"{idx}. {row['title']} - {row['artist_name']}")

        while True:
            try:
                selected = input("Enter song numbers (space-separated, 0 to re-search, enter for all): ").strip()
                if not selected:
                    return matched_songs['song_id'].tolist()

                if '0' in selected.split():
                    return None

                indices = list(map(int, selected.split()))
                valid_indices = [i for i in indices if 1 <= i <= len(matched_songs)]

                if not valid_indices:
                    print("⚠️ Invalid input, please try again")
                    continue

                return matched_songs.iloc[[i-1 for i in valid_indices]]['song_id'].tolist()

            except ValueError:
                # int() failed on a non-numeric token
                print("⚠️ Please enter valid numbers")

    def _format_grouped_results(self, grouped_results):
        """Render ``(query, results)`` pairs as text with ``group.item`` numbering.

        Every query consumes a group number, including queries without results.
        """
        formatted = []
        for group_idx, (query, results) in enumerate(grouped_results, 1):
            if not results.empty:
                formatted.append(f"\n🔍 Results for: '{query}'")
                for item_idx, (_, row) in enumerate(results.iterrows(), 1):
                    formatted.append(f"{group_idx}.{item_idx} {row['title']} - {row['artist_name']}")
            else:
                formatted.append(f"\n⚠️ No results found for: '{query}'")
        return "\n".join(formatted)

    def _parse_group_selection(self, selection, grouped_results):
        """Translate a selection such as ``'1.1 2.3'`` into song ids.

        Group numbers refer to positions in ``grouped_results`` — the same numbering
        ``_format_grouped_results`` prints, where queries without results still take
        up a number. Malformed or out-of-range tokens are skipped.

        Returns:
            List of song ids in the order they were selected (possibly empty).
        """
        selected_ids = []

        for part in selection.split():
            try:
                group_num, item_num = map(int, part.split('.'))
            except ValueError:
                # Not of the form "<int>.<int>"
                continue

            if not 1 <= group_num <= len(grouped_results):
                continue

            _, group_df = grouped_results[group_num - 1]
            if 1 <= item_num <= len(group_df):
                selected_ids.append(group_df.iloc[item_num - 1]['song_id'])

        return selected_ids

    def generate_recommendations(self, input_titles, top_n=10, verbose=True):
        """Search, let the user pick seed songs, and recommend similar songs.

        Args:
            input_titles: Iterable of search strings (song titles or artist names).
            top_n: Maximum number of songs to recommend.
            verbose: Print the search results and progress/error messages.

        Returns:
            ``None`` when the user enters ``0`` to search again. Otherwise a
            DataFrame of recommended songs (metadata columns plus
            ``predicted_plays``, sorted descending); the DataFrame is empty when no
            recommendation could be produced.
        """
        try:
            # Step 1: run every non-blank query separately
            grouped_results = []
            for query in input_titles:
                query = query.strip()
                if not query:
                    continue
                grouped_results.append((query, self.search_songs(query)))

            # Step 2: show the grouped results
            if verbose:
                print("\n" + "="*50)
                print(self._format_grouped_results(grouped_results))
                print("="*50 + "\n")

            # Step 3: interactive selection
            selected_ids = []
            while True:
                try:
                    selection = input(
                        "Enter selections (e.g. '1.1 2.3'), '0' to re-search, or enter to confirm: "
                    ).strip()

                    if selection == '0':
                        return None
                    if not selection:
                        break

                    selected_ids = self._parse_group_selection(selection, grouped_results)
                    if not selected_ids:
                        print("⚠️ No valid selections, try again")
                        continue
                    break

                except KeyboardInterrupt:
                    print("\n⏹ Selection canceled")
                    if input("Continue? (y/n): ").lower() == 'n':
                        return pd.DataFrame()

            # Step 4: build the virtual user from the selected songs
            try:
                if verbose:
                    print("\n⭐ Analyzing song features...")

                virtual_user = self.create_virtual_user(selected_ids)
            except ValueError as e:
                if verbose:
                    print(f"❌ Feature analysis failed: {str(e)}")
                return pd.DataFrame()
            except Exception as e:
                if verbose:
                    print(f"❌ Unexpected error: {str(e)}")
                return pd.DataFrame()

            # Step 5: score every song against the virtual user
            try:
                if verbose:
                    print("🔢 Calculating similarities...")

                scores = np.dot(self.song_embeddings, virtual_user)

                # Exclude the songs the user actually selected from the candidates
                input_indices = [
                    self.song_id_to_idx[sid]
                    for sid in selected_ids
                    if sid in self.song_id_to_idx
                ]
                scores[input_indices] = -np.inf
            except Exception as e:
                if verbose:
                    print(f"❌ Similarity calculation failed: {str(e)}")
                return pd.DataFrame()

            # Step 6: assemble the recommendation table
            try:
                top_indices = np.argsort(scores)[-top_n:][::-1]
                # Never hand back an excluded seed song, even if top_n exceeds the
                # number of remaining candidates.
                top_indices = top_indices[np.isfinite(scores[top_indices])]
                top_song_ids = self.song_encoder.inverse_transform(top_indices)

                # Key the scores by song id: the metadata rows come back in metadata
                # order (and a song id may occur more than once there), so a
                # positional assignment would attach scores to the wrong songs.
                predicted_plays = pd.Series(
                    np.clip(scores[top_indices] * self.max_play, 0, None),
                    index=top_song_ids
                )

                recommendations = (
                    self.metadata[self.metadata['song_id'].isin(top_song_ids)]
                    .drop_duplicates(subset='song_id')
                    .copy()
                )
                recommendations['predicted_plays'] = (
                    recommendations['song_id'].map(predicted_plays)
                )

                return recommendations.sort_values('predicted_plays', ascending=False)

            except Exception as e:
                if verbose:
                    print(f"❌ Recommendation generation failed: {str(e)}")
                return pd.DataFrame()

        except KeyboardInterrupt:
            print("\n⏹ Recommendation process interrupted")
            return pd.DataFrame()
        except Exception as e:
            if verbose:
                print(f"❗ Unhandled exception: {str(e)}")
            return pd.DataFrame()

# ----------------------
# Interactive Recommendation Flow
# ----------------------
def interactive_recommendation():
    """Prompt for songs/artists in a loop and print recommendations until 'exit'.

    Each line of input is split on commas into separate search queries. Blank
    queries are dropped, and a line without any query simply prompts again.
    """
    recommender = AdvancedMusicRecommender()

    while True:
        print("\n🎵 Please enter your favorite songs/artists (enter 'exit' to quit):")
        user_input = input("> ").strip()

        if user_input.lower() == 'exit':
            break

        queries = [q.strip() for q in user_input.split(',') if q.strip()]
        if not queries:
            # Nothing to search for; ask again instead of running an empty search.
            continue

        result = recommender.generate_recommendations(queries)

        if result is None:
            # The user asked to search again: go back to the prompt.
            continue

        if result.empty:
            print("Unable to generate recommendations, please try different input")
            continue

        print("\n🎧 Recommended songs for you:")
        print(result[['title', 'artist_name', 'predicted_plays']]
            .head(10).to_string(index=False))

# Entry point: start the interactive loop when this code runs as the main module.
if __name__ == "__main__":
    interactive_recommendation()


🎵 Please enter your favorite songs/artists (enter 'exit' to quit):


🔍 Results for: 'merry christmas mr'
1.1 Merry Christmas Mr. Lawrence - Richard Clayderman
1.2 Merry Christmas Mr Lawrence - Fact
1.3 Merry Christmas Mr Lawrence (Heart Of Asia) (DJ Quicksilver's Radio Edit) - Watergate
1.4 Merry Christmas Mr. Lawrence - L'Orchestra Cinematique

🔍 Results for: 'gymno'
2.1 N2 3 Gymnopedies 1888 - Ciccolini_ Aldo / Tacchino_ Gabriel
2.2 Gymnopedie #2 - Carl Doy
2.3 Gymnopédie No. 1 (Demo) - Gary Numan
2.4 Gymnopedie #3 - Carl Doy
2.5 Gymnopédies for Piano/Gymnopédie No. 3 - Alex de Grassi;Paul McCandless


⭐ Analyzing song features...
🔢 Calculating similarities...

🎧 Recommended songs for you:
                                               title                                 artist_name  predicted_plays
Lovin' You So (Dr. Dolittle Soundtrack) (LP Version)                                 Jody Watley        84.708435
                                               Alive                  