依赖部分

In [None]:

# Baseline implementation for Last.fm dataset using XGBoost

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

数据读取

In [None]:
# Load the dataset
# Replace these paths with actual dataset paths
user_artists_path = "user_artists.dat"
tags_path = "tags.dat"


def _read_tsv(path, columns):
    """Read a tab-separated file with a header row and rename its columns.

    Decoding is attempted with pandas' default (UTF-8) first; if the file
    contains bytes that are not valid UTF-8, it is re-read as Latin-1 instead
    of aborting the notebook with a UnicodeDecodeError.

    Parameters:
        path: location of the tab-separated file.
        columns: column names to assign, in file order.

    Returns:
        A DataFrame whose columns are named as given in `columns`.

    Raises:
        ValueError: if the file does not contain exactly len(columns) columns.
    """
    try:
        frame = pd.read_csv(path, sep='\t')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this read cannot fail on decoding.
        frame = pd.read_csv(path, sep='\t', encoding='latin-1')
    frame.columns = columns
    return frame


# Load user-artists data
user_artists = _read_tsv(user_artists_path, ['userID', 'artistID', 'weight'])

# Load tags data
# NOTE(review): tags.dat in the HetRec 2011 Last.fm release is reportedly not
# valid UTF-8, which is what the Latin-1 fallback is for — confirm for your copy.
tags = _read_tsv(tags_path, ['tagID', 'tagValue'])

数据预处理

In [None]:
# Build per-artist tag features with TF-IDF.
#
# The TF-IDF rows have to be keyed by artistID before they can be joined onto
# user_artists. Fitting the vectorizer on tags['tagValue'] alone yields one row
# per *tag* (indexed 0..n_tags-1); joining that frame on artistID would pair
# each artist with an unrelated tag row. Instead, every artist gets one
# "document" made of all tag strings that were assigned to it.
#
# NOTE(review): assumes the dataset ships an artist-tag assignment file with
# 'artistID' and 'tagID' columns (user_taggedartists.dat in the HetRec 2011
# Last.fm release) — confirm the path and schema for your copy.
user_taggedartists_path = "user_taggedartists.dat"
artist_tag_links = pd.read_csv(
    user_taggedartists_path, sep='\t', usecols=['artistID', 'tagID']
)

# One space-joined string of tag values per artist.
artist_tag_docs = (
    artist_tag_links
    .merge(tags, on='tagID', how='inner')
    .dropna(subset=['tagValue'])  # str.join below cannot handle missing values
    .groupby('artistID')['tagValue']
    .agg(' '.join)
)

vectorizer = TfidfVectorizer(max_features=50)  # Limiting features for simplicity
tag_features = vectorizer.fit_transform(artist_tag_docs).toarray()
tag_features_df = pd.DataFrame(
    tag_features,
    index=artist_tag_docs.index,  # artistID, so the merge below lines up by artist
    columns=[f"tag_{i}" for i in range(tag_features.shape[1])],
)

# Attach each artist's tag features to every (user, artist, weight) row.
# The inner join drops rows whose artist has no tag document.
data = user_artists.merge(tag_features_df, left_on='artistID', right_index=True, how='inner')

模型设计

In [None]:
# Train an XGBoost model
model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=100, 
    learning_rate=0.1, 
    max_depth=6, 
    random_state=42
)
model.fit(X_train, y_train)

训练

In [None]:
# Prepare feature matrix (X) and target vector (y)
X = data.drop(['userID', 'artistID', 'weight'], axis=1).values
y = data['weight'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


测试与输出

In [None]:
# Predict on the held-out split and score it with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.4f}")

# Report the fitted model's feature importance scores
feature_importances = model.feature_importances_
print("Feature Importances:", feature_importances)