In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Load the pre-processed song dataset from a local CSV export.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# only runs where the file exists at exactly this location.
a_url = 'C:/Users/User/Downloads/processed_data.csv'
one_m_data = pd.read_csv(a_url)

# Keep a float-typed copy of the `mode` column under the name `audio_mode`.
one_m_data['audio_mode'] = one_m_data['mode'].astype(float)

# Balance the dataset: keep every song at or above the 75th percentile of
# popularity and pair it with an equally sized random sample of the rest.
random_seed = 42
threshold = one_m_data['popularity'].quantile(0.75)

# Songs whose popularity is at or above the threshold (the top ~25%).
top_25p_songs = one_m_data[one_m_data['popularity'] >= threshold]

# Songs whose popularity is strictly below the threshold (the bottom ~75%).
bot_75p_songs = one_m_data[one_m_data['popularity'] < threshold]

# Randomly pick as many below-threshold songs as there are top songs.
# The seed is passed explicitly: previously `random_seed` was assigned but never
# used, so every run drew a different sample and produced different metrics.
# NOTE(review): sampling without replacement raises ValueError if fewer songs
# fall below the threshold than at/above it (possible with many tied values).
sam_75p_songs = bot_75p_songs.sample(n=len(top_25p_songs), random_state=random_seed)

# Stack both subsets into the new working DataFrame with a fresh 0..n-1 index.
one_m_data = pd.concat([top_25p_songs, sam_75p_songs], axis=0).reset_index(drop=True)

# one_m_data.shape

# Features that are log-transformed, and the small constant added before the log.
features_to_log = ['loudness', 'speechiness', 'liveness']
epsilon = 1e-10

for feature in features_to_log:
    # Shift the column so its minimum becomes 1 (loudness in particular may
    # hold negative values), which keeps the logarithm's argument positive.
    shifted = one_m_data[feature] - one_m_data[feature].min() + 1
    # Natural log of the shifted values plus the small positive constant.
    one_m_data[feature] = np.log(shifted + epsilon)

# Show the first few rows of the transformed features.
print(one_m_data[features_to_log].head())

# Columns rescaled to zero mean and unit variance.
features_to_standardize = ['loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo']

# Fit the scaler on these columns of `one_m_data` and overwrite them with the
# standardized values.
# NOTE(review): the scaler is fit on the full frame, i.e. before the
# train/test split performed later in this notebook, so test rows contribute
# to the fitted mean and variance.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(one_m_data[features_to_standardize])
one_m_data[features_to_standardize] = standardized_values

# Show the first few rows of the standardized features.
print(one_m_data[features_to_standardize].head())

# Turn popularity into a binary label: 1 when the score is at least 40, else 0.
# NOTE(review): the cut-off 40 is hard-coded and independent of the quantile
# `threshold` used for sampling earlier in the notebook; confirm this is intended.
one_m_data["popularity"] = (one_m_data["popularity"] >= 40).astype("int64")
one_m_data["popularity"].value_counts()

# One-hot encode the musical key into integer indicator columns (`key_<value>`).
# Asking get_dummies for integer dummies directly covers whichever key values
# are actually present; the previous twelve hard-coded `key_0` .. `key_11`
# casts raised KeyError as soon as one expected column name was missing.
one_m_data["key"] = one_m_data["key"].astype("category")
one_m_data = pd.get_dummies(one_m_data, columns=["key"], dtype=int)
one_m_data.head()

# One-hot encode `audio_mode` into integer indicator columns. Requesting
# integer dummies from get_dummies avoids casting hard-coded column names
# ("audio_mode_0.0" / "audio_mode_1.0"), which raised KeyError whenever only
# one mode value was present or the labels were rendered differently.
one_m_data["audio_mode"] = one_m_data["audio_mode"].astype("category")
one_m_data = pd.get_dummies(one_m_data, columns=["audio_mode"], dtype=int)
one_m_data.head()

# The original `mode` column is redundant once `audio_mode` is encoded.
# Reassigning instead of using inplace=True keeps the step chainable.
one_m_data = one_m_data.drop(columns=['mode'])

# Names of the columns that still contain missing values (the result is only
# displayed when this expression is the last one in its cell).
one_m_data.columns[one_m_data.isnull().any()]

# Separate the `popularity` target from the predictors, then hold out 20% of
# the rows for testing with a fixed seed.
X = one_m_data.drop(columns=["popularity"])
y = one_m_data["popularity"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter grid explored exhaustively: 5 x 5 x 5 = 125 combinations,
# each evaluated with 5-fold cross-validation.
param_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
    'n_estimators': [50, 75, 100, 150, 200]
}

# Squared-error XGBoost regressor used as the base estimator.
# NOTE(review): the target was binarized to 0/1 earlier in this notebook, yet a
# regressor with MSE / R2 is used here; confirm a classifier is not intended.
xgbr = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Candidates are ranked by negative MSE; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=xgbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best model on the held-out test set.
best_xgbr = grid_search.best_estimator_
y_pred = best_xgbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Results before the dataset was split
# param_grid = {
#     'max_depth': [3, 5, 7, 9, 11],
#     'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
#     'n_estimators': [50, 75, 100, 150, 200]
# }
# Best Parameters: {'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 200}
# Mean Squared Error (MSE): 164.85
# R-squared (R2): 0.29

# Results after the dataset was split
# param_grid = {
#     'max_depth': [3, 5, 7, 9, 11],
#     'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
#     'n_estimators': [50, 75, 100, 150, 200]
# }
# Best Parameters: {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 200}
# Mean Squared Error (MSE): 195.48
# R-squared (R2): 0.31

   loudness  speechiness  liveness
0  3.765771     0.042005  0.108854
1  3.760479     0.025473  0.092944
2  3.677414     0.031789  0.085719
3  3.770690     0.035657  0.076683
4  3.867778     0.029753  0.104360
   loudness  speechiness  acousticness  instrumentalness  liveness     tempo
0 -0.154865    -0.493486      1.146309         -0.627155 -0.533055  0.390042
1 -0.189139    -0.731439      0.507700         -0.627116 -0.649024  0.617059
2 -0.727106    -0.640525      0.098637         -0.627012 -0.701684  0.605333
3 -0.123009    -0.584862      1.478856         -0.627155 -0.767542  2.787356
4  0.505782    -0.669835     -0.682408         -0.571985 -0.565814  1.678504
Best Parameters: {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 200}
Mean Squared Error (MSE): 0.14
R-squared (R2): 0.26
