In [41]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import pickle



In [42]:
# 加载数据集
data = pd.read_csv('bookstoscrape.csv')

In [43]:
# 数据预处理
# 检查数据中是否存在重复项和空值
num_duplicates = data.duplicated().sum()
num_missing_values = data.isnull().sum().sum()



In [44]:
# 重新检查价格列的数据类型
price_data_type = data['Price'].dtype

In [45]:
# 如果价格不是数值类型，则进行转换
if not pd.api.types.is_numeric_dtype(price_data_type):
    data['Price'] = data['Price'].str.replace(',', '').astype(float)

In [46]:
# 对星级评分进行编码
star_rating_mapping = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
data['Star Rating'] = data['Star Rating'].map(star_rating_mapping)

In [47]:
# 文本预处理和特征提取
# 将书籍标题转换为小写，并去除标点符号
data['Title'] = data['Title'].str.lower().str.replace(r'[^\w\s]', '')

In [48]:
# 使用TF-IDF进行特征提取
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Title'])

In [49]:
# 准备标签
y = data['Star Rating']

In [50]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

In [51]:
# 使用随机森林回归模型
rf_regressor = RandomForestRegressor(random_state=42)

In [52]:
# 使用梯度提升回归模型
gb_regressor = GradientBoostingRegressor(random_state=42)

In [53]:
# 超参数调优
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

grid_search_rf = GridSearchCV(estimator=rf_regressor, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2)
grid_search_gb = GridSearchCV(estimator=gb_regressor, param_grid=param_grid_gb, cv=3, n_jobs=-1, verbose=2)

grid_search_rf.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(random_state=42),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.2],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [50, 100, 200]},
             verbose=2)

In [54]:
# 最佳模型
best_rf_regressor = grid_search_rf.best_estimator_
best_gb_regressor = grid_search_gb.best_estimator_

In [55]:
# 评估模型
y_pred_rf = best_rf_regressor.predict(X_test)
y_pred_gb = best_gb_regressor.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_gb = mean_squared_error(y_test, y_pred_gb)

print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Gradient Boosting Mean Squared Error: {mse_gb}')

Random Forest Mean Squared Error: 2.097388427002538
Gradient Boosting Mean Squared Error: 2.0685136275018046


In [56]:
# 选择性能最好的模型
if mse_rf < mse_gb:
    best_model = best_rf_regressor
    best_model_name = 'Random Forest'
    mse_best = mse_rf
else:
    best_model = best_gb_regressor
    best_model_name = 'Gradient Boosting'
    mse_best = mse_gb

print(f'Best Model: {best_model_name} with MSE: {mse_best}')

Best Model: Gradient Boosting with MSE: 2.0685136275018046


In [57]:
# 为第一个用户生成推荐
first_user_train_indices = X_train[y_train.index == 0].toarray()
cosine_similarities = cosine_similarity(first_user_train_indices, tfidf_matrix)
similar_indices = cosine_similarities.argsort().flatten()[-5:][::-1]
recommendations = data.iloc[similar_indices]['Title']
print(f'Recommendations for the first user: {recommendations}')

Recommendations for the first user: 0                     a light in the attic
340            all the light we cannot see
929             the light of the fireflies
327         catching jordan (hundred oaks)
339    angels & demons (robert langdon #1)
Name: Title, dtype: object


In [58]:
# 保存最佳模型
model_path = 'model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

model_path

'model.pkl'