In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# 加载数据集
data = pd.read_csv('bookstoscrape.csv')

In [3]:
# 数据预处理

# 检查数据中是否存在重复项和空值
num_duplicates = data.duplicated().sum()
num_missing_values = data.isnull().sum().sum()

In [4]:
# 重新检查价格列的数据类型
price_data_type = data['Price'].dtype

# 如果价格不是数值类型，则进行转换
if not pd.api.types.is_numeric_dtype(price_data_type):
    data['Price'] = data['Price'].str.replace(',', '').astype(float)

# 再次展示数据集的前几行以确认转换结果
data.head()


Unnamed: 0,Title,Price,Star Rating
0,A Light in the Attic,51.77,Three
1,Tipping the Velvet,53.74,One
2,Soumission,50.1,One
3,Sharp Objects,47.82,Four
4,Sapiens: A Brief History of Humankind,54.23,Five


In [5]:
# 对星级评分进行编码
star_rating_mapping = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
data['Star Rating'] = data['Star Rating'].map(star_rating_mapping)

num_duplicates, num_missing_values, data.head()

(0,
 0,
                                    Title  Price  Star Rating
 0                   A Light in the Attic  51.77            3
 1                     Tipping the Velvet  53.74            1
 2                             Soumission  50.10            1
 3                          Sharp Objects  47.82            4
 4  Sapiens: A Brief History of Humankind  54.23            5)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
# 文本预处理和特征提取
# 将书籍标题转换为小写，并去除标点符号
data['Title'] = data['Title'].str.lower().str.replace(r'[^\w\s]', '')

In [8]:
# 使用TF-IDF进行特征提取
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Title'])

In [9]:
# 展示TF-IDF模型的一些结果
tfidf_matrix.shape, tfidf_vectorizer.get_feature_names_out()[:10]

((1000, 500),
 array(['000', '01', '10', '11', '125', '14', '20', '40', '60',
        'acceptance'], dtype=object))

In [10]:
# 准备特征和标签
y = data['Star Rating']

In [11]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

In [12]:
# 训练线性回归模型
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

In [13]:
# 评估模型
y_pred = linear_regression.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 7.644686207203211


In [14]:
# 为第一个用户生成推荐
first_user_train_indices = X_train[y_train.index == 0].toarray()
cosine_similarities = cosine_similarity(first_user_train_indices, tfidf_matrix)
similar_indices = cosine_similarities.argsort().flatten()[-5:][::-1]
recommendations = data.iloc[similar_indices]['Title']
print(f'Recommendations for the first user: {recommendations}')

Recommendations for the first user: 0                     a light in the attic
340            all the light we cannot see
929             the light of the fireflies
327         catching jordan (hundred oaks)
339    angels & demons (robert langdon #1)
Name: Title, dtype: object


In [16]:
import pickle

# 保存模型
model_path = 'model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(linear_regression, file)

model_path


'model.pkl'