## 模型训练

In [100]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# 1. Load the tag matrix; the first CSV column (the title) becomes the row index.
df = pd.read_csv('train.csv', index_col=0, encoding='utf-8')

# 2. index_col=0 already consumed the title column, so every remaining column
#    is a feature (iloc[:, 0:] selects all of them, not "all but the first").
X = df.iloc[:, 0:]

# 3. Random forest whose impurity-based feature importances are read out below.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 4. There is no real label at this stage, so a proxy target is used: the sum of
#    each row's feature values. This target is deterministic, not random, and the
#    classifier treats every distinct sum as its own class.
#    NOTE(review): the resulting importances only describe how well each feature
#    explains that row sum; they are not a general measure of feature relevance.
#    The shuffle is seeded so that re-running the cell reproduces the same forest
#    (row order affects which rows each bootstrap sample draws).
X_train = X.sample(frac=1, random_state=42)
y_train = X_train.sum(axis=1)
model.fit(X_train, y_train)

# 5. Map every feature name to its importance and show the result.
importances = model.feature_importances_
feature_names = X.columns.tolist()
importance_dict = dict(zip(feature_names, importances))
print(importance_dict)

{'恋爱': 0.004840875848311756, '奇幻': 0.007408901740408985, '战斗': 0.009952864761817427, '漫画改': 0.004190832435842426, '美食': 0.0017751828018150312, '冒险': 0.009368666432201191, '搞笑': 0.00573502060912221, '治愈': 0.0021199655943203084, '科幻': 0.007022499017157177, '日常': 0.007639105277787348, '热血': 0.005251253470023959, '推理': 0.013462509088411685, '校园': 0.006886650285842861, '架空': 0.004196605343233361, '运动': 0.003720899476446856, '音乐': 0.0008423045362925913, '游戏改': 0.003606953198902026, '智斗': 0.002164763589202018, '穿越': 0.0031774742574760205, '催泪': 0.002746415810048269, '少女': 0.008195403229769128, '社团': 0.001864976025893963, '励志': 0.0008333706839639961, '原创': 0.009416558810393182, '少儿': 0.0005402829106343945, '萌系': 0.004486907801240542, '魔法': 0.0073042841759480845, '小说改': 0.008451590560642503, '动画': 0.0065432002064243575, '爱情': 0.01204767211938605, '剧情': 0.004828312225749179, '悬疑': 0.001065317205782172, '时泪': 0.002349020322684527, '神魔': 0.0014087793213625852, '职场': 0.00038039667103295926, '未来': 0

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score
import numpy as np

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import label_binarize

# Load the tag matrix; the first CSV column (the anime title) becomes the index.
df = pd.read_csv('train.csv', index_col=0, encoding='utf-8')

# Every remaining column is a feature; the title itself is the class label.
X = df.iloc[:, :]
y = df.index

# No hold-out split: the model is trained on, and later evaluated against, all rows.
X_train, y_train = X, y

# One-hot label matrix, used below for the per-class AUC report.
y_train_bin = label_binarize(y_train, classes=np.unique(y_train))

# Built-in one-vs-rest ROC-AUC scorer (probability based). Referring to it by name
# avoids make_scorer(..., needs_proba=True), whose keyword is deprecated/removed
# in recent scikit-learn releases.
auc_scorer = 'roc_auc_ovr'

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'criterion': ['gini', 'entropy']
}
model = RandomForestClassifier(random_state=42)

# NOTE(review): if every title occurs in exactly one row (the recorded output of
# this cell suggests so), each validation fold contains only classes that are
# absent from its training fold, so this CV score cannot meaningfully rank the
# candidates - confirm, and consider a similarity-based recommender instead.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(n_splits=2, shuffle=True, random_state=42),
    scoring=auc_scorer,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# GridSearchCV refits the winning configuration on the full data (refit=True is
# the default), so best_estimator_ is already trained - no second fit is needed.
best_model = grid_search.best_estimator_

# Accuracy measured on the training rows themselves (not a generalisation estimate).
y_pred = best_model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class one-vs-rest AUC, also on the training rows.
y_pred_proba = best_model.predict_proba(X_train)
roc_auc = roc_auc_score(y_train_bin, y_pred_proba, average=None)
print(f'AUC Scores for Each Class: {roc_auc}')

# Use row 3 of the training data as a sample preference vector. The explicit copy
# keeps any later in-place edits of the vector from writing through into X.
user_preferences = X.iloc[3].to_numpy(copy=True)

# Print the vector to confirm its contents.
print(user_preferences)

# Re-attach the column names so the query matches the features the model was
# fitted with (a bare array triggers scikit-learn's feature-name warning).
user_frame = pd.DataFrame([user_preferences], columns=X.columns)
similarity_scores = best_model.predict_proba(user_frame)  # class probabilities
recommendations = best_model.classes_[np.argmax(similarity_scores, axis=1)]  # most likely title
print(f'Recommended anime: {recommendations}')


Fitting 2 folds for each of 24 candidates, totalling 48 fits
Best parameters found:  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.99
AUC Scores for Each Class: [1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.99295775 0.99295775 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]
[0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 



In [102]:
from joblib import dump, load
# Persist the tuned classifier to disk; the call's return value is the cell output.
dump(value=best_model, filename='model.joblib')

['model.joblib']

In [3]:
from joblib import dump, load
import numpy as np
import pandas as pd  # local import: this cell otherwise only imports joblib and numpy

# Load the previously saved model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only
# load model files that you created yourself or otherwise trust.
model = load('model.joblib')

# Column positions of the features switched on for this sample query.
# NOTE(review): going by the column order printed by the feature-importance cell,
# these are presumably 恋爱, 漫画改, 搞笑 and 校园 - verify against train.csv.
LIKED_FEATURE_POSITIONS = [0, 3, 6, 12]
# Number of highest-probability classes to report.
TOP_K = 5

# Size the query from the fitted model rather than hard-coding the feature count,
# so the cell keeps working when the training columns change.
X_test = np.zeros((1, model.n_features_in_))
X_test[0, LIKED_FEATURE_POSITIONS] = 1

# If the model was fitted on a DataFrame, pass the same column names back to avoid
# scikit-learn's "X does not have valid feature names" warning.
feature_names = getattr(model, 'feature_names_in_', None)
if feature_names is not None:
    X_test = pd.DataFrame(X_test, columns=feature_names)

# Predict the single most likely class with the loaded model.
predictions = model.predict(X_test)
print(predictions[0])

probabilities = model.predict_proba(X_test)

# Indices of the TOP_K most probable classes per row, in ascending probability
# order (the most likely class is last).
top_indices = np.argsort(probabilities, axis=1)[:, -TOP_K:]

# Pair the class names with their probabilities for every query row.
top_classes_and_probs = [
    (list(model.classes_[idx]), probabilities[i, idx])
    for i, idx in enumerate(top_indices)
]

print(top_classes_and_probs)
for anime_list, _ in top_classes_and_probs:
    # The last entry is the most probable class because of the ascending sort.
    print(anime_list[-1])

辉夜大小姐想让我告白 -究极浪漫-
[(['擅长捉弄的高木同学', '徒然喜欢你', '理科生坠入情网，故尝试证明。', '女高中生的虚度日常', '辉夜大小姐想让我告白 -究极浪漫-'], array([0.05093458, 0.0531147 , 0.06512866, 0.06539768, 0.15221124]))]
辉夜大小姐想让我告白 -究极浪漫-


