In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [5]:
data = pd.DataFrame(data={'MovieName': ['功夫熊猫', '叶问3', '二次曝光', '代理情人', 
                                      '新步步惊心', '谍影重重', '美人鱼', '宝贝当家', '唐人街探案'],
                        'ComedyScene': [39, 3, 2, 9, 8, 5, 21, 45, 23], 
                        'hugScene': [0, 2, 3, 38, 34, 2, 17, 2, 3],
                        'fightScene': [31, 65, 55, 2, 17, 57, 5, 9, 17],
                        'type': ['喜剧片', '动作片', '爱情片', '爱情片', 
                                 '爱情片', '动作片', '喜剧片', '喜剧片', np.nan]},
                        columns=["MovieName","ComedyScene","hugScene","fightScene","type"])
data

Unnamed: 0,MovieName,ComedyScene,hugScene,fightScene,type
0,功夫熊猫,39,0,31,喜剧片
1,叶问3,3,2,65,动作片
2,二次曝光,2,3,55,爱情片
3,代理情人,9,38,2,爱情片
4,新步步惊心,8,34,17,爱情片
5,谍影重重,5,2,57,动作片
6,美人鱼,21,17,5,喜剧片
7,宝贝当家,45,2,9,喜剧片
8,唐人街探案,23,3,17,


In [32]:
# 数据量小，且为多分类任务，选择逻辑回归更好，可解释性也强
# 先划分特征和目标
X_train = data[data['type'].notna()][['ComedyScene', 'hugScene', 'fightScene']]
y_train = data[data['type'].notna()]['type'] # data.iloc[:8,:]
X_target = data[data['type'].isna()][['ComedyScene', 'hugScene', 'fightScene']]

In [33]:
# 所有特征标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_target_scaled = scaler.transform(X_target)

In [34]:
# 逻辑回归模型
model = LogisticRegression(multi_class='ovr')
model.fit(X_train_scaled, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [37]:
# 预测结果
pred = model.predict(X_target_scaled)
pred_proba = model.predict_proba(X_target_scaled)
pred

array(['喜剧片'], dtype=object)

In [39]:
print(f"《唐人街探案》预测类型：{pred[0]}")
print(f"各类别概率：{dict(zip(model.classes_, pred_proba[0]))}")

《唐人街探案》预测类型：喜剧片
各类别概率：{'动作片': np.float64(0.13530647769469467), '喜剧片': np.float64(0.6619269035529736), '爱情片': np.float64(0.2027666187523317)}


In [40]:
y_pred = model.predict(X_train_scaled)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

         动作片       0.67      1.00      0.80         2
         喜剧片       1.00      1.00      1.00         3
         爱情片       1.00      0.67      0.80         3

    accuracy                           0.88         8
   macro avg       0.89      0.89      0.87         8
weighted avg       0.92      0.88      0.88         8

