In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training and test sets from their hard-coded local CSV paths.
train_df = pd.read_csv(r'E:\HuaweiMoveData\Users\liuzhihan\Desktop\各学科排名模型训练集.csv')
test_df = pd.read_csv(r'E:\HuaweiMoveData\Users\liuzhihan\Desktop\各学科排名模型测试集.csv')

# Predictor columns and regression target; identical for every discipline.
features = ['Cites', 'Top_Papers', 'Cites_Per_Paper']
target = 'rank_position'

# Fit one linear regression per discipline found in the training set and
# score it on the test rows of the same discipline.
results = []
for discipline in train_df['discipline'].unique():
    train_sub = train_df.loc[train_df['discipline'] == discipline]
    test_sub = test_df.loc[test_df['discipline'] == discipline]

    # Only model disciplines with at least 5 training rows and 2 test rows.
    if not (len(train_sub) >= 5 and len(test_sub) >= 2):
        continue

    model = LinearRegression().fit(train_sub[features], train_sub[target])
    y_test = test_sub[target]
    y_pred = model.predict(test_sub[features])

    # coef_ follows the column order of `features`.
    cites_coef, top_papers_coef, cites_per_paper_coef = model.coef_
    results.append({
        '学科': discipline,
        '平均绝对误差': mean_absolute_error(y_test, y_pred),
        '决定系数': r2_score(y_test, y_pred),
        'Cites系数': cites_coef,
        'Top_Papers系数': top_papers_coef,
        'Cites_Per_Paper系数': cites_per_paper_coef,
    })

result_df = pd.DataFrame(results)

# Show every column on wide lines with 4-digit precision when printing.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.precision', 4),
):
    pd.set_option(option_name, option_value)

print("各学科建模结果：")
print(result_df)

def train_and_predict_single_discipline(discipline, cites, top_papers, cites_per_paper):
    """Fit a linear regression for one discipline and predict a rank for new indicators.

    The model is trained on the rows of the module-level ``train_df`` whose
    ``discipline`` column equals ``discipline``, using ``Cites``,
    ``Top_Papers`` and ``Cites_Per_Paper`` to predict ``rank_position``.

    Args:
        discipline: Value to match against ``train_df['discipline']``.
        cites: ``Cites`` value of the new observation.
        top_papers: ``Top_Papers`` value of the new observation.
        cites_per_paper: ``Cites_Per_Paper`` value of the new observation.

    Returns:
        A formatted message containing the predicted rank with two decimals.
        Predictions below 1 are clamped to 1.

    Raises:
        ValueError: If ``train_df`` has no rows for ``discipline``.
    """
    features = ['Cites', 'Top_Papers', 'Cites_Per_Paper']
    target = 'rank_position'

    train_sub = train_df[train_df['discipline'] == discipline]
    # Fail early with a clear message instead of letting the estimator reject
    # an empty training frame with a generic validation error.
    if train_sub.empty:
        raise ValueError(f"No training rows found for discipline {discipline!r}")

    X_train = train_sub[features]
    y_train = train_sub[target]

    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Single-row frame with the same column names/order used for training.
    new_data = pd.DataFrame({
        'Cites': [cites],
        'Top_Papers': [top_papers],
        'Cites_Per_Paper': [cites_per_paper]
    })
    pred_rank = model.predict(new_data)[0]
    # A linear model can extrapolate below the best possible rank; clamp to 1.
    pred_rank = max(pred_rank, 1)
    return f"{discipline} 学科新数据预测排名：{pred_rank:.2f}"

def predict_single_discipline(discipline, cites, top_papers, cites_per_paper, bin_width=100):
    """Predict the rank bucket of a new observation with a random-forest classifier.

    Training rows are the rows of the module-level ``train_df`` whose
    ``discipline`` column equals ``discipline``. Their ``rank_position`` is
    bucketed into consecutive right-closed intervals of ``bin_width`` ranks
    (label ``k`` covers ranks ``k*bin_width+1`` .. ``(k+1)*bin_width``), and a
    classifier is fitted on ``Cites``, ``Top_Papers`` and ``Cites_Per_Paper``.

    Args:
        discipline: Value to match against ``train_df['discipline']``.
        cites: ``Cites`` value of the new observation.
        top_papers: ``Top_Papers`` value of the new observation.
        cites_per_paper: ``Cites_Per_Paper`` value of the new observation.
        bin_width: Positive integer number of ranks per bucket (default 100).

    Returns:
        A formatted message containing the predicted rank interval.

    Raises:
        ValueError: If ``bin_width`` is not positive, if ``train_df`` has no
            rows for ``discipline``, or if none of those rows has a positive
            ``rank_position``.
    """
    import math

    if bin_width <= 0:
        raise ValueError(f"bin_width must be a positive integer, got {bin_width!r}")

    features = ['Cites', 'Top_Papers', 'Cites_Per_Paper']

    train_sub = train_df[train_df['discipline'] == discipline].copy()
    if train_sub.empty:
        raise ValueError(f"No training rows found for discipline {discipline!r}")

    max_rank = train_sub['rank_position'].max()
    if pd.isna(max_rank) or max_rank <= 0:
        raise ValueError(
            f"Discipline {discipline!r} has no positive rank_position values to train on"
        )

    # Size the bins to the largest observed rank. A fixed upper edge of 1000
    # would turn every rank above it into a NaN label and break the fit.
    top_edge = bin_width * math.ceil(max_rank / bin_width)
    train_sub['rank_label'] = pd.cut(
        train_sub['rank_position'],
        bins=range(0, top_edge + 1, bin_width),
        labels=False,
    )
    # Rows with a missing or non-positive rank fall outside every bin; drop
    # them so the classifier only sees valid integer labels.
    train_sub = train_sub.dropna(subset=['rank_label'])
    X_train = train_sub[features]
    y_train = train_sub['rank_label'].astype(int)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Single-row frame with the same column names/order used for training.
    new_data = pd.DataFrame({
        'Cites': [cites],
        'Top_Papers': [top_papers],
        'Cites_Per_Paper': [cites_per_paper]
    })
    pred_label = int(model.predict(new_data)[0])
    range_start = pred_label * bin_width + 1
    range_end = (pred_label + 1) * bin_width
    pred_rank_range = f"第{range_start}-{range_end}名"
    return f"{discipline} 学科新数据预测排名区间：{pred_rank_range}"

# Usage example: run both predictors on the same sample observation.
print("\n新数据预测示例：")
example_args = ('agricultural_sciences', 10000, 22, 30.0)
print(train_and_predict_single_discipline(*example_args))
print(predict_single_discipline(*example_args))

各学科建模结果：
                            学科     平均绝对误差      决定系数     Cites系数  Top_Papers系数  Cites_Per_Paper系数
0        agricultural_sciences   684.1194  -84.3530 -5.5016e-03       -0.0403             0.4911
1         biology_biochemistry   825.7990  -85.7272 -4.9412e-03        3.3060             0.0099
2                    chemistry  1113.7044  -96.1986 -2.9601e-03        2.0138            -9.0949
3            clinical_medicine  2967.4275 -123.1398 -1.1038e-02        7.2902             0.2316
4             computer_science    85.9606  -58.2177 -9.5853e-04        0.1364             0.0017
5           economics_business    98.9452  -73.4179 -1.0903e-03        0.4187            -0.0540
6                  engineering  1412.5570 -102.7984 -4.2799e-03        0.7433            -3.2478
7          environment_ecology  1063.1690  -92.2686 -1.1749e-03       -2.6787            -0.5859
8                  geosciences   607.5368  -85.7818 -5.4607e-04       -1.1334            -0.7694
9                   i