In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import Huber
import os
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from scipy.stats import spearmanr
import tensorflow as tf


# Folder that holds one CSV export per discipline. The file name (without its
# extension) is turned into the discipline label used as the dictionary key.
folder_path = r"E:\HuaweiMoveData\Users\liuzhihan\Desktop\download\download"

# Maps discipline label -> {"train": DataFrame, "test": DataFrame}.
all_disciplines = {}

# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which disciplines are loaded (and later trained/reported) reproducible.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)

    # Compare the extension case-insensitively so that "X.CSV" is not skipped.
    if extension.lower() != ".csv":
        continue

    # splitext() only strips the final extension, so a name that itself
    # contains dots is kept whole instead of being cut at the first dot
    # (which could make two different files collide on the same key).
    discipline = stem.replace("_", " ").title()
    file_path = os.path.join(folder_path, filename)

    # header=1 uses the second line of the file as the header row and
    # skipfooter=1 drops the last line; skipfooter requires the python engine.
    df = pd.read_csv(file_path, encoding='iso-8859-1', header=1, skipfooter=1, engine='python')

    # Keep the columns at positions 0, 1, 3, 4 and 5 and give them fixed names.
    df = df.iloc[:, [0, 1, 3, 4, 5]]
    df.columns = ["Rank", "Institutions", "Cites", "Cites_Per_Paper", "Top_Papers"]
    df["rank_position"] = df["Rank"]
    df["discipline"] = discipline

    # Shuffle with a fixed seed so the 80/20 split below is repeatable.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    train_size = int(len(df) * 0.8)
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    all_disciplines[discipline] = {
        "train": train_df,
        "test": test_df,
    }

# Per-discipline artefacts, all keyed by the discipline label:
# the fitted Keras model, the fitted feature scaler and the test-set metrics.
discipline_models, discipline_scalers, discipline_metrics = {}, {}, {}
os.makedirs("./models", exist_ok=True)

for discipline, data in all_disciplines.items():
    train_df, test_df = data["train"], data["test"]

    # Disciplines with fewer than 20 training rows are not trained at all.
    if len(train_df) < 20:
        print(f"学科 {discipline} 训练数据不足20条，跳过训练")
        continue

    # Three input features; the regression target is the rank position.
    feature_columns = ["Cites", "Top_Papers", "Cites_Per_Paper"]
    X_train = train_df[feature_columns].values
    y_train = train_df["rank_position"].values
    X_test = test_df[feature_columns].values
    y_test = test_df["rank_position"].values

    # The scaler is fitted on the training rows only and reused for the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    has_test_rows = len(X_test) > 0
    if has_test_rows:
        X_test_scaled = scaler.transform(X_test)
    else:
        X_test_scaled = np.array([])

    # Network size depends on the amount of training data: one hidden layer
    # of 32 units below 100 rows, otherwise two hidden layers (128 then 64).
    # Every hidden layer is followed by 20% dropout.
    n_samples = len(train_df)
    hidden_units = (32,) if n_samples < 100 else (128, 64)

    inputs = Input(shape=(3,))
    x = inputs
    for units in hidden_units:
        x = Dense(units, activation="relu")(x)
        x = Dropout(0.2)(x)
    outputs = Dense(1)(x)
    model = Model(inputs=inputs, outputs=outputs)

    model.compile(optimizer="adam", loss=Huber(delta=1.0), metrics=["mae"])

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen so far.
    callbacks = [EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)]

    # Train the model (20% of the training rows are held out for validation).
    model.fit(
        X_train_scaled,
        y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=0,
    )

    # Evaluate the model on the held-out test rows, if there are any.
    if not has_test_rows:
        metrics = {}
        print(f"学科 {discipline} 无测试数据，跳过评估")
    else:
        y_pred = model.predict(X_test_scaled).flatten()
        mse = mean_squared_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # When either side has a single unique value the rank correlation is
        # recorded as NaN instead of being computed.
        is_constant = len(np.unique(y_test)) == 1 or len(np.unique(y_pred)) == 1
        spearman = np.nan if is_constant else spearmanr(y_test, y_pred)[0]

        metrics = {"MSE": mse, "MAPE": mape, "Spearman": spearman}
        print(f"\n学科 {discipline} 测试集评估：")
        print(f"均方误差（MSE）：{mse:.4f}")
        print(f"平均绝对百分比误差（MAPE）：{mape:.4f}")
        print(f"斯皮尔曼相关系数：{spearman:.4f}")

    # Persist the model to disk and keep everything in memory for predict_rank.
    model.save(f"./models/{discipline}_model.keras")
    discipline_models[discipline] = model
    discipline_scalers[discipline] = scaler
    discipline_metrics[discipline] = metrics

# Turn the {discipline: {metric: value}} mapping into one row per discipline,
# expose the discipline label as a regular column, and write it to Excel.
metrics_df = (
    pd.DataFrame(discipline_metrics)
    .T
    .reset_index()
    .rename(columns={"index": "discipline"})
)
metrics_df.to_excel("各学科模型评估结果.xlsx", index=False)

def predict_rank(discipline, cites, top_papers, cites_per_paper):
    """Predict the rank position of an institution within one discipline.

    Args:
        discipline: Discipline label; must be a key of ``discipline_models``.
        cites: Value for the ``Cites`` feature.
        top_papers: Value for the ``Top_Papers`` feature.
        cites_per_paper: Value for the ``Cites_Per_Paper`` feature.

    Returns:
        The predicted rank as a built-in ``int``, rounded to the nearest
        integer and never smaller than 1. If no model was trained for
        ``discipline``, a message string is returned instead.
    """
    if discipline not in discipline_models:
        return f"学科 {discipline} 无可用模型"

    model = discipline_models[discipline]
    scaler = discipline_scalers[discipline]

    # Feature order must match the order used when fitting the scaler and the
    # model: Cites, Top_Papers, Cites_Per_Paper.
    new_data = np.array([[cites, top_papers, cites_per_paper]], dtype=float)
    new_data_scaled = scaler.transform(new_data)

    # verbose=0 keeps the Keras progress bar out of the caller's output.
    # The prediction is a NumPy scalar; convert it to a Python float first so
    # that the rounded result is a plain int regardless of the NumPy version.
    pred_rank = float(model.predict(new_data_scaled, verbose=0)[0][0])
    return max(int(round(pred_rank)), 1)

# Prediction examples: (discipline, Cites, Top_Papers, Cites_Per_Paper).
print("\n预测示例:")
example_inputs = (
    ("Engineering", 5000, 20, 15),
    ("Clinical Medicine", 10000, 30, 20),
)
for ex_discipline, ex_cites, ex_top_papers, ex_cpp in example_inputs:
    print(f"{ex_discipline} - Cites={ex_cites}, Top_Papers={ex_top_papers}, Cites_Per_Paper={ex_cpp} → 预测排名：{predict_rank(ex_discipline, ex_cites, ex_top_papers, ex_cpp)}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

学科 Agricultural Sciences 测试集评估：
均方误差（MSE）：10086.4434
平均绝对百分比误差（MAPE）：0.1045
斯皮尔曼相关系数：0.9776
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

学科 Biology & Biochemistry 测试集评估：
均方误差（MSE）：19588.0879
平均绝对百分比误差（MAPE）：0.1114
斯皮尔曼相关系数：0.9696
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

学科 Chemistry 测试集评估：
均方误差（MSE）：56434.7930
平均绝对百分比误差（MAPE）：0.0866
斯皮尔曼相关系数：0.9876
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

学科 Clinical Medicine 测试集评估：
均方误差（MSE）：239406.9531
平均绝对百分比误差（MAPE）：0.1203
斯皮尔曼相关系数：0.9797
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 

学科 Computer Science 测试集评估：
均方误差（MSE）：4349.5742
平均绝对百分比误差（MAPE）：0.1261
斯皮尔曼相关系数：0.9790
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 

学科 Economics & Business 测试集评估：
均方误差（MSE）：1095.8883
平均绝对百分比误差（MAPE）：0.1122
斯皮尔曼相关系数：0.9964
[1m18/18[0m [32m━━━━━━━━