# 实训二 基于回归分析的大学综合得分预测

## 崔敬然

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

data_df = pd.read_csv('C:/Users/Ran/Desktop/vscode_mycode/Training/2/cwurData.csv')  # 读入 csv 文件为 pandas 的 DataFrame
data_df.head(3).T  # 观察前几列并转置方便观察

data_df = data_df.dropna()  # 舍去包含 NaN 的 row
len(data_df)

feature_cols = ['quality_of_faculty', 'publications', 'citations', 'alumni_employment', 
                'influence', 'quality_of_education', 'broad_impact', 'patents']
X = data_df[feature_cols]
Y = data_df['score']

# 读取数据
data_df = pd.read_csv('C:/Users/Ran/Desktop/vscode_mycode/Training/2/cwurData.csv')

# 删除包含 NaN 的行
data_df = data_df.dropna()

# 选择特征列和目标列
feature_cols = ['quality_of_education', 'alumni_employment', 'quality_of_faculty', 
                'publications', 'influence', 'citations', 'broad_impact', 'patents']
X = data_df[feature_cols]
Y = data_df['score']

# 随机划分训练集和测试集，比例为 80% 训练集，20% 测试集
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 初始化线性回归模型
model = LinearRegression()

# 训练模型
model.fit(X_train_scaled, Y_train)

# 使用测试集进行预测
Y_pred = model.predict(X_test_scaled)

# 计算均方根误差 (RMSE)
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
print(f"测试集上的 RMSE: {rmse}")

# 输出线性回归模型的系数和截距
coefficients = pd.DataFrame(model.coef_, feature_cols, columns=['Coefficient'])
intercept = model.intercept_

print(f"模型的截距: {intercept}")
print("模型的系数：")
print(coefficients)

# 可视化各特征与目标变量之间的关系
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("特征间的相关性热力图")
plt.show()

sns.pairplot(data_df[['quality_of_education', 'alumni_employment', 'quality_of_faculty', 'publications', 'score']])
plt.show()

# 绘制真实值与预测值的对比
plt.figure(figsize=(8, 6))
plt.scatter(Y_test, Y_pred, alpha=0.5)
plt.plot([min(Y_test), max(Y_test)], [min(Y_test), max(Y_test)], color='red', linestyle='--')
plt.title('真实值与预测值的对比')
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.show()


测试集上的 RMSE: 3.2888875354885996
模型的截距: 47.25239375
模型的系数：
                      Coefficient
quality_of_education    -0.730797
alumni_employment       -1.261856
quality_of_faculty      -3.662068
publications             0.157062
influence                0.235541
citations               -0.022441
broad_impact            -0.783997
patents                 -0.635591


  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
  func(*args)
