In [1]:
# Load the CSV, examine the relationship between two of its columns,
# and compute their correlation.
import pandas as pd
import numpy as np

# Read the CSV file into a DataFrame. The path is relative, so the file must
# sit in the notebook's working directory.
df = pd.read_csv('aligned&cleaned.csv')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Target is the `change_ratio` column; the single feature is `net_inflow_rate`.
# `.to_numpy()` always yields a NumPy array (unlike `.values`, which can return
# an extension array), so the reshape to an (n_samples, 1) column is safe.
y = df['change_ratio'].to_numpy()
X = df['net_inflow_rate'].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Performance metrics computed on the held-out rows.
r2 = r2_score(y_test, y_pred)  # coefficient of determination (R²)
mse = mean_squared_error(y_test, y_pred)  # mean squared error
rmse = np.sqrt(mse)  # root mean squared error

# Report the metrics. MSE uses scientific notation and RMSE four decimals:
# with two fixed decimals a small MSE is displayed as 0.00, which hides it.
print(f'R²: {r2:.2f}')
print(f'MSE: {mse:.3e}')
print(f'RMSE: {rmse:.4f}')


R²: 0.25
MSE: 0.00
RMSE: 0.02


In [5]:
# Report the fitted line's parameters. This relies on `model` having already
# been fit in an earlier cell.
print(
    f'回归系数（斜率）: {model.coef_[0]}',
    f'截距（常数项）: {model.intercept_}',
    sep='\n',
)

回归系数（斜率）: 0.07694546236471675
截距（常数项）: 0.0036171471605092897


In [7]:
import statsmodels.api as sm

# Add a constant column to the training features so that OLS also estimates
# an intercept term.
X_sm = sm.add_constant(X_train)

# Fit the same regression with statsmodels on the training split.
model_sm = sm.OLS(endog=y_train, exog=X_sm).fit()

# Print the fit summary (coefficients, intercept, confidence intervals, ...).
print(model_sm.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                 3.011e+05
Date:                Tue, 26 Mar 2024   Prob (F-statistic):               0.00
Time:                        16:38:22   Log-Likelihood:             2.1833e+06
No. Observations:              895787   AIC:                        -4.367e+06
Df Residuals:                  895785   BIC:                        -4.367e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0036   2.31e-05    156.550      0.0

In [10]:
from scipy.stats import spearmanr

In [11]:
# Spearman rank correlation between the feature column `X` and the target `y`
# built in the regression cell.
coef, p = spearmanr(X, y)

print(
    f"Spearman correlation coefficient: {coef:.3f}",
    f"P-value: {p:.3f}",
    sep="\n",
)

Spearman correlation coefficient: 0.690
P-value: 0.000
