In [1]:
import pandas as pd

# 读取CSV文件
file_path = "climate_new/static_filtered.csv"  # 替换为你的CSV文件路径
data = pd.read_csv(file_path)

# 读取表头为 'STRAHLER_MAX' 的那一列
strahler_max = data['STRAHLER_MAX']
drain_sqkm = data['DRAIN_SQKM']

# 分别统计不同区间的数据数量
total_count = len(strahler_max)
group_1 = data[strahler_max <= 3]
group_2 = data[(strahler_max > 3) & (strahler_max <= 6)]
group_3 = data[strahler_max > 6]

# 计算数量和比例
less_or_equal_3 = len(group_1)
greater_3_and_less_or_equal_6 = len(group_2)
greater_than_6 = len(group_3)

less_or_equal_3_ratio = less_or_equal_3 / total_count
greater_3_and_less_or_equal_6_ratio = greater_3_and_less_or_equal_6 / total_count
greater_than_6_ratio = greater_than_6 / total_count

# 计算 DRAIN_SQKM 的平均数、中位数和标准差
group_1_mean = group_1['DRAIN_SQKM'].mean()
group_1_median = group_1['DRAIN_SQKM'].median()
group_1_std = group_1['DRAIN_SQKM'].std()

group_2_mean = group_2['DRAIN_SQKM'].mean()
group_2_median = group_2['DRAIN_SQKM'].median()
group_2_std = group_2['DRAIN_SQKM'].std()

group_3_mean = group_3['DRAIN_SQKM'].mean()
group_3_median = group_3['DRAIN_SQKM'].median()
group_3_std = group_3['DRAIN_SQKM'].std()

# 输出结果
print(f"总数量: {total_count}")

print(f"小于等于3的数量: {less_or_equal_3}，比例: {less_or_equal_3_ratio:.2%}")
print(f"  - DRAIN_SQKM 平均数: {group_1_mean:.2f}, 中位数: {group_1_median:.2f}, 标准差: {group_1_std:.2f}")

print(f"大于3但小于等于6的数量: {greater_3_and_less_or_equal_6}，比例: {greater_3_and_less_or_equal_6_ratio:.2%}")
print(f"  - DRAIN_SQKM 平均数: {group_2_mean:.2f}, 中位数: {group_2_median:.2f}, 标准差: {group_2_std:.2f}")

print(f"大于6的数量: {greater_than_6}，比例: {greater_than_6_ratio:.2%}")
print(f"  - DRAIN_SQKM 平均数: {group_3_mean:.2f}, 中位数: {group_3_median:.2f}, 标准差: {group_3_std:.2f}")


总数量: 482
小于等于3的数量: 126，比例: 26.14%
  - DRAIN_SQKM 平均数: 89.03, 中位数: 53.85, 标准差: 108.54
大于3但小于等于6的数量: 280，比例: 58.09%
  - DRAIN_SQKM 平均数: 3224.07, 中位数: 1313.15, 标准差: 5474.93
大于6的数量: 76，比例: 15.77%
  - DRAIN_SQKM 平均数: 20520.39, 中位数: 17014.80, 标准差: 13062.61


In [None]:
import os
import pandas as pd

# 文件夹路径
folder_path = "climate_new"  # 替换为你的文件夹路径

# 定义存储站点结果的字典
station_results = {}

# 遍历文件夹中的每个站点文件
for filename in os.listdir(folder_path):
    if filename.endswith(".csv") and filename != "static_filtered.csv":  # 确保只处理 CSV 文件并排除特定文件
        file_path = os.path.join(folder_path, filename)

        # 读取站点数据
        data = pd.read_csv(file_path)

        # 确定 'date' 列为第一列，检索 '00060' 列
        data = data.rename(columns={data.columns[0]: 'date'})
        target_column = 'pr'

        if target_column in data.columns:
            # 将 'date' 转换为日期，将 '00060' 转换为数值
            data['date'] = pd.to_datetime(data['date'], errors='coerce')
            data[target_column] = pd.to_numeric(data[target_column], errors='coerce')

            # 移除无效行
            data = data.dropna(subset=['date', target_column])

            # 提取年份并按年份计算年度总和
            data['year'] = data['date'].dt.year
            yearly_sum = data.groupby('year')[target_column].sum()

            if not yearly_sum.empty:  # 确保有值后再计算
                # 计算年度平均值
                average_value = yearly_sum.mean()
                used_years = list(yearly_sum.index)

                # 保存结果
                station_name = filename.replace(".csv", "")
                station_results[station_name] = {
                    'Average_Value': average_value,
                    'Used_Years': used_years
                }
        else:
            print(f"文件 {filename} 缺少 '{target_column}' 列。")

# 计算所有站点年度平均值的范围
if station_results:
    average_values = [stats['Average_Value'] for stats in station_results.values()]
    min_average_value = min(average_values)
    max_average_value = max(average_values)
    mean_average_value = sum(average_values) / len(average_values)
    median_average_value = sorted(average_values)[len(average_values) // 2]

    # 找到最小值和最大值对应的站点
    min_station = [station for station, stats in station_results.items() if stats['Average_Value'] == min_average_value][0]
    max_station = [station for station, stats in station_results.items() if stats['Average_Value'] == max_average_value][0]

    # 将结果打印到控制台
    print(f"482个站年度平均值的范围:")
    print(f"最小值: {min_average_value:.2f} (站点: {min_station})")
    print(f"最大值: {max_average_value:.2f} (站点: {max_station})")
    print(f"所有站点年度平均值的均值: {mean_average_value:.2f}")
    print(f"所有站点年度平均值的中位数: {median_average_value:.2f}\n")

    # 打印每个站的年度平均值和使用的年份
    print("每个站的年度平均值及使用的年份:")
    for station, stats in station_results.items():
        print(f"站点: {station}, 年度平均值: {stats['Average_Value']:.2f}, 使用年份: {stats['Used_Years']}")
else:
    print("未找到有效的站点数据。")


In [6]:
import os
import pandas as pd

# 文件夹路径
folder_path = "climate_new"  # 替换为你的文件夹路径

# 定义存储站点结果的字典
station_results = {}

# 遍历文件夹中的每个站点文件
for filename in os.listdir(folder_path):
    if filename.endswith(".csv") and filename != "static_filtered.csv":  # 确保只处理 CSV 文件并排除特定文件
        file_path = os.path.join(folder_path, filename)

        # 读取站点数据
        data = pd.read_csv(file_path)

        # 确定 'date' 列为第一列，检索 '00060' 列
        data = data.rename(columns={data.columns[0]: 'date'})
        target_column = 'runoff'

        if target_column in data.columns:
            # 将 'date' 转换为日期，将 '00060' 转换为数值
            data['date'] = pd.to_datetime(data['date'], errors='coerce')
            data[target_column] = pd.to_numeric(data[target_column], errors='coerce')

            # 移除无效行
            data = data.dropna(subset=['date', target_column])

            # 提取年份并按年份计算年度总和
            data['year'] = data['date'].dt.year 
            yearly_sum = data.groupby('year')[target_column].sum() / 365 * 1000

            if not yearly_sum.empty:  # 确保有值后再计算
                # 计算年度平均值
                average_value = yearly_sum.mean()
                used_years = list(yearly_sum.index)

                # 保存结果
                station_name = filename.replace(".csv", "")
                station_results[station_name] = {
                    'Average_Value': average_value,
                    'Used_Years': used_years
                }
        else:
            print(f"文件 {filename} 缺少 '{target_column}' 列。")

# 计算所有站点年度平均值的范围
if station_results:
    average_values = [stats['Average_Value'] for stats in station_results.values()]
    min_average_value = min(average_values)
    max_average_value = max(average_values)
    mean_average_value = sum(average_values) / len(average_values)
    median_average_value = sorted(average_values)[len(average_values) // 2]

    # 找到最小值和最大值对应的站点
    min_station = [station for station, stats in station_results.items() if stats['Average_Value'] == min_average_value][0]
    max_station = [station for station, stats in station_results.items() if stats['Average_Value'] == max_average_value][0]

    # 将结果打印到控制台
    print(f"482个站年度平均值的范围:")
    print(f"最小值: {min_average_value:.4f} (站点: {min_station})")
    print(f"最大值: {max_average_value:.4f} (站点: {max_station})")
    print(f"所有站点年度平均值的均值: {mean_average_value:.4f}")
    print(f"所有站点年度平均值的中位数: {median_average_value:.4f}\n")

    # 打印每个站的年度平均值和使用的年份
    print("每个站的年度平均值及使用的年份:")
    for station, stats in station_results.items():
        print(f"站点: {station}, 年度平均值: {stats['Average_Value']:.2f}, 使用年份: {stats['Used_Years']}")
else:
    print("未找到有效的站点数据。")


482个站年度平均值的范围:
最小值: 1.6206 (站点: 08353000)
最大值: 2181.4776 (站点: 14216500)
所有站点年度平均值的均值: 348.6373
所有站点年度平均值的中位数: 318.2302

每个站的年度平均值及使用的年份:
站点: 01054200, 年度平均值: 983.40, 使用年份: [1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
站点: 01097000, 年度平均值: 634.27, 使用年份: [1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
站点: 01111500, 年度平均值: 691.86, 使用年份: [1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
站点: 01116500, 年度平均值: 632.94, 使用年份: [1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,

In [None]:
import os
import pandas as pd

# 文件夹路径
folder_path = "climate_new"  # 替换为你的文件夹路径

# 定义存储站点结果的字典
station_results = {}

# 遍历文件夹中的每个站点文件
for filename in os.listdir(folder_path):
    if filename.endswith(".csv") and filename != "static_filtered.csv":  # 确保只处理 CSV 文件并排除特定文件
        file_path = os.path.join(folder_path, filename)

        # 读取站点数据
        data = pd.read_csv(file_path)

        # 确定 'date' 列为第一列，检索 'tmmn' 和 'tmmx' 列
        data = data.rename(columns={data.columns[0]: 'date'})
        tmmn_column = 'tmmn'
        tmmx_column = 'tmmx'

        if tmmn_column in data.columns and tmmx_column in data.columns:
            # 将 'date' 转换为日期，将 'tmmn' 和 'tmmx' 转换为数值
            data['date'] = pd.to_datetime(data['date'], errors='coerce')
            data['tmmn'] = pd.to_numeric(data[tmmn_column], errors='coerce')
            data['tmmx'] = pd.to_numeric(data[tmmx_column], errors='coerce')

            # 移除无效行
            data = data.dropna(subset=['date', 'tmmn', 'tmmx'])

            # 计算每天的平均温度
            data['daily_avg_temp'] = ((data['tmmn']-273.15) + (data['tmmx']-273.15)) / 2

            # 提取年份并计算每年的平均值
            data['year'] = data['date'].dt.year
            yearly_avg_temp = data.groupby('year')['daily_avg_temp'].mean()

            if not yearly_avg_temp.empty:  # 确保有值后再计算
                # 计算所有年份的总平均值
                combined_average = yearly_avg_temp.mean()
                used_years = list(yearly_avg_temp.index)

                # 保存结果
                station_name = filename.replace(".csv", "")
                station_results[station_name] = {
                    'Combined_Average': combined_average,
                    'Used_Years': used_years
                }
        else:
            print(f"文件 {filename} 缺少 'tmmn' 或 'tmmx' 列。")

# 计算所有站点的范围和统计值
if station_results:
    combined_values = [stats['Combined_Average'] for stats in station_results.values()]

    min_combined = min(combined_values)
    max_combined = max(combined_values)
    mean_combined = sum(combined_values) / len(combined_values)
    median_combined = sorted(combined_values)[len(combined_values) // 2]

    # 将结果打印到控制台
    print(f"482个站日均温综合平均值的范围及统计值:")
    print(f"最小值: {min_combined:.2f}, 最大值: {max_combined:.2f}, 均值: {mean_combined:.2f}, 中位数: {median_combined:.2f}\n")

    # 打印每个站的日均温综合平均值及使用的年份
    print("每个站的日均温综合平均值及使用的年份:")
    for station, stats in station_results.items():
        print(f"站点: {station}, 综合平均值: {stats['Combined_Average']:.2f}, 使用年份: {stats['Used_Years']}")
else:
    print("未找到有效的站点数据。")


0        1982
1        1982
2        1982
3        1982
4        1982
         ... 
13509    2018
13510    2018
13511    2018
13512    2018
13513    2018
Name: year, Length: 13514, dtype: int64
0        1982
1        1982
2        1982
3        1982
4        1982
         ... 
13509    2018
13510    2018
13511    2018
13512    2018
13513    2018
Name: year, Length: 13514, dtype: int64
0        1982
1        1982
2        1982
3        1982
4        1982
         ... 
13509    2018
13510    2018
13511    2018
13512    2018
13513    2018
Name: year, Length: 13514, dtype: int64
0        1982
1        1982
2        1982
3        1982
4        1982
         ... 
13509    2018
13510    2018
13511    2018
13512    2018
13513    2018
Name: year, Length: 13514, dtype: int64
0        1982
1        1982
2        1982
3        1982
4        1982
         ... 
13509    2018
13510    2018
13511    2018
13512    2018
13513    2018
Name: year, Length: 13514, dtype: int64
0        1982
1        1982
2 

In [2]:
import os
import pandas as pd

# 文件夹路径
folder_path = 'climate_new'

# 获取文件夹下所有以数字开头的CSV文件
files = [f for f in os.listdir(folder_path) if f.endswith('.csv') and f[0].isdigit()]

# 初始化存储结果的DataFrame
station_non_empty_stats = pd.DataFrame()

# 遍历每个文件，计算非空值比例
for file in files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path, low_memory=False)
    
    # 确保文件中包含目标列
    columns_to_read = [col for col in data.columns if col in [
        '00010', '00095', '00300', '00400', '00405', '00600', '00605', '00618',
        '00660', '00665', '00681', '00915', '00925', '00930', '00935', '00940',
        '00945', '00955', '71846', '80154'
    ]]
    
    if columns_to_read:
        data_selected = data[columns_to_read]
        print(data_selected.shape)
        non_empty_ratios = data_selected.notna().mean() * 100
        non_empty_ratios_df = pd.DataFrame(non_empty_ratios, columns=[file]).T
        non_empty_ratios_df.index.name = 'Station'
        station_non_empty_stats = pd.concat([station_non_empty_stats, non_empty_ratios_df])

# 重置索引并保存为CSV
output_path = 'station_non_empty_ratios.csv'
station_non_empty_stats.reset_index().to_csv(output_path, index=False)

print(f'Results saved to {output_path}')


(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(13514, 20)
(135

In [5]:
import os
import pandas as pd

# 文件夹路径
folder_path = 'climate_new'

# 获取所有符合条件的 CSV 文件（以数字开头）
files = [f for f in os.listdir(folder_path) if f.endswith('.csv') and f[0].isdigit()]

# 目标变量（20个 y 变量）和 runoff 列
target_columns = [
    '00010', '00095', '00300', '00400', '00405', '00600', '00605', '00618',
    '00660', '00665', '00681', '00915', '00925', '00930', '00935', '00940',
    '00945', '00955', '71846', '80154'
]
runoff_column = 'runoff'  # 假设 runoff 列名为 'runoff'

# 结果存储（每行是一个站点，每列是 runoff 和一个 y 变量的 Pearson 相关系数）
pearson_results = pd.DataFrame(index=files, columns=target_columns)

# 记录计算失败的站点及具体的 y 变量
error_logs = []

# 遍历每个站点（CSV 文件）
for file in files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path, low_memory=False)

    # 确保 runoff 存在
    if runoff_column not in data.columns:
        error_logs.append([file, "Runoff column missing"])
        continue

    # 确保至少一个目标变量存在
    valid_columns = [col for col in target_columns if col in data.columns]
    if not valid_columns:
        error_logs.append([file, "No target variables available"])
        continue

    # 选取 runoff 和目标变量数据
    data_selected = data[[runoff_column] + valid_columns]

    # 存储当前站点的相关性结果
    correlations = {}

    # 逐个 y 变量计算 Pearson 相关系数
    for y_var in valid_columns:
        sub_data = data_selected[[runoff_column, y_var]].dropna()

        # 只有当数据点 >= 2 时，才计算 Pearson 相关系数
        if len(sub_data) >= 2:
            correlations[y_var] = sub_data.corr(method='pearson').iloc[0, 1]
        else:
            correlations[y_var] = float('nan')
            error_logs.append([file, f"Not enough valid data for {y_var} (only {len(sub_data)} points)"])

    # 存入结果表
    pearson_results.loc[file, valid_columns] = pd.Series(correlations)

# 确保所有站点都输出，即使计算失败的站点，其相关系数填 NaN
output_pearson = 'station_runoff_pearson.csv'
pearson_results.to_csv(output_pearson)

# 记录错误日志
error_df = pd.DataFrame(error_logs, columns=["Station", "Error Reason"])
output_errors = 'station_runoff_errors.csv'
error_df.to_csv(output_errors, index=False)

print(f'Pearson correlation results saved to {output_pearson}')
print(f'Error log saved to {output_errors}')


Pearson correlation results saved to station_runoff_pearson.csv
Error log saved to station_runoff_errors.csv
