In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# 读取数据
file_path = '/home/aistudio/clusterresult9.csv'  # 修改为你的文件路径
data = pd.read_csv(file_path)

# 将时间戳列转换为日期时间格式
data['TIMESTAMP'] = pd.to_datetime(data['TIMESTAMP'])

# 将数据按照时间顺序排序
data.sort_values('TIMESTAMP', inplace=True)
data_2012 = data[(data['TIMESTAMP'].dt.year == 2012)]

# 准备数据
X = data_2012[['Wind Speed (m/s)100', 'Wind Direction (°)100']]
y = data_2012['cluster']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 初始化并训练 XGBoost 分类器
clf = XGBClassifier(random_state=42)
clf.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# 定义一个函数用于预测新的风速和风向数据的聚类结果
def predict_cluster(windspeed, winddirection):
    # 将新的风速和风向数据组合成DataFrame，确保它是二维格式
    new_data = pd.DataFrame({
        'Wind Speed (m/s)100': windspeed,
        'Wind Direction (°)100': winddirection
    })
    
    # 使用训练好的模型进行预测
    prediction = clf.predict(new_data)
    return prediction

#------场景预测----
# 加载CSV文件
data1 = pd.read_csv('/home/aistudio/wind9_data.csv')

# 查看数据的前几行，检查列名和数据类型
print(data1.head())

# 将时间戳列转换为日期时间格式
data1['TIMESTAMP'] = pd.to_datetime(data1['TIMESTAMP'])

# 将数据按照时间顺序排序
data1.sort_values('TIMESTAMP', inplace=True)

# 获取2013年2月的数据
data_201302 = data1[(data1['TIMESTAMP'].dt.year == 2013) & (data1['TIMESTAMP'].dt.month == 2)]

# 示例：预测新的风速和风向数据
new_windspeed = data_201302['Wind Speed (m/s)100']
new_winddirection = data_201302['Wind Direction (°)100']

# 调用预测函数
predicted_cluster = predict_cluster(new_windspeed, new_winddirection)

# 输出预测结果
print(f"Predicted Cluster: {predicted_cluster}")



Accuracy: 0.9791271347248577
Classification Report:
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95       577
           0       0.98      0.99      0.99      1345
           1       0.99      0.99      0.99       183
           2       1.00      0.98      0.99       180
           3       0.99      0.97      0.98       252
           4       1.00      1.00      1.00        19
           5       0.98      1.00      0.99        63
           6       0.83      0.94      0.88        16

    accuracy                           0.98      2635
   macro avg       0.97      0.98      0.97      2635
weighted avg       0.98      0.98      0.98      2635

   ZONEID      TIMESTAMP  TARGETVAR       U10       V10      U100      V100  \
0       9  20120101 1:00        0.0  1.903254 -1.495002  2.516723 -2.078063   
1       9  20120101 2:00        0.0  2.173296 -0.643641  2.837109 -1.006914   
2       9  20120101 3:00        0.0  2.3

In [None]:
# 将预测结果添加到原始数据
data_201302['Predicted Cluster'] = predicted_cluster

# 保存到新的CSV文件
output_file = '/home/aistudio/WF9_cluster_201302.csv'
data_201302.to_csv(output_file, index=False)

print(f"Predicted clusters saved to: {output_file}")

Predicted clusters saved to: /home/aistudio/WF9_cluster_201302.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
import pandas as pd
#加载数据
data = pd.read_csv('/home/aistudio/WF9_cluster_201302.csv')

cluster1302 = data['Predicted Cluster']


# 打印前五行数据
print(cluster1302.head())

# 使用 .describe() 方法来查看每列的描述性统计信息
print(cluster1302.describe())

# === 获取簇的数量 ===

# 统计每个簇的数量
cluster_counts = cluster1302.value_counts()

# 输出每个簇的数量，包括噪声点（标签为-1）
for cluster_label, count in cluster_counts.items():
    print(f"簇 {cluster_label} 包含 {count} 个数据点")

# 计算总簇的数量（包括噪声点）
num_clusters = len(cluster_counts)

print(f"\n数据被分为 {num_clusters} 个簇（包括噪声点作为一个簇）")

0    0
1    0
2    0
3    0
4    0
Name: Predicted Cluster, dtype: int64
count    672.000000
mean       0.059524
std        1.387414
min       -1.000000
25%       -1.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Predicted Cluster, dtype: float64
簇 0 包含 364 个数据点
簇 -1 包含 224 个数据点
簇 5 包含 28 个数据点
簇 2 包含 21 个数据点
簇 1 包含 18 个数据点
簇 3 包含 12 个数据点
簇 6 包含 4 个数据点
簇 4 包含 1 个数据点

数据被分为 8 个簇（包括噪声点作为一个簇）
