In [None]:
import os
import re
import pandas as pd

# Directory that contains the raw .csv files to be processed
csv_directory = 'PI_data_16-20_csv'

# Pull every numeric token (integer or decimal) out of a file name
def extract_numbers_from_filename(filename):
    """Return the numeric tokens found in ``filename`` as a list of floats.

    Tokens are collected left to right. When the name contains no digits at
    all, a list of four ``None`` placeholders is returned instead.
    """
    matches = re.findall(r'\d+\.?\d*', filename)
    if not matches:
        return [None, None, None, None]
    return list(map(float, matches))

# Collect the .csv files; sorted so the processing order is the same on every machine
csv_files = sorted(f for f in os.listdir(csv_directory) if f.endswith('.csv'))

# Make sure the output folder exists before anything is written into it
os.makedirs('modified', exist_ok=True)

# Process the .csv files one by one
for csv_file in csv_files:
    # The numbers in the file name are taken, in order, as Voltage, multiple, k, rpm
    numbers = extract_numbers_from_filename(csv_file)
    # A name without digits yields None placeholders rather than a short list,
    # so both cases have to be rejected here.
    if len(numbers) < 4 or any(num is None for num in numbers[:4]):
        print(f"File name {csv_file} does not contain enough numbers.")
        continue
    Voltage, multiple, k, rpm = numbers[:4]

    # Read the file, using its second line as the header row
    file_path = os.path.join(csv_directory, csv_file)
    df = pd.read_csv(file_path, header = 1)

    # Drop the first data row and keep only the first three columns
    df = df.drop([0])
    df = df.iloc[:, 0:3]

    # Put the file-name metadata in front of the measured columns
    df.insert(loc=0,column='Voltage',value=Voltage)
    df.insert(loc=1,column='multiple',value=multiple)
    df.insert(loc=2,column='k',value=k)
    df.insert(loc=3,column='rpm',value=rpm)

    # Save as modified/<original name>_modified.csv
    csv_modified_file = os.path.join('modified', os.path.splitext(csv_file)[0] + '_modified.csv')
    df.to_csv(csv_modified_file, index=False)

    print(f"Processed {csv_file} -> {csv_modified_file}")

In [None]:
import pandas as pd
import os

# Directory that holds the per-file CSVs to merge
directory = 'modified'  # replace with the folder that stores the CSV files

# Read every CSV first and concatenate once. Concatenating inside the loop
# copies the accumulated frame on every iteration. The listing is sorted so
# the merged row order does not depend on the order os.listdir happens to use.
frames = [
    pd.read_csv(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.endswith('.csv')
]

# pd.concat rejects an empty list, so fall back to an empty frame when no CSV was found
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the merged data to a new CSV file
all_data.to_csv('all_data.csv', index=False)

# Completion message
print("所有CSV文件数据汇总完成并保存为'all_data.csv'")

In [None]:
import pandas as pd

# Load the merged data set
df = pd.read_csv('all_data.csv')

# Keep only the rows whose last column is different from 0
nonzero_last_column = df.iloc[:, -1].ne(0)
df_cleaned = df[nonzero_last_column]

# Write the remaining rows to a separate CSV file
df_cleaned.to_csv('cleaned_file.csv', index=False)

In [None]:
## 决策树DecisionTreeRegression(参数优化后）
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
# 1. Load the two CSV data sets and stack them row-wise with a fresh 0..n-1 index
data1 = pd.read_csv('cleaned_file.csv')
data2 = pd.read_csv('PI_data_6-15.csv')
total_data = pd.concat([data1, data2], ignore_index=True)

# 2. Every column except the last is a feature; the last column is the label
X = total_data.iloc[:, :-1].values  # input features
y = total_data.iloc[:, -1].values   # output label

# 3. 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 4. Build the DecisionTreeRegressor.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be
# resolved differently from one run to the next.
regressor = DecisionTreeRegressor(max_depth = 17, min_samples_leaf = 2, min_samples_split = 19, random_state = 0)

# 5. Fit on the training split
regressor.fit(X_train, y_train)

# 6. Predict the test split
y_pred = regressor.predict(X_test)

# 7. Evaluate with the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差: {mse}")


In [None]:
# Re-create the split with the same seed that was used before fitting, so that
# X_test still lines up row for row with the y_pred computed from it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Display the feature rows of the test split
X_test

In [None]:
# Display the predictions currently stored in y_pred
y_pred

In [None]:
# 生成一个其他参数不变，PI值变化的停机时间预测图
import numpy as np

# Build a random feature matrix
def generate_array(n):
    """Return an array with ``n`` rows and six columns.

    Columns, in order:
      0 - uniform draw between 710 and 780
      1 - constant 20
      2 - constant 5
      3 - uniform draw between 60 and 70
      4 - uniform draw between 5 and 50
      5 - uniform draw between 0 and 1
    """
    # The list is evaluated top to bottom, so the random draws happen in column order.
    columns = [
        np.random.uniform(710, 780, n),
        np.full(n, 20),
        np.full(n, 5),
        np.random.uniform(60, 70, n),
        np.random.uniform(5, 50, n),
        np.random.uniform(0, 1, n),
    ]
    return np.column_stack(columns)

# Number of rows to generate (example size)
n = 10_000_000
result = generate_array(n)

# Show the generated matrix
print(result)

In [None]:
# Pick columns of the generated matrix by position: 0 -> V, 3 -> rpm, 4 -> P, 5 -> I
V, rpm, P, I = (result[:, col] for col in (0, 3, 4, 5))

In [None]:
# Alias the P and I columns.
# NOTE(review): `y` is also the name used for the label array in the training
# cells; this assignment rebinds it - confirm that is intended.
x, y = P, I

# Predict for every generated row, then show the result's type and values
z = regressor.predict(result)
t = type(z)
print(t)
print(z)

In [None]:
# Use the array's own reduction, computed once. The builtin min() walks the
# ndarray element by element in Python, and the original called it twice.
z_min = z.min()
a = np.where(z == z_min)  # tuple of index arrays
print(a, z_min)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import graphviz
import os

# 1. Load the two CSV data sets and stack them row-wise
data1 = pd.read_csv('cleaned_file.csv')
data2 = pd.read_csv('PI_data_6-15.csv')
total_data = pd.concat([data1, data2], ignore_index=True)

# 2. Every column except the last is a feature; the last column is the label
X = total_data.iloc[:, :-1].values  # input features
y = total_data.iloc[:, -1].values   # output label

# Column names of the features and of the label (the label name is the y-axis title below)
feature_names = total_data.columns[:-1].tolist()
target_name = total_data.columns[-1]

# 3. 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 4. Build the DecisionTreeRegressor.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be
# resolved differently from one run to the next.
regressor = DecisionTreeRegressor(max_depth=17, min_samples_leaf=2, min_samples_split=19, random_state=0)

# 5. Fit on the training split
regressor.fit(X_train, y_train)

# 6. Predict the test split
y_pred = regressor.predict(X_test)

# 7. Order both series by the actual value so the plot rises monotonically
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_y_pred = y_pred[sorted_indices]

# 8. Plot actual against predicted values, using the rank as the x coordinate
plt.figure(figsize=(10, 6))

# Position within the sorted order
indices = range(len(sorted_y_test))

plt.scatter(indices, sorted_y_test, color="blue", label="Actual Stop Duration")
plt.scatter(indices, sorted_y_pred, color="red", label="Predicted Stop Duration", alpha=0.2)
plt.xlabel("Sample Index (Sorted by Actual Stop Duration)")
plt.ylabel(target_name)
plt.title("Actual vs Predicted Stop Duration (Sorted)")
plt.legend()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import shap

# Load the two CSV data sets and stack them row-wise
data1 = pd.read_csv('cleaned_file.csv')
data2 = pd.read_csv('PI_data_6-15.csv')
total_data = pd.concat([data1, data2], ignore_index=True)

# Features stay a DataFrame so their column names are available further down
X = total_data.iloc[:, :-1]  # input features
y = total_data.iloc[:, -1].values  # output label

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build and fit the model.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be
# resolved differently from one run to the next.
regressor = DecisionTreeRegressor(max_depth=17, min_samples_leaf=2, min_samples_split=19, random_state=0)
regressor.fit(X_train, y_train)

# Load the workbooks Multiple16.xlsx ... Multiple20.xlsx and stack them
random_frames = [pd.read_excel(f'Multiple{m}.xlsx') for m in range(16, 21)]
Data1, Data2, Data3, Data4, Data5 = random_frames
Total_Data_Random = pd.concat(random_frames, ignore_index=True)

# Select the training feature columns, in training order
random_data = Total_Data_Random[X.columns]

# SHAP values of the fitted tree for the loaded rows
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(random_data)

# Create the figure and axes the plot is drawn on
fig, ax = plt.subplots(figsize=(12, 8))

# Violin-style summary plot. plot_size=None leaves the figure size alone;
# the default "auto" would resize the figure and discard the figsize above.
shap.summary_plot(shap_values, random_data, feature_names=random_data.columns.tolist(), plot_type="violin", plot_size=None, show=False)

# Set the chart title
ax.set_title('SHAP Value Summary Plot for All Features Using Random Data')

# Show the chart
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import shutil

# Load the two CSV data sets and stack them row-wise
data1 = pd.read_csv('cleaned_file.csv')
data2 = pd.read_csv('PI_data_6-15.csv')
total_data = pd.concat([data1, data2], ignore_index=True)

# Features stay a DataFrame so their column names are available further down
X = total_data.iloc[:, :-1]  # input features
y = total_data.iloc[:, -1].values  # output label

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build and fit the model.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be
# resolved differently from one run to the next.
regressor = DecisionTreeRegressor(max_depth=17, min_samples_leaf=2, min_samples_split=19, random_state=0)
regressor.fit(X_train, y_train)

# Load the candidate rows from the Excel workbook
Data1 = pd.read_excel('Multiple20.xlsx')

# Select the training feature columns, in training order
features = Data1[X.columns]

# Predict for every candidate row
predictions = regressor.predict(features)

# Smallest prediction and every position where it occurs.
# The ndarray reduction is used instead of the builtin min(), which would
# iterate over the array element by element in Python.
min_pred_value = predictions.min()
min_pred_indices = np.where(predictions == min_pred_value)[0]

# Report the minimum and where it was found
print("Indices of minimum prediction:", min_pred_indices)
print("Minimum prediction value:", min_pred_value)

In [None]:
import os

if len(min_pred_indices) > 0:
    # min_pred_indices are positions in `predictions`, so rows are taken by
    # position (iloc) rather than by index label.
    selected_rows = Data1.iloc[min_pred_indices]

    # One output row per minimum. 'Stop Duration' is the predicted value
    # itself, not a number copied from an earlier run.
    df = pd.DataFrame({
        'Voltage': selected_rows['Voltage'].to_numpy(),
        'multiple': 20,
        'k': 5,
        'rpm': selected_rows['rpm'].to_numpy(),
        'P': selected_rows['P'].to_numpy(),
        'I': selected_rows['I'].to_numpy(),
        'Stop Duration': predictions[min_pred_indices],
    })

    # Name of the Excel workbook
    filename = 'Multiple20预测.xlsx'

    # Write the rows to the workbook
    df.to_excel(filename, index=False)

    # Destination folder
    # NOTE(review): machine-specific absolute path - consider making it configurable.
    target_folder = '/Users/liulinxuan/Desktop'

    # Move the workbook into the destination folder. A copy left there by an
    # earlier run is removed first, because shutil.move refuses to move a
    # file into a directory that already contains one with the same name.
    destination = os.path.join(target_folder, filename)
    if os.path.exists(destination):
        os.remove(destination)
    shutil.move(filename, destination)

    print(f"All rows with minimum prediction value saved and moved to {target_folder}")
else:
    print("No predictions match the minimum value.")

In [None]:
# Read the saved workbook back and show it
import os
import pandas as pd

# The preceding cell moves the workbook to the Desktop folder after writing
# it, so look there first and fall back to the working directory.
prediction_file = 'Multiple20预测.xlsx'
moved_copy = os.path.join('/Users/liulinxuan/Desktop', prediction_file)
df = pd.read_excel(moved_copy if os.path.exists(moved_copy) else prediction_file)

# Show the data
print(df)

In [None]:
## 决策树DecisionTreeRegression （用for套for找min）
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, max_error, mean_absolute_error, r2_score
# 5. Build the DecisionTreeRegressor.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be
# resolved differently from one run to the next.
regressor = DecisionTreeRegressor(max_depth = 17, min_samples_leaf = 1, min_samples_split = 16, random_state = 0)

# 6. Fit on the training split
regressor.fit(X_train, y_train)

# 7. Predict the test split
y_pred = regressor.predict(X_test)

# 8. Evaluate the model
# Mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差: {mse}")
# Largest relative error, computed on whole arrays instead of an element-wise
# Python loop; 1e-6 keeps the denominator away from zero.
max_rel_error = np.max(np.abs((y_test - y_pred) / (y_test + 1e-6)))
print(f"最大相对误差:{max_rel_error}")
# Mean relative error
def mean_relative_error(y_true, y_pre, epsilon=1e-6):
    """Return the mean of ``|(y_true - y_pre) / (y_true + epsilon)|``.

    ``epsilon`` is added to the denominator before dividing.
    """
    residual = y_true - y_pre
    denominator = y_true + epsilon
    return np.mean(np.abs(residual / denominator))
# Mean relative error on the test split
mean_rel_error = mean_relative_error(y_true=y_test, y_pre=y_pred)
print(f"平均相对误差:{mean_rel_error}")
# R² score on the test split
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R²得分:{r2}")