# 导入部分

In [1]:
import pandas as pd
import numpy as np

import os
import joblib

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report, roc_curve, auc

# 导入模型训练和评估工具
from sklearn.model_selection import train_test_split

# 导入可视化库
import matplotlib.pyplot as plt
import seaborn as sns
# 设置警告过滤器以忽略不必要的警告
import warnings
warnings.filterwarnings('ignore')

# 配置matplotlib以支持中文字符，并进行美观调整
plt.rcParams['font.sans-serif'] = ['simhei']  # 设置sans-serif字体为simhei
plt.rcParams['font.serif'] = ['simhei']       # 设置serif字体为simhei
plt.rcParams['axes.unicode_minus'] = False    # 确保负号显示正常

# 配置seaborn以保持一致的美学风格和字体支持
sns.set_style("darkgrid", {"font.sans-serif": ['simhei', 'Droid Sans Fallback']})

## 数据读取

In [2]:
data = pd.read_csv('./Datasets/附件1/M101.csv')

In [3]:
data.head(10)

Unnamed: 0,日期,时间,生产线编号,物料推送气缸推送状态,物料推送气缸收回状态,物料推送数,物料待抓取数,放置容器数,容器上传检测数,填装检测数,...,不合格数,物料推送装置故障1001,物料检测装置故障2001,填装装置检测故障4001,填装装置定位故障4002,填装装置填装故障4003,加盖装置定位故障5001,加盖装置加盖故障5002,拧盖装置定位故障6001,拧盖装置拧盖故障6002
0,1,0,M101,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,M101,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2,M101,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3,M101,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,4,M101,0,1,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,5,M101,1,0,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,6,M101,0,1,3,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,7,M101,0,1,3,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,8,M101,0,1,3,3,2,1,0,...,0,0,0,0,0,0,0,0,0,0
9,1,9,M101,0,1,3,3,2,1,0,...,0,0,0,0,0,0,0,0,0,0


## 处理缺失值

In [4]:
data.fillna(0, inplace=True)

# 探索性数据分析（EDA）

In [5]:
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

summary_df = pd.DataFrame(columns=['故障类型', '正常运行', '发生故障'])


for column in data.columns:
    zero_count = data[column].value_counts().get(0, 0)  # 计算值为0的个数，正确使用.get()
    
    try:
        fault_code = int(column[-4:])  # 尝试将列名最后4个字符转换为整数
        fault_count = data[column].value_counts().get(fault_code, 0)  # 计算故障码对应的个数
    except ValueError:
        continue  # 跳过当前列的处理

    # 将当前故障的统计信息作为新行添加到summary_df中
    new_row = pd.DataFrame([[column, zero_count, fault_count]], columns=['故障类型', '正常运行', '发生故障'])
    summary_df = pd.concat([summary_df, new_row], ignore_index=True)

summary_df

Unnamed: 0,故障类型,正常运行,发生故障
0,物料推送装置故障1001,632235,4274
1,物料检测装置故障2001,636005,504
2,填装装置检测故障4001,634998,1511
3,填装装置定位故障4002,633576,2933
4,填装装置填装故障4003,633797,2712
5,加盖装置定位故障5001,633964,2545
6,加盖装置加盖故障5002,634300,2209
7,拧盖装置定位故障6001,634480,2029
8,拧盖装置拧盖故障6002,633848,2661


# 特征工程

## 计算故障总数及故障率

In [6]:
# 计算总条目数
total_entries = len(data)
# 计算故障率
fault_columns = ['物料推送装置故障1001', '物料检测装置故障2001', '填装装置检测故障4001', 
                 '填装装置定位故障4002', '填装装置填装故障4003', '加盖装置定位故障5001', 
                 '加盖装置加盖故障5002', '拧盖装置定位故障6001', '拧盖装置拧盖故障6002']

data['故障总数'] = data[fault_columns].sum(axis=1)
data['故障率'] = data['故障总数'] / total_entries*100

In [7]:
data['故障总数'].value_counts()

故障总数
0       615131
1001      4274
4002      2933
4003      2712
6002      2661
5001      2545
5002      2209
6001      2029
4001      1511
2001       504
Name: count, dtype: int64

In [8]:
data['故障率'].value_counts()

故障率
0.000000    615131
0.157264      4274
0.628742      2933
0.628899      2712
0.942956      2661
0.785692      2545
0.785849      2209
0.942799      2029
0.628585      1511
0.314371       504
Name: count, dtype: int64

## 封装构建ARIMA模型

In [9]:
def train_arima_model(train_data, test_data, order=(5, 1, 0), save_path=None, model_name="ARIMA_Model"):
    # 创建并训练 ARIMA 模型
    model = ARIMA(train_data, order=order)
    model_fit = model.fit()
    
    # 进行预测
    predictions = model_fit.forecast(steps=len(test_data))[0]
    print("预测值:", predictions)  # 添加对预测值的打印
    
    # 计算评估指标
    mse = mean_squared_error(test_data, predictions)
    rmse = np.sqrt(mse)
    print("均方误差（MSE）:", mse)
    print("均方根误差（RMSE）:", rmse)
    
    # 可视化实际值与预测值
    plt.plot(test_data, label='实际值')
    plt.plot(predictions, color='red', label='预测值')
    plt.legend()
    plt.show()
    
    # 保存模型
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        model_file_path = os.path.join(save_path, f"{model_name}.joblib")
        joblib.dump(model_fit, model_file_path)
    
    return model_fit

# 模型汇总

## 原数据

In [10]:
train_data = data['物料推送装置故障1001'].values
#75% 作为训练集，25% 作为测试集
train_ratio = 0.75
split_index = int(len(train_data) * train_ratio)
train_set = train_data[:split_index]
test_set = train_data[split_index:]

In [11]:
train_data

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
save_path = "./Models/1001/"
train_arima_model(train_set,test_set,order=(5, 1, 0), save_path=save_path)

预测值: 0.0


InvalidParameterError: The 'y_pred' parameter of mean_squared_error must be an array-like. Got 0.0 instead.