### 角度转换

In [1]:
import pandas as pd
import numpy as np
import math


# 更新acc_x,acc_y,acc_z

def rad(degree: float):
    return degree / 180 * math.pi

def angle_changing(acc: tuple, data: list) -> tuple:
    """
    AHRS, modify acc value to related angles
    alpha, beta, theta in degree
    :param acc: (x, y, z)
    :param alpha:
    :param beta:
    :param theta:
    :return:
    """
    # Rotation list: theta -> beta -> alpha
    alpha=data[0]
    beta=data[1] 
    theta=data[2]
    
    alpha = rad(alpha)
    beta = rad(beta)
    theta = rad(theta)

    x, y, z = acc

    z1 = z
    x1 = x * math.cos(theta) - y * math.sin(theta)
    y1 = x * math.sin(theta) - y * math.cos(theta)

    z2 = z1 * math.cos(beta) - x1 * math.sin(beta)
    x2 = x1 * math.cos(beta) + z1 * math.sin(beta)
    y2 = y1

    z3 = z2 * math.cos(alpha) + y2 * math.sin(alpha)
    y3 = y2 * math.cos(alpha) - z2 * math.sin(alpha)
    x3 = x2

    y3 = -y3

    return x3, y3, z3



if __name__ == '__main__':
    # Acc readings: acc_x, acc_y, acc_z
    acc = (0.5, 0.5, 1.0)

    # Angle in degrees: alpha, beta, gama(theta)
    res = angle_changing(acc, [-1.1, 1.5, -40])

    # Corrected Acc: acc_x, acc_y, acc_z
    print(res)


(0.7303515889315664, 0.6854493376600327, 0.994560022599271)


### 导入角度转换和数据

In [14]:
import os 

def get_angle():
    filepath=os.getcwd()+"//Original Data//Original Data//angle_change.txt"
    angle={}
    with open(filepath,'r') as f:
        lines=f.readlines()
    
    for l in lines: 
        line=l.split(":")
        #print(line[0],line[1][1:-2])
        a=[]
        for i in line[1][1:-2].split(','):
            a.append(float(i))
        angle[line[0]]=a
    return angle
    
get_angle()

{'861193047289427': [-1.16, -0.67, -77.69],
 '861193041076366': [-11.16, 0.74, -84.47],
 '861193041076531': [-9.65, 3.51, -79.25],
 '861193041083446': [9.04, 1.91, -76.6],
 '861193041076630': [-4.83, -0.28, 86.75],
 '861193041083735': [-5.04, -9.79, -88.72],
 '861193041084543': [-6.07, -1.1, -90.55],
 '863293055538448': [-0.18, -0.29, -90.0],
 '861193041076440': [-3.95, -2.76, -79.2],
 '861193041076424': [8.45, 1.67, -74.51],
 '861193041076515': [-0.61, -1.12, -9.0]}

### 保存转换角度后的数据

In [71]:
# 需要进行转换的角度
angle=get_angle()


# 获取所有文件名
def get_filenames(p=None):
    import os
    if p==None:
        path=os.getcwd()
        filenames=os.listdir(path)
    else:
        if '.' in p:
            return [p]
        filenames=os.listdir(p) 
    return filenames
 


# 输入参数为文件下的所有文件名、需要处理角度的数据，保存的路径
def change_angle(angle,read_path,save_path):
    filename=get_filenames(read_path)
    evento=[]
    for f in filename:
        if f[:15] in angle.keys():
                data=pd.read_csv(read_path+f) #读取文件
                
                if data.shape[0]<20: # 过滤掉数据小于一个窗口的数据
                    continue
                    
                newfile=save_path+f #存放文件
                # 去掉_unknown
                if '_unknown' in newfile:
                    newfile=newfile[:-12]+newfile[-4:]
                   
                
                
                feature='acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,timestamp,evento'
                with open(newfile,'w+') as w:
                    w.write(feature+"\n")
                for i in range(data.shape[0]):
                    acc=(data.loc[i,'acc_x'],data.loc[i,'acc_y'],data.loc[i,'acc_z']) # 以元组形式读入数据
                    gyro=(data.loc[i,'gyro_x'],data.loc[i,'gyro_y'],data.loc[i,'gyro_z'])
                    
                    temp_a_x,temp_a_y,temp_a_z=angle_changing(acc,angle[f[:15]])
                    temp_g_x,temp_g_y,temp_g_z=angle_changing(gyro,angle[f[:15]])
                    
                    # 加入正常事件，对于碰撞和其他文件
                    if f[29:-4]=='COLLISION':
                        if i>=179 and i<=239: 
                            evento=f[29:-4]
                        else:
                            evento='Normal'
                    else:
                        if i>=59 and i<=79:
                            evento=f[29:-4]
                        else:
                            evento='Normal'
                            
                    # 把数据少于3秒的直接判定为事件
                    if data.shape[0]<59:
                        evento=f[29:-4]
                        
                    if '_unknown' in evento: 
                        evento=evento[:-8]
                            
                    timestamp=f[16:28]
                    device_name=f[:15]
                    
                    tempstr=','.join(list(map(str,[temp_a_x,temp_a_y,temp_a_z,temp_g_x,temp_g_y,temp_g_z])))
                    others=','.join([timestamp,evento])
                    tempstr=tempstr+','+others
                    with open(newfile,'a+') as wf:
                        wf.write(tempstr+"\n")
                        
        else:
            print("This file:%s do not need to read!"%f) #表示该文件不需要读取
                

                
import os
read_path=os.getcwd()+"//Original Data//Original Data//"
save_path=os.getcwd()+"//Processed Data//processed_angle_data//"
change_angle(angle,read_path,save_path)

This file:.ipynb_checkpoints do not need to read!
This file:angle_change.txt do not need to read!
This file:README.txt do not need to read!


### 进行滑动窗口的求出均值、中位数、方差以及Tendency
### 需要计算的值有四个（均值、中值、方差、tendency）
### 需要计算的列有acc三列、gyro三列
### 总共计算4x3x3也就是36个值


In [72]:
import numpy as np
### 滑动窗口大小(w,每次计算就增加一个窗口)、步长(h)、读取文件位置、保存文件位置

def caculate_all_value(w,h,read_path,save_path):
    filename=get_filenames(read_path)
    for f in filename:
        if f[:15] in angle.keys():
            data=pd.read_csv(read_path+f)
            if data.shape[0]<20: 
                continue

            newfile=save_path+f
            
            # 写入特征名字
            origin_name=['mean','median','std','tendency']
            acc_x=[]
            acc_y=[]
            acc_z=[]
            gyro_x=[]
            gyro_y=[]
            gyro_z=[]
            for i in origin_name:
                for j in range(1,5):
                    if i=='tendency' and j==4:
                        break
                    acc_x.append('acc_x_%s_%d'%(i,j))
                    acc_y.append('acc_y_%s_%d'%(i,j))
                    acc_z.append('acc_z_%s_%d'%(i,j)) 
                    gyro_x.append('gyro_x_%s_%d'%(i,j)) 
                    gyro_y.append('gyro_y_%s_%d'%(i,j)) 
                    gyro_z.append('gyro_z_%s_%d'%(i,j)) 
                    
            # 其他，装置名称、时间戳以及时间命名
            others=['timeStamp','evento']
            feature=[]
            for i in [acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,others]:
                feature.extend(i)
            
            feature=','.join(feature)
            with open(newfile,'w+') as wr:
                wr.write(feature+"\n")
           
        
            z=0 #观察记录个数
            for i in range(0,data.shape[0]//h-3):
                acc_x,acc_y,acc_z=data.loc[h*i:w+i*h-1,'acc_x'].tolist(),data.loc[h*i:w+i*h-1,'acc_y'].tolist(),data.loc[h*i:w+i*h-1,'acc_z'].tolist()
                gyro_x,gyro_y,gyro_z=data.loc[h*i:w+i*h-1,'gyro_x'].tolist(),data.loc[h*i:w+i*h-1,'gyro_y'].tolist(),data.loc[h*i:w+i*h-1,'gyro_z'].tolist()
                # 计算acc的三坐标轴的四值
                res=[]
                   
                acc_x_means=[]
                acc_x_medians=[]
                acc_x_stds=[]
                acc_x_ts=[]
                
                acc_y_means=[]
                acc_y_medians=[]
                acc_y_stds=[]
                acc_y_ts=[]
               
                acc_z_means=[]
                acc_z_medians=[]
                acc_z_stds=[]
                acc_z_ts=[]

                gyro_x_means=[]
                gyro_x_medians=[]
                gyro_x_stds=[]
                gyro_x_ts=[]

                gyro_y_means=[]
                gyro_y_medians=[]
                gyro_y_stds=[]
                gyro_y_ts=[]
                
                gyro_z_means=[]
                gyro_z_medians=[]
                gyro_z_stds=[]
                gyro_z_ts=[]
                
                res_acc=[]
                target=1 #用于判断是否有数值的分母为0
                
                # 逆序求解
                for i in range(w//h-1,-1,-1):
                    # acc x轴
                    acc_x_mean=np.mean(acc_x[i*h:]) #m(i-3)
                    acc_x_means.append(acc_x_mean)                
                    acc_x_median=np.median(acc_x[i*h:])
                    acc_x_medians.append(acc_x_median)
                    acc_x_std=np.std(acc_x[i*h:])
                    acc_x_stds.append(acc_x_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        acc_x_t=acc_x_mean/acc_x_means[0]
                        acc_x_ts.append(acc_x_t)
                    
                    
                    
                    # acc y轴
                    acc_y_mean=np.mean(acc_y[i*h:]) #m(i-3)
                    acc_y_means.append(acc_y_mean)                
                    acc_y_median=np.median(acc_y[i*h:])
                    acc_y_medians.append(acc_y_median)
                    acc_y_std=np.std(acc_y[i*h:])
                    acc_y_stds.append(acc_y_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        acc_y_t=acc_y_mean/acc_y_means[0]
                        acc_y_ts.append(acc_y_t)
                                   
                    
                    # acc z轴
                    acc_z_mean=np.mean(acc_z[i*h:]) #m(i-3)
                    acc_z_means.append(acc_z_mean)                
                    acc_z_median=np.median(acc_z[i*h:])
                    acc_z_medians.append(acc_z_median)
                    acc_z_std=np.std(acc_z[i*h:])
                    acc_z_stds.append(acc_z_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        acc_z_t=acc_z_mean/acc_z_means[0]
                        acc_z_ts.append(acc_z_t)
                
                if target==0: #如果有某一行后五个数均值为0，退出循环
                    break
                
                
                res.extend(acc_x_means)
                res.extend(acc_x_medians)
                res.extend(acc_x_stds)
                res.extend(acc_x_ts)
    
                res.extend(acc_y_means)
                res.extend(acc_y_medians)
                res.extend(acc_y_stds)
                res.extend(acc_y_ts)
                
                res.extend(acc_z_means)
                res.extend(acc_z_medians)
                res.extend(acc_z_stds)
                res.extend(acc_z_ts)
                
        
                # 计算gyro三坐标轴的四值
                for i in range(w//h-1,-1,-1):
                    # gyro x轴
                    gyro_x_mean=np.mean(gyro_x[i*h:]) #m(i-3)
                    gyro_x_means.append(gyro_x_mean)                
                    gyro_x_median=np.median(gyro_x[i*h:])
                    gyro_x_medians.append(gyro_x_median)
                    gyro_x_std=np.std(gyro_x[i*h:])
                    gyro_x_stds.append(gyro_x_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        gyro_x_t=gyro_x_mean/gyro_x_means[0]
                        gyro_x_ts.append(gyro_x_t)
                        
                    # y轴
                    gyro_y_mean=np.mean(gyro_y[i*h:]) #m(i-3)
                    gyro_y_means.append(gyro_y_mean)                
                    gyro_y_median=np.median(gyro_y[i*h:])
                    gyro_y_medians.append(gyro_y_median)
                    gyro_y_std=np.std(gyro_y[i*h:])
                    gyro_y_stds.append(gyro_y_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        gyro_y_t=gyro_y_mean/gyro_y_means[0]
                        gyro_y_ts.append(gyro_y_t)
                    
                    
                    # z轴
                    gyro_z_mean=np.mean(gyro_z[i*h:]) #m(i-3)
                    gyro_z_means.append(gyro_z_mean)                
                    gyro_z_median=np.median(gyro_z[i*h:])
                    gyro_z_medians.append(gyro_z_median)
                    gyro_z_std=np.std(gyro_z[i*h:])
                    gyro_z_stds.append(gyro_z_std)
                    if i!=w//h-1:
                        if acc_x_means[0]==0:
                            target=0
                            break
                        gyro_z_t=gyro_z_mean/gyro_z_means[0]
                        gyro_z_ts.append(gyro_z_t)
                
                if target==0:
                    break
      
                res.extend(gyro_x_means)
                res.extend(gyro_x_medians)
                res.extend(gyro_x_stds)
                res.extend(gyro_x_ts)
                
                
                res.extend(gyro_y_means)
                res.extend(gyro_y_medians)
                res.extend(gyro_y_stds)
                res.extend(gyro_y_ts)   
                
                res.extend(gyro_z_means)
                res.extend(gyro_z_medians)
                res.extend(gyro_z_stds)
                res.extend(gyro_z_ts)


                # 以每一个窗口的最中间的帧数节点的事件类型作为整个窗口的事件类型
                
                # 偏向左边的第10个帧的事件
                eventol=data.loc[(z+1)*h+h-1,'evento']
                # 偏向右边的第11个帧的事件
                eventor=data.loc[(z+1)*h+h,'evento']
                # 判断两个事件是否相同，相同就对事件赋值，不同则说明该节点位于两个事件的交界处
                if eventol==eventor:
                    evento=eventol
                else:
                    evento= eventol
                
                z+=1
                
                timestamp=f[16:28]
                

                tempstr=','.join(list(map(str,res)))
                others=','.join([timestamp,evento])
                tempstr=tempstr+','+others
                with open(newfile,'a+') as wf:
                    wf.write(tempstr+"\n")
        else:
            print("This file:%s do not need to read!"%f) #表示该文件不需要读取

# 滑动窗口大小为20，步长为5 
read_path=os.getcwd()+"//Processed Data//processed_angle_data//"
save_path=os.getcwd()+"//Processed Data//processed_window_data//"
caculate_all_value(20,5,read_path,save_path)          

### 对于手动划分的数据集用同样处理方式进行处理

In [73]:
import numpy as np


# 参数：读取文件路径，保存路径、窗口大小、步长
def process_window(read_path,save_path,w,k):
    filenames=get_filenames(read_path)
    res=0
    for f in filenames:
        if f[-4:]!='.csv':
            continue
        data=pd.read_csv(read_path+f)
        
        if data.shape[0]<20: #将样本数少于20的文件直接忽略掉
            continue
            
        new_file=save_path+f[:-4]+"-BUMP"+f[-4:]
        
        # 特征名字
        origin_name=['mean','median','std','tendency']
        acc_x=[]
        acc_y=[]
        acc_z=[]
        gyro_x=[]
        gyro_y=[]
        gyro_z=[]
        for i in origin_name:
            for j in range(1,5):
                if i=='tendency' and j==4:
                    break
                acc_x.append('acc_x_%s_%d'%(i,j))
                acc_y.append('acc_y_%s_%d'%(i,j))
                acc_z.append('acc_z_%s_%d'%(i,j)) 
                gyro_x.append('gyro_x_%s_%d'%(i,j)) 
                gyro_y.append('gyro_y_%s_%d'%(i,j)) 
                gyro_z.append('gyro_z_%s_%d'%(i,j)) 
        

        others=["timeStamp","evento"]
        
        # 将特征进行拼接
        feature=[]
        for i in [acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,others]:
            feature.extend(i)

        feature=','.join(feature)
        with open(new_file,'w+') as fw:
            fw.write(feature+"\n")
        
        # 处理窗口
        z=0
        for i in range(data.shape[0]//k-3):
            # 取出每个窗口的最后一个值进行赋值
            timestamp=data.loc[w+(z-1)*k-1,'timestamp']
            z+=1
            add=[timestamp]
            evento='BUMP'
            # 用来暂时存放指定窗口大小处理后的数据
            accx=[]
            accy=[]
            accz=[]
            gyrox=[]
            gyroy=[]
            gyroz=[]
            addx=[]
            addy=[]
            accx_temp=data.loc[i*k:w+i*k-1,'acc_x'].tolist()
            accy_temp=data.loc[i*k:w+i*k-1,'acc_y'].tolist()
            accz_temp=data.loc[i*k:w+i*k-1,'acc_z'].tolist()
            gyrox_temp=data.loc[i*k:w+i*k-1,'gyro_x'].tolist()
            gyroy_temp=data.loc[i*k:w+i*k-1,'gyro_y'].tolist()
            gyroz_temp=data.loc[i*k:w+i*k-1,'gyro_z'].tolist()
            # 逆序求解
            # 均值
            for i in range(w//k-1,-1,-1):
                accx.append(np.mean(accx_temp[i*k:]))
                accy.append(np.mean(accy_temp[i*k:]))
                accz.append(np.mean(accz_temp[i*k:]))
                gyrox.append(np.mean(gyrox_temp[i*k:]))
                gyroy.append(np.mean(gyroy_temp[i*k:]))
                gyroz.append(np.mean(gyroz_temp[i*k:]))
           # print(accx)
           # break
            
            # 中位数
            for i in range(w//k-1,-1,-1):
                accx.append(np.median(accx_temp[i*k:]))
                accy.append(np.median(accy_temp[i*k:]))
                accz.append(np.median(accz_temp[i*k:]))
                gyrox.append(np.median(gyrox_temp[i*k:]))
                gyroy.append(np.median(gyroy_temp[i*k:]))
                gyroz.append(np.median(gyroz_temp[i*k:]))
            
            # 方差
            for i in range(w//k-1,-1,-1):
                accx.append(np.std(accx_temp[i*k:]))
                accy.append(np.std(accy_temp[i*k:]))
                accz.append(np.std(accz_temp[i*k:]))
                gyrox.append(np.std(gyrox_temp[i*k:]))
                gyroy.append(np.std(gyroy_temp[i*k:]))
                gyroz.append(np.std(gyroz_temp[i*k:]))
            
            # tendency
            target=1
            for val in [accx[0],accy[0],accz[0],gyrox[0],gyroy[0],gyroz[0]]:
                if val==0:
                    target=0
                    break
            if target==0:
                break
            
            for i in range(1,w//k):
                accx.append(accx[i]/accx[0])
                accy.append(accy[i]/accy[0])
                accz.append(accz[i]/accz[0])
                gyrox.append(gyrox[i]/gyrox[0])
                gyroy.append(gyroy[i]/gyroy[0])
                gyroz.append(gyroz[i]/gyroz[0])
            
            # 整合
            other=[timestamp,evento]
        
            
            # 将所有数据整合并存进文件
            newdata=[]
            for i in [accx,accy,accz,gyrox,gyroy,gyroz,add]:
                newdata.extend(i)
                
            newdata=list(map(str,newdata))

            finaldata=",".join(newdata)
            finaldata=finaldata+","+evento
 

            with open(new_file,'a+') as fw:
                fw.write(finaldata+"\n")   


# 注意：因为事先手动进行了标注所以需要分别读取
import os
read_total=os.listdir(os.getcwd()+"//Original Data//Processed Data//")
for f in read_total:
    read_path=os.getcwd()+"//Original Data//Processed Data//"+f+"//"
    save_path=os.getcwd()+"//Processed Data//classify_data//BUMP//"
    process_window(read_path,save_path,20,5)

### 划分测试集和训练集

In [75]:
# 将所有数据进行分包（用于划分训练集与测试集
import pandas as pd

def file_split(save_path,read_path):
    f=get_filenames(read_path)
    for i in f:
        if i[-4:]!='.csv':
            continue
        data=pd.read_csv(read_path+i)
        if i[29:-4]=="COLLISION":
            data.to_csv(save_path+"//COLLISION//"+i,index=None)
        elif i[29:-4]=='ACC':
            data.to_csv(save_path+"//ACC//"+i,index=None)
        elif i[29:-4]=='BRAKE':
            data.to_csv(save_path+"//BRAKE//"+i,index=None)
        elif i[29:-4]=='TURN':
            data.to_csv(save_path+"//TURN//"+i,index=None)
        elif i[29:-4]=='LINE_CHANGE':
            data.to_csv(save_path+"//LINE_CHANGE//"+i,index=None)
        elif i[27:-4]=='BUMP':
            data.to_csv(save_path+"//BUMP//"+i,index=None)
            
# 将数据存入指定文件夹
import os
read_path=os.getcwd()+"//Processed Data//processed_window_data//"
save_path=os.getcwd()+"//Processed Data//classify_data//"
file_split(save_path,read_path)

### 最终目录为data
### 目录下包含测试集和训练集两个文件夹
### 每个文件夹下存放各自的数据集

In [82]:
def train_test_split(test_size,read_path,save_path,classify=None):
    files=get_filenames(read_path) 
    n=0 #查看训练集样本数
    tag=0 #用于合并训练集
    for f in files: #获取文件夹中的文件名
        if '.' not in f: #过滤ipython文件
            if classify==None:
                filename=os.listdir(read_path+f) #某一个文件夹所有文件名
                test_files=filename[:int(round((test_size)*len(filename),0))]
                train_files=[i for i in filename if i not in test_files]
            else:
                if f not in classify:
                    continue
                else:
                    filename=os.listdir(read_path+f) #某一个文件夹所有文件名
                    test_files=filename[:int(round((test_size)*len(filename),0))]
                    train_files=[i for i in filename if i not in test_files]

            for tr_f in train_files:
                df=pd.read_csv(read_path+f+"//"+tr_f)
                # 合并数据
                if tag==0:
                    tempdata=df
                    tag=1
                else:
                    tempdata=pd.concat([tempdata,df],axis=0)
                df.to_csv(save_path+"//train//"+tr_f,index=None)
                n+=df.shape[0]
            
            tempdata.to_csv(save_path+"//train//train.csv",index=None)
            for te_f in test_files:
                dt=pd.read_csv(read_path+f+"//"+te_f)
                # dt.index=range(dt.shape[0])
                dt.to_csv(save_path+"//Test//"+te_f,index=None)
    return n

import os
# 七分类
save_path=os.getcwd()+"//Processed Data//Train_Test_Data//"
read_path=os.getcwd()+"//Processed Data//classify_data//"
train_test_split(0.3,read_path,save_path) #划分训练集与测试集

# 三分类
t_save_path=os.getcwd()+"//Processed Data//Three_classify//"
t_read_path=os.getcwd()+"//Processed Data//classify_data//"
train_test_split(0.3,t_read_path,t_save_path,classify=['BUMP','COLLISION']) #划分训练集与测试集

7748

In [77]:
# 对训练集和测试集进行贴标签处理（多分类）
import os
train_data=pd.read_csv(os.getcwd()+"//Processed Data//Train_Test_Data//train//train.csv")
train_data

Unnamed: 0,acc_x_mean_1,acc_x_mean_2,acc_x_mean_3,acc_x_mean_4,acc_x_median_1,acc_x_median_2,acc_x_median_3,acc_x_median_4,acc_x_std_1,acc_x_std_2,...,gyro_z_median_4,gyro_z_std_1,gyro_z_std_2,gyro_z_std_3,gyro_z_std_4,gyro_z_tendency_1,gyro_z_tendency_2,gyro_z_tendency_3,timeStamp,evento
0,0.115088,0.105260,0.110842,0.109612,0.118883,0.107558,0.116278,0.114882,0.017393,0.018115,...,-16.709502,0.207187,0.307210,0.439706,0.439363,1.005763,1.021625,1.029445,1.625619e+09,Normal
1,0.127857,0.121473,0.112792,0.115095,0.127463,0.122582,0.116278,0.117798,0.009896,0.015523,...,-16.143802,0.538806,0.514598,0.524017,0.628650,1.020348,1.031129,1.047400,1.625619e+09,Normal
2,0.145079,0.136468,0.129341,0.120864,0.141056,0.139360,0.132939,0.122582,0.006569,0.012029,...,-15.770779,0.413988,0.845247,0.937576,0.951106,1.049639,1.081097,1.100123,1.625619e+09,Normal
3,0.150091,0.147585,0.141009,0.134529,0.152189,0.150301,0.141056,0.136219,0.010391,0.009047,...,-14.497734,0.613915,1.237493,1.587467,1.701143,1.095291,1.166455,1.215352,1.625619e+09,Normal
4,0.146590,0.148341,0.147253,0.142404,0.151844,0.151940,0.150301,0.142264,0.009307,0.010018,...,-13.100437,0.647050,1.144901,1.789915,2.191438,1.096950,1.205113,1.294473,1.625619e+09,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8430,-0.154969,-0.154615,-0.154174,-0.116596,-0.154939,-0.154697,-0.154349,-0.154572,0.000921,0.000868,...,-0.769851,0.006165,0.013371,0.011820,0.041053,1.011887,1.010623,0.996873,1.625177e+09,Normal
8431,-0.154429,-0.154699,-0.154553,-0.154238,-0.154570,-0.154697,-0.154573,-0.154460,0.000480,0.000782,...,-0.785071,0.000022,0.007551,0.011018,0.010391,0.992146,0.997328,0.996060,1.625177e+09,Normal
8432,-0.155211,-0.154820,-0.154870,-0.154718,-0.154941,-0.154695,-0.154821,-0.154697,0.000860,0.000799,...,-0.785095,0.000083,0.007636,0.011839,0.012150,0.990459,0.982142,0.983723,1.625177e+09,Normal
8433,-0.154868,-0.155039,-0.154836,-0.154869,-0.155174,-0.155058,-0.154821,-0.154876,0.001131,0.001019,...,-0.785081,0.007549,0.013342,0.010988,0.011778,1.015758,1.014450,1.009822,1.625177e+09,Normal


### 对训练集和测试集进行贴标签处理（多分类） 

In [78]:
train_labels=train_data['evento'].unique().tolist()
train_dl=train_data['evento'].apply(lambda x:train_labels.index(x))


# 导入需要的包
# 模型包
from sklearn.ensemble import RandomForestClassifier as RFC #随机森林
from sklearn.tree import DecisionTreeClassifier #决策分类树
from sklearn.neural_network import MLPClassifier # 多层感知机
from sklearn.svm import SVC #支持向量机中的一种
from sklearn.model_selection import GridSearchCV #网格搜索  

# 数据划分以及评分包
from sklearn.model_selection import cross_val_score #交叉验证
from sklearn.metrics import roc_auc_score as auc,recall_score as recall

# 数据处理以及计算画图的包
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

train_labels #标签

['Normal', 'ACC', 'BRAKE', 'BUMP', 'COLLISION', 'LINE_CHANGE', 'TURN']

### 建立模型并进行评估

In [79]:
%%time
def randForst(train_x,test_x,train_dl,test_dl):
    
    clf = RFC(n_estimators=200,max_depth=100, random_state=0)
    clf.fit(train_x,train_dl)
    score=clf.score(test_x,test_dl)
    y_pred=clf.predict_proba(test_x)
    y_pred=y_pred.tolist()
    y_pred=[y_pred[i].index(max(y_pred[i])) for i in range(len(y_pred))]
       
    t=sorted(y_pred,key=lambda x: y_pred.count(x),reverse=True) #取出占比最多的标签(正常除外)
    auc=0
    print("预测标签：",y_pred)
    print("原始标签：",test_dl)
    test=test_dl
    res=0
    for i in range(len(test_dl)):
        if test[i]==y_pred[i]:
            res+=1
    auc=res/len(y_pred)
    return clf,score,auc,res,t #预测最多的标签


# 批量读取文件，计算得分
import os
path=os.getcwd()+"//Processed Data//Train_Test_Data//test//"
filename=os.listdir(path)


s=0 #记录判断正确样本数 
l=0 #记录总的样本数

y_pred=[[] for i in range(len(train_labels))]
y_true=[[] for i in range(len(train_labels))] #分文件

# 汇总
y_true_t=[]
y_pred_t=[]


for f in filename:
    test_dl=[]
    x_test=pd.read_csv(path+f)
    test_x=x_test.iloc[:,:-2]
    for k in range(x_test.shape[0]):
        test_dl.append(train_labels.index(x_test.loc[k,'evento']))
    
    clf,score,auc,res,t=randForst(train_data.iloc[:,:-2],test_x,train_dl,test_dl)

    if f[27:-4]!="BUMP":            
        index_i=train_labels.index(f[29:-4])
        y_true[index_i].append(f[29:-4])
        y_true_t.append(f[29:-4])
    else:
        index_i=train_labels.index(f[27:-4])
        y_true[index_i].append(f[27:-4])
        y_true_t.append(f[27:-4])
    
    most_p=0
    for i in t:
        if i==0:
            continue
        most_p=i
    y_pred[index_i].append(train_labels[most_p])
    
    y_pred_t.append(train_labels[most_p])
    
    s+=res
    l+=test_x.shape[0]
    print('%s auc: %f'%(f,auc))
    
print("total:auc %f"%(s/l))

预测标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0]
原始标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
2021-06-23-09-42-18-045900-BUMP.csv auc: 0.944444
预测标签： [0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
原始标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
2021-06-23-09-42-27-162263-BUMP.csv auc: 0.805556
预测标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
原始标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
2021-06-23-09-42-36-695837-BUMP.csv auc: 1.000000
预测标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
原始标签： [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041076515-1620602187.0-COLLISION.csv auc: 0.786885
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041076515-1621291586.0-COLLISION.csv auc: 0.786885
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041083446-1625774840.0-COLLISION.csv auc: 0.803279
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041083446-1626139535.0-COLLISION.csv auc: 0.786885
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,

预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041083446-1626913447.0-COLLISION.csv auc: 0.852459
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 4, 4, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3]
原始标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
861193041083735-1622126888.0-COLLISION.csv auc: 0.704918
预测标签： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [80]:
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(y_true_t,y_pred_t)) #查看总的情况

              precision    recall  f1-score   support

         ACC       0.30      0.50      0.37         6
       BRAKE       0.00      0.00      0.00         7
        BUMP       0.76      1.00      0.86        16
   COLLISION       0.81      0.49      0.61        45
 LINE_CHANGE       0.00      0.00      0.00         2
      Normal       0.00      0.00      0.00         0
        TURN       0.00      0.00      0.00         3

    accuracy                           0.52        79
   macro avg       0.27      0.28      0.26        79
weighted avg       0.64      0.52      0.55        79



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
for i in range(len(y_pred)): #查看每个文件的情况
    print(classification_report(y_true[i],y_pred[i]))

### 保存模型


In [None]:
import joblib
joblib.dump(clf,"RFC.dat") #需要保存的模型，保存路径与形式

### 加载模型

In [None]:
load_modl=joblib.load(r'RFC.dat') #可以用来预测和判断