## データの読み込みと加工

### データの読み込み

In [None]:
from pandas._libs.tslibs.vectorized import normalize_i8_timestamps
import io
import pandas as pd
import csv
import time

# Start timing the load.
start = time.perf_counter()

# Read the CSV into a frame whose 402 columns are labelled c00, c01, ..., c401.
# f2 is a placeholder string; set it to the location of the data file.
f2 = "(データのパス)"
col_names = [f'c{index:02d}' for index in range(402)]
auto = pd.read_csv(f2, names=col_names)

# Print the elapsed seconds.
elapsed = time.perf_counter() - start
print(elapsed)
# Bare expression: shown as the cell output when this runs in a notebook.
auto

### 必要のない列を削除

In [None]:
# Column layout used by this cell: each hit k = 0..99 owns four columns,
#   c(4k+1) plane number, c(4k+2) z coordinate,
#   c(4k+3) wire coordinate, c(4k+4) drift time.
# c00 and c401 lie outside the per-hit groups and are never dropped here.
HIT_COUNT = 100
FIELD_OFFSETS = {'plane': 1, 'z': 2, 'wire': 3, 'drift': 4}

# Per-hit fields kept as explanatory variables. Choose any subset of
# 'z', 'wire' and 'drift', e.g. ('wire', 'drift') or ('z',); every field that
# is not listed is dropped for all hits. This replaces the earlier
# hand-written drop lists, which had to be kept in sync by hand.
# Changing the selection changes the number of feature columns (100 per kept
# field), which the model's input size has to match.
feature_fields = ('z', 'wire', 'drift')

unknown_fields = set(feature_fields) - set(FIELD_OFFSETS)
if unknown_fields:
    raise ValueError(f'unknown hit fields: {sorted(unknown_fields)}')

# Names of the columns to remove, in the same cNN format used when loading.
delete_names = [
    'c{0:02d}'.format(4 * hit + offset)
    for hit in range(HIT_COUNT)
    for field, offset in FIELD_OFFSETS.items()
    if field not in feature_fields
]

# This is the line that actually removes the columns.
auto = auto.drop(delete_names, axis=1)

# Shuffle the rows (optional); the fixed seed keeps the order reproducible.
auto = auto.sample(frac=1, ignore_index=True, random_state=42)

# For real data only track / no-track is classified:
# every non-zero value in c401 is collapsed to 1.
auto.loc[auto['c401'] != 0, 'c401'] = 1

print(auto)

### データのクラス割合を調整

In [None]:
import pandas as pd
from sklearn.utils import resample

# Split the rows by their c401 label.
labels = auto['c401']
data_0 = auto[labels == 0]
data_1 = auto[labels == 1]

# Class sizes before balancing.
count_0, count_1 = len(data_0), len(data_1)

# Rows to draw per class: the smaller class size, capped at 100,000
# (so the balanced set holds at most 200,000 rows).
target_count = min(100000, count_0, count_1)

# Draw the same number of rows from each class, without replacement.
sampling = dict(replace=False, n_samples=target_count, random_state=42)
sampled_data_0 = resample(data_0, **sampling)
sampled_data_1 = resample(data_1, **sampling)

# The balanced frame replaces the original one.
auto = pd.concat([sampled_data_0, sampled_data_1])

# Report the class counts before and after balancing.
print(f'Original 0s: {count_0}, Original 1s: {count_1}')
print(f'Sampled 0s: {target_count}, Sampled 1s: {target_count}')

### 入力層に取り込む列のラベルのリストを作成

In [None]:
# Labels of the columns fed to the input layer: every remaining column
# except the two removed below.
col_names = list(auto.columns)
for excluded in ('c00', 'c401'):
    # c401 is the target label. c00 is not a feature either.
    # NOTE(review): the earlier comment here called c00 the plane number, but
    # the split cell stores it as the data number - confirm which is right.
    col_names.remove(excluded)
print(col_names)

## TensorFlowで行う機械学習

### パッケージの読み込み

In [None]:
import sys
sys.setrecursionlimit(2000)
import numpy as np
from __future__ import print_function
from matplotlib import pyplot as plt
import tensorflow as tf
print(tf.__version__)
from tensorflow.python.client import device_lib
import tensorflow.keras as kera
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
import hyperopt
from hyperopt import fmin, tpe, hp
from sklearn.metrics import f1_score
from keras.callbacks import LearningRateScheduler

2.15.0


### 学習用データと検証用データに分割

In [None]:

num_classes = 2

# Explanatory variables, target label, and the c00 column, which is carried
# through the split so each row's data number stays available afterwards.
X, y, c00_info = auto[col_names], auto['c401'], auto['c00']

# 70 % training / 30 % test, applied identically to all three objects.
split_parts = train_test_split(X, y, c00_info, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test, c00_train, c00_test = split_parts

# One-hot encode the labels for the softmax output.
y_train, y_test = [
    kera.utils.to_categorical(part, num_classes) for part in (y_train, y_test)
]

#### ハイパーパラメータ探索

In [None]:
# Time the whole search.
start = time.perf_counter()
epochs = 20  # training epochs per trial

# Step 1: hyperparameter search space.
space = dict(
    batch_size=hp.choice('batch_size', [64, 128, 256, 512]),  # mini-batch size
    layers=hp.randint('layers', 30) + 1,  # depth setting, 1..30
    neurons=hp.choice('neurons', [64, 128, 256, 512,1024]),  # units per hidden layer
    lr=hp.loguniform('lr', -9, -2),  # learning rate, log-uniform in [e^-9, e^-2]
)

# Step 2: 目的関数を定義
def objective(params):
    """Train one candidate network and return its negated F1 score.

    Reads the module-level x_train, y_train, x_test, y_test, epochs and
    num_classes.

    Args:
        params: Mapping holding the sampled 'batch_size', 'layers',
            'neurons' and 'lr' values.

    Returns:
        Minus the F1 score on (x_test, y_test). The value is negated because
        the search minimises its objective, which makes it maximise F1.
    """
    batch_size = params['batch_size']
    layers = params['layers']
    neurons = params['neurons']
    lr = params['lr']

    class_weights = {0: 1.0, 1: 5.0}  # class 1 weighs five times class 0 in the loss

    # Release the Keras state left by the previous trial so that memory does
    # not grow with every model built during the search.
    backend.clear_session()

    # Build the network. The input width follows the training data, so the
    # model keeps working when a different set of feature columns is selected.
    model = Sequential()
    model.add(Dense(300, activation='relu', input_shape=(x_train.shape[1],)))
    model.add(Dropout(0.2))
    # `layers` counts the first and the output layer too, so layers - 2
    # hidden blocks are added (none when layers <= 2).
    for _ in range(layers - 2):
        model.add(Dense(neurons, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))

    # `learning_rate` is the supported keyword. The deprecated `lr` alias is
    # not applied by current Keras optimizers, which left the sampled rate unused.
    model.compile(loss='categorical_crossentropy',
                  optimizer=kera.optimizers.RMSprop(learning_rate=lr),
                  metrics=['accuracy'])

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=0,
              validation_data=(x_test, y_test),
              class_weight=class_weights)  # per-class loss weighting

    # F1 score on the test split is the evaluation metric.
    y_pred = model.predict(x_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)
    f1 = f1_score(y_true, y_pred_classes)

    return -f1

# Step 3: run the TPE search; max_evals sets the number of trials.
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)

# fmin reports hp.choice parameters as indices into their option lists and
# 'layers' as the raw hp.randint draw (before the +1 applied in `space`).
# space_eval maps the result back to the values objective() actually received.
best_params = hyperopt.space_eval(space, best)
print("Best Hyperparameters:", best_params)

print(time.perf_counter() - start)

#### 学習

In [None]:
start = time.perf_counter()

# Remove non-deterministic GPU behaviour so that runs are reproducible.
tf.keras.utils.set_random_seed(1)
tf.config.experimental.enable_op_determinism()

# Training configuration.
num_classes = 2
batch_size = 256
epochs = 8
layers = 21
neurons = 128
lr = 0.001393445233225502

class_weights = {0: 1.0, 1: 6.0}  # class 1 weighs six times class 0 in the loss

model = Sequential()

# The input width follows the training data, so the model keeps working when
# a different set of feature columns is selected.
model.add(Dense(300, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dropout(0.2))

# `layers` counts the first and the output layer too, so layers - 2 hidden
# blocks are added.
for _ in range(layers - 2):
    model.add(Dense(neurons, activation='relu'))
    model.add(Dropout(0.2))

model.add(Dense(num_classes, activation='softmax'))

model.summary()

# `learning_rate` is the supported keyword. The deprecated `lr` alias is not
# applied by current Keras optimizers, which left the rate above unused.
model.compile(loss='categorical_crossentropy',
              optimizer=kera.optimizers.RMSprop(learning_rate=lr),
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test),
                    class_weight=class_weights,
)

# Loss and accuracy on the test split.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Predicted and true class ids: argmax over the class axis.
y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Per-class accuracy: among the rows whose true class is `label`,
# the share that was also predicted as `label`.
class_accuracy = {}
for label in range(num_classes):
    in_class = y_true == label
    hits = np.sum(y_pred_classes[in_class] == label)
    class_accuracy[label] = hits / np.count_nonzero(in_class)
print('Class Accuracy:')
for label, acc in class_accuracy.items():
    print(f'Class {label}: {acc}')

# F1 score from the same predictions.
f1 = f1_score(y_true, y_pred_classes)
print("f1_score",f1)

# One figure per metric, each showing the training and validation curves.
for metric, short in (('accuracy', 'acc'), ('loss', 'loss')):
    plt.plot(history.history[metric], marker='.', label=short)
    plt.plot(history.history['val_' + metric], marker='.', label='val_' + short)
    plt.title('model ' + metric)
    plt.grid()
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend(loc='best')
    plt.show()

# Seconds elapsed since `start` was set.
print(time.perf_counter() - start)

## おまけ

### モデルの保存

In [None]:
import pickle

# Save the trained model with pickle.
filename = 'real_model.pkl'
# The with-block closes the file even if pickling fails; the bare open()
# call used before never closed its handle.
# NOTE: only unpickle this file when it comes from a trusted source.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

### 正解・不正解データを保存

In [None]:
# Write the test rows to four CSV files, split by true class and by whether
# the prediction was right. Each file gets c00 as its first column.
is_wrong = y_pred_classes != y_true
is_right = y_pred_classes == y_true
exports = (
    ('incorrect_data_class1.csv', is_wrong & (y_true == 1)),  # class 1, misclassified
    ('correct_data_class1.csv', is_right & (y_true == 1)),    # class 1, classified correctly
    ('correct_data_class0.csv', is_right & (y_true == 0)),    # class 0, classified correctly
    ('incorrect_data_class0.csv', is_wrong & (y_true == 0)),  # class 0, misclassified
)
for csv_name, selected in exports:
    row_positions = np.where(selected)[0]
    subset = x_test.iloc[row_positions]
    subset.insert(0, 'c00', c00_test.iloc[row_positions])  # prepend the c00 column
    subset.to_csv(csv_name, index=False)

### データ解析

#### ヒット数別0の占有割合

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the four prediction-outcome tables.
incorrect_data_class1 = pd.read_csv('incorrect_data_class1.csv')
correct_data_class1 = pd.read_csv('correct_data_class1.csv')
incorrect_data_class0 = pd.read_csv('incorrect_data_class0.csv')
correct_data_class0 = pd.read_csv('correct_data_class0.csv')

# Stack them into one table.
all_counts = pd.concat([incorrect_data_class1, correct_data_class1, incorrect_data_class0, correct_data_class0])

# Keep a single explanatory variable per hit: drop c00 plus the columns
# c(4k+2) and c(4k+3) of every hit k = 0..99.
delete_names = ['c00'] + [
    'c{0:02d}'.format(4 * hit + offset) for hit in range(100) for offset in (2, 3)
]

all_counts = all_counts.drop(delete_names, axis=1)

# Share of rows holding 0 in each remaining column. The divisor is the actual
# row count; a fixed 60000 is only right for one particular data-set size.
zero_counts = all_counts.eq(0).sum(axis=0) / len(all_counts)

plt.figure(figsize=(10, 6))
# One bin per remaining column, weighted by that column's zero share.
hit_slots = len(zero_counts)
plt.hist(range(hit_slots), bins=hit_slots, weights=zero_counts, alpha=0.5, color='green',
         label='zero fraction')  # labelled so that plt.legend() has an entry

plt.xlabel('hit')
plt.ylabel('Frequency')
plt.legend()
plt.show()

#### ヒット数別正答率またはヒット数確率分布の分析

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def _hits_per_row(frame):
    """Return each row's hit count: 100 minus a third of its zero-valued feature cells.

    The c00 column is left out of the zero count because it is not a feature;
    counting it gave a row with c00 == 0 a fractional hit count that matched
    no bin. Assumes three feature columns per hit - TODO confirm if the
    feature selection changes.
    """
    feature_cells = frame.drop(columns='c00')
    return 100 - (feature_cells == 0).sum(axis=1) / 3


def _rows_per_hit_count(hits):
    """Return a 101-item list: how many rows have a hit count of 0, 1, ..., 100."""
    return [(hits == n).sum(axis=0) for n in range(101)]


# Load the four prediction-outcome tables.
incorrect_data_class1 = pd.read_csv('incorrect_data_class1.csv')
correct_data_class1 = pd.read_csv('correct_data_class1.csv')
incorrect_data_class0 = pd.read_csv('incorrect_data_class0.csv')
correct_data_class0 = pd.read_csv('correct_data_class0.csv')

# Hit count of every row.
incorrect_counts_class1 = _hits_per_row(incorrect_data_class1)
correct_counts_class1 = _hits_per_row(correct_data_class1)
incorrect_counts_class0 = _hits_per_row(incorrect_data_class0)
correct_counts_class0 = _hits_per_row(correct_data_class0)

print(incorrect_counts_class1)

# Number of rows per hit count, for each outcome.
incorrect_countsByhit_class1 = _rows_per_hit_count(incorrect_counts_class1)
correct_countsByhit_class1 = _rows_per_hit_count(correct_counts_class1)
incorrect_countsByhit_class0 = _rows_per_hit_count(incorrect_counts_class0)
correct_countsByhit_class0 = _rows_per_hit_count(correct_counts_class0)

# Rows per hit count and class: element-wise sum of the incorrect and correct
# counts. Adding the two lists with `+` would concatenate them instead.
class1_countsByhit = [incorrect + correct for incorrect, correct in zip(incorrect_countsByhit_class1, correct_countsByhit_class1)]
class0_countsByhit = [incorrect + correct for incorrect, correct in zip(incorrect_countsByhit_class0, correct_countsByhit_class0)]

print(class1_countsByhit)

# Accuracy per hit count; hit counts with no rows become NaN instead of
# dividing by zero.
class1_accuracyByhit = [correct / total if total != 0 else float('nan') for correct, total in zip(correct_countsByhit_class1, class1_countsByhit)]
class0_accuracyByhit = [correct / total if total != 0 else float('nan') for correct, total in zip(correct_countsByhit_class0, class0_countsByhit)]


# Accuracy by hit count.
plt.figure(figsize=(10, 6))
plt.hist(range(101), bins=101, weights=class1_accuracyByhit, alpha=0.7, label='Class 1 Accuracy')
plt.hist(range(101), bins=101, weights=class0_accuracyByhit, alpha=0.5, label='Class 0 Accuracy')
plt.xlabel('Index')
plt.ylabel('Accuracy')
plt.legend()

# Reference lines at accuracy 0.5 and at hit count 18.
plt.hlines(y=0.5, xmin=0, xmax=100, alpha=0.5, color='black', linestyle='--')
plt.vlines(x=18, ymin=0, ymax=1, color='red', linestyle='--')
plt.show()

# Hit-count distribution (alternative plot; uncomment to use).
# plt.figure(figsize=(10, 6))
# plt.hist(range(101), bins=101, weights=class1_countsByhit, alpha=0.7, label='Class 1 Accuracy')
# plt.hist(range(101), bins=101, weights=class0_countsByhit, alpha=0.5, label='Class 0 Accuracy')
# plt.xlabel('Index')
# plt.ylabel('probability')
# plt.legend()
# plt.vlines(x=18, ymin=0, ymax=1, color='red', linestyle='--')
# plt.show()
