In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Fix the TensorFlow and NumPy random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# Data directories
TRAIN_DIR = "/kaggle/input/1111111/data/train"
TEST_DIR = "/kaggle/input/1111111/data/test"

# Hyperparameters / configuration
BATCH_SIZE = 128  # larger batch size to make use of GPU parallelism
EPOCHS = 50  # increased number of epochs
PATIENCE = 10  # early-stopping patience (epochs)
LEARNING_RATE = 0.001
INPUT_SHAPE = (40, 7)  # 40 time steps, 7 features

def load_data_from_dir(directory, expected_shape=None):
    """Load fixed-shape time-series samples stored as CSV files.

    Expected directory layout::

        directory/
            legit/      -> label 0
                file1.csv
                ...
            rise/       -> label 1
                file1.csv
                ...

    Every CSV is read without a header row. Files whose shape differs from
    ``expected_shape``, or whose contents cannot be read or converted to
    float32, are reported on stdout and skipped, so a single bad file never
    aborts the whole load.

    Args:
        directory: Root folder containing the ``legit`` and ``rise``
            sub-folders.
        expected_shape: ``(rows, columns)`` each CSV must have. Defaults to
            the module-level ``INPUT_SHAPE`` when omitted.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float32 array of shape
        ``(n_samples, *expected_shape)`` and ``y`` is an int32 array of shape
        ``(n_samples,)``. Both keep these shapes when no sample is loaded.
    """
    if expected_shape is None:
        expected_shape = INPUT_SHAPE
    expected_shape = tuple(expected_shape)

    class_mapping = {'legit': 0, 'rise': 1}
    X = []
    y = []

    for class_name, class_idx in class_mapping.items():
        class_dir = os.path.join(directory, class_name)
        if not os.path.isdir(class_dir):
            print(f"Warning: Directory {class_dir} not found")
            continue

        # Sorted so the sample order is deterministic across runs.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.csv'):
                continue

            filepath = os.path.join(class_dir, filename)

            try:
                data = pd.read_csv(filepath, header=None).values

                if data.shape != expected_shape:
                    print(f"Warning: File {filepath} has shape {data.shape}, expected {expected_shape}")
                    continue

                # Convert per file, inside the try: a correctly shaped file
                # with non-numeric cells is skipped here instead of crashing
                # the final array construction.
                sample = data.astype(np.float32)
            except (OSError, ValueError, TypeError) as e:
                print(f"Error loading {filepath}: {str(e)}")
                continue

            X.append(sample)
            y.append(class_idx)

    if not X:
        # Keep the rank callers expect even when nothing was loaded.
        return (
            np.empty((0, *expected_shape), dtype=np.float32),
            np.empty((0,), dtype=np.int32),
        )

    return np.array(X, dtype=np.float32), np.array(y, dtype=np.int32)

# Load the training and test sets
print("Loading training data...")
X_train, y_train = load_data_from_dir(TRAIN_DIR)
print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")

print("\nLoading test data...")
X_test, y_test = load_data_from_dir(TEST_DIR)
print(f"Test data shape: {X_test.shape}, Labels shape: {y_test.shape}")

# Fail fast with an explicit message: without this, an empty dataset only
# surfaces later as a less obvious error from train_test_split / Keras.
if len(X_train) == 0:
    raise ValueError(f"No training samples were loaded from {TRAIN_DIR}")
if len(X_test) == 0:
    raise ValueError(f"No test samples were loaded from {TEST_DIR}")

# Check the class balance (count_nonzero is vectorised, unlike builtin sum)
print("\nClass distribution:")
print(f"Train - Legit: {np.count_nonzero(y_train == 0)}, Rise: {np.count_nonzero(y_train == 1)}")
print(f"Test - Legit: {np.count_nonzero(y_test == 0)}, Rise: {np.count_nonzero(y_test == 1)}")

# Hold out 20% of the training data for validation, stratified by label
print("\nSplitting training set into train/validation...")
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)
print(f"Final shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")



# Build the CNN model (with the corrected attention step)
def build_optimized_cnn(input_shape):
    """Assemble the 1-D CNN binary classifier.

    The network is three convolutional stages (64, 128 and 256 filters), a
    sigmoid gate computed from the time-averaged feature map and multiplied
    back onto it, global average pooling, two dense stages, and a single
    sigmoid output unit.

    Args:
        input_shape: Per-sample input shape passed to ``layers.Input``.

    Returns:
        The uncompiled ``models.Model``.
    """

    def conv_stage(tensor, n_filters, width, drop_rate):
        # Two Conv1D + BatchNorm pairs, then halve the time axis and drop out.
        for _ in range(2):
            tensor = layers.Conv1D(
                filters=n_filters,
                kernel_size=width,
                padding='same',
                activation='relu',
            )(tensor)
            tensor = layers.BatchNormalization()(tensor)
        tensor = layers.MaxPooling1D(pool_size=2)(tensor)
        return layers.Dropout(drop_rate)(tensor)

    inputs = layers.Input(shape=input_shape)

    # Convolutional stages: (filters, kernel size, dropout rate)
    feature_map = inputs
    for n_filters, width, drop_rate in ((64, 5, 0.3), (128, 3, 0.4), (256, 3, 0.5)):
        feature_map = conv_stage(feature_map, n_filters, width, drop_rate)

    # Attention: one sigmoid weight per channel, broadcast over the time axis
    n_channels = feature_map.shape[-1]
    gate = layers.GlobalAveragePooling1D()(feature_map)
    gate = layers.Dense(n_channels, activation='sigmoid')(gate)
    gate = layers.Reshape((1, n_channels))(gate)
    weighted = layers.Multiply()([feature_map, gate])

    # Classification head
    head = layers.GlobalAveragePooling1D()(weighted)
    for units in (128, 64):
        head = layers.Dense(units, activation='relu')(head)
        head = layers.BatchNormalization()(head)
        head = layers.Dropout(0.5)(head)

    outputs = layers.Dense(1, activation='sigmoid')(head)

    return models.Model(inputs=inputs, outputs=outputs)

# Build the model
model = build_optimized_cnn(INPUT_SHAPE)

# Print the model architecture
model.summary()

# Compile the model
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07
)

# Binary cross-entropy matches the single sigmoid output unit. The metric
# names given here ('auc', 'pr_auc', ...) are the keys used by the callbacks
# below (with a 'val_' prefix) and by the training-history plots.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='pr_auc', curve='PR')
    ]
)

# Define the callbacks; all three monitor validation PR-AUC (higher is better)
# Stop after PATIENCE epochs without improvement and restore the best weights
early_stopping = callbacks.EarlyStopping(
    monitor='val_pr_auc',
    patience=PATIENCE,
    restore_best_weights=True,
    mode='max'
)

# Halve the learning rate after 5 stagnant epochs, never going below 1e-6
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_pr_auc',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    mode='max'
)

# Write 'best_model.keras' only when the monitored metric improves
model_checkpoint = callbacks.ModelCheckpoint(
    'best_model.keras',
    monitor='val_pr_auc',
    save_best_only=True,
    mode='max'
)

# Train the model
history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr, model_checkpoint],
    verbose=1
)

# Plot the training curves
def plot_training_history(history):
    """Draw train/validation curves for accuracy, loss, ROC AUC and PR AUC.

    The four panels are laid out on a 2x2 grid, saved to
    'training_metrics.png' and then shown.

    Args:
        history: Object whose ``history`` attribute maps metric names
            ('accuracy', 'val_accuracy', 'loss', ...) to per-epoch values.
    """
    # (history key, panel title, y-axis name / legend suffix)
    panels = (
        ('accuracy', 'Accuracy', 'Accuracy'),
        ('loss', 'Loss', 'Loss'),
        ('auc', 'ROC AUC', 'AUC'),
        ('pr_auc', 'PR AUC', 'PR AUC'),
    )
    records = history.history

    plt.figure(figsize=(12, 8))
    for slot, (metric, title, axis_name) in enumerate(panels, start=1):
        plt.subplot(2, 2, slot)
        plt.plot(records[metric], label=f'Train {axis_name}')
        plt.plot(records[f'val_{metric}'], label=f'Validation {axis_name}')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(axis_name)
        plt.legend()

    plt.tight_layout()
    plt.savefig('training_metrics.png')
    plt.show()

plot_training_history(history)

# Evaluate the model on the held-out test set
print("\nEvaluating on test set...")
# return_dict=True pairs every value with its own metric name. Zipping the
# list form with model.metrics_names is fragile: on Keras 3 that attribute
# can report only 'loss' and 'compile_metrics', which silently truncates and
# mislabels the printed results.
test_metrics = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
metrics_names = list(test_metrics)

for name, value in test_metrics.items():
    print(f"Test {name}: {value:.4f}")

# Generate predictions; flatten the (n_samples, 1) sigmoid output so the
# scikit-learn helpers receive 1-D label vectors.
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int).ravel()

# Label ids in the order used by the data loader: 0 = legit, 1 = rise
class_ids = [0, 1]
class_names = ['legit', 'rise']

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, labels=class_ids, target_names=class_names))

# Plot the confusion matrix; fixing `labels` keeps it 2x2 even when one class
# is absent from both the labels and the predictions.
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

# Save the final model
model.save('optimized_time_series_cnn_model.keras')
print("Model saved successfully.")


In [None]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

def load_trained_model(model_path='/kaggle/working/best_model.keras'):
    """Load the trained model stored at ``model_path`` and return it."""
    return load_model(model_path)

def load_and_preprocess_data_from_directory(directory, expected_shape=(40, 7)):
    """Read every non-hidden CSV in ``directory`` into one float32 array.

    Files are processed in sorted name order and read without a header row.
    A file whose shape differs from ``expected_shape``, or that fails to
    load, is reported on stdout and skipped.

    Args:
        directory: Folder to scan for ``.csv`` files.
        expected_shape: ``(rows, columns)`` each file must have.

    Returns:
        ``np.array`` of the accepted samples stacked along a new first axis,
        or an empty 1-D array when the folder contains no CSV files.
    """
    csv_names = []
    for entry in os.listdir(directory):
        if entry.startswith('.') or not entry.endswith('.csv'):
            continue
        csv_names.append(entry)
    csv_names.sort()

    if len(csv_names) == 0:
        print(f"警告: 目录 {directory} 中没有CSV文件！")
        return np.array([])

    samples = []
    for csv_name in csv_names:
        csv_path = os.path.join(directory, csv_name)
        try:
            frame = pd.read_csv(csv_path, header=None)
            if frame.shape == expected_shape:
                samples.append(frame.values.astype('float32'))
            else:
                print(f"跳过 {csv_name}: 形状不符合要求")
        except Exception as err:
            print(f"读取 {csv_name} 时发生错误: {str(err)}")

    return np.array(samples)

def classify_and_print_results(model, data):
    """Run the model on ``data`` and print per-sample and summary results.

    Each flattened prediction is treated as the probability of the 'rise'
    class: values above 0.5 are reported as 'rise', everything else as
    'legit'. The reported confidence is the probability of the chosen class.

    Args:
        model: Object exposing ``predict(data)`` that returns an array of
            probabilities.
        data: Input array; anything that is not 3-D is rejected with a
            message and no prediction is made.

    Returns:
        None. All results are written to stdout.
    """
    if data.ndim != 3:
        print("错误：数据维度不正确")
        return

    # Raw probability output of the model
    probabilities = model.predict(data)
    total = len(probabilities)
    tally = {'rise': 0, 'legit': 0}

    print("\n详细预测结果:")
    for position, rise_prob in enumerate(probabilities.flatten(), start=1):
        # Decide the label and the probability of the chosen class
        if rise_prob > 0.5:
            label, confidence = 'rise', rise_prob
        else:
            label, confidence = 'legit', 1 - rise_prob
        tally[label] += 1

        print(f"文件 {position}:")
        print(f"  ▪ 类别预测: {label.upper()}")
        print(f"  ▪ Rise概率: {rise_prob:.4f}")
        print(f"  ▪ 置信程度: {confidence:.2%}")
        print("-" * 40)

    # Summary statistics
    rise_total = tally['rise']
    legit_total = tally['legit']
    print("\n预测统计摘要:")
    print(f"Rise 类别数量: {rise_total} ({rise_total/total:.1%})")
    print(f"Legit 类别数量: {legit_total} ({legit_total/total:.1%})")
    print(f"总样本数量: {total}")

def main():
    """Load the saved model, read the samples to classify, print predictions.

    Stops early, after printing a message, when the model cannot be loaded or
    when no valid sample is found in the data directory.
    """
    # Paths
    data_dir = '/kaggle/input/fdp-temp/fdp'
    model_path = '/kaggle/working/best_model.keras'

    # Step 1: load the model
    print("🔄 正在加载训练好的模型...")
    try:
        model = load_trained_model(model_path)
        print("✅ 模型加载成功")
    except Exception as load_error:
        print(f"❌ 模型加载失败: {str(load_error)}")
        return

    # Step 2: load the data
    print(f"\n📂 正在从 {data_dir} 加载数据...")
    samples = load_and_preprocess_data_from_directory(data_dir)
    sample_count = len(samples)
    if sample_count == 0:
        print("⚠️ 未找到有效数据")
        return
    print(f"✔️ 成功加载 {sample_count} 个样本")

    # Step 3: predict
    print("\n🔮 开始执行预测...")
    classify_and_print_results(model, samples)


if __name__ == "__main__":
    main()


In [None]:
#评估v2
import numpy as np
import pandas as pd
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

def load_data_from_directory(directory, label, expected_shape=(40, 7)):
    """Load the non-hidden CSV files of one class folder.

    Files are processed in sorted name order and read without a header row.
    A file whose shape differs from ``expected_shape``, or that fails to
    load, is reported on stdout, counted as skipped, and left out.

    Args:
        directory: Folder containing the CSV files.
        label: Label assigned to every sample loaded from this folder.
        expected_shape: ``(rows, columns)`` each file must have.

    Returns:
        ``(data_array, labels_array)``: the accepted samples as float32
        arrays stacked along a new first axis, and one label per sample.
        Both are empty 1-D arrays when the folder contains no CSV files.
    """
    # Collect the candidate files, ignoring hidden ones
    csv_names = [
        entry for entry in os.listdir(directory)
        if not entry.startswith('.') and entry.endswith('.csv')
    ]
    csv_names.sort()

    if len(csv_names) == 0:
        print(f"警告: 目录 {directory} 中没有CSV文件！")
        return np.array([]), np.array([])

    samples = []
    skipped = 0
    for csv_name in csv_names:
        csv_path = os.path.join(directory, csv_name)
        try:
            frame = pd.read_csv(csv_path, header=None)
            if frame.shape == expected_shape:
                samples.append(frame.values.astype('float32'))
            else:
                # Shape check failed
                print(f"跳过 {csv_name}: 期望形状 {expected_shape}，实际形状 {frame.shape}")
                skipped += 1
        except Exception as err:
            print(f"读取 {csv_name} 时发生错误: {str(err)}")
            skipped += 1

    loaded = len(samples)
    print(f"从 {directory} 成功加载 {loaded} 个样本，跳过 {skipped} 个无效文件")
    return np.array(samples), np.array([label] * loaded)

def reshape_data(data, timesteps=40, n_features=7):
    """Return ``data`` with shape ``(n_samples, timesteps, n_features)``.

    Args:
        data: Array-like of samples. A 3-D input is returned unchanged;
            anything else (for example a flattened
            ``(n_samples, timesteps * n_features)`` array) is reshaped.
        timesteps: Length of the time axis. Defaults to 40.
        n_features: Number of features per time step. Defaults to 7.

    Returns:
        A 3-D ``numpy.ndarray``.

    Raises:
        ValueError: If the number of elements is not a multiple of
            ``timesteps * n_features`` (raised by ``reshape``).
    """
    data = np.asarray(data)
    # Already 3-D: nothing to do.
    if data.ndim == 3:
        return data
    return data.reshape(-1, timesteps, n_features)

# Load the saved model
model = load_model('optimized_time_series_cnn_model.keras')

# Configuration
TEST_DIR = '/kaggle/input/1111111/data/test'  # replace with your test-set path
EXPECTED_SHAPE = (40, 7)  # adjust to the actual data shape

# Load the test data
print("正在加载legit测试数据...")
legit_data, legit_labels = load_data_from_directory(
    os.path.join(TEST_DIR, 'legit'),
    label=0,
    expected_shape=EXPECTED_SHAPE
)

print("\n正在加载rise测试数据...")
# NOTE(review): the positive class is read from an 'fdp' sub-folder here,
# while the training cell documents a 'legit'/'rise' layout under this same
# TEST_DIR. Confirm which folder name the dataset really uses.
rise_data, rise_labels = load_data_from_directory(
    os.path.join(TEST_DIR, 'fdp'),
    label=1,
    expected_shape=EXPECTED_SHAPE
)

# Make sure both classes were actually loaded
if len(legit_data) == 0:
    raise ValueError("没有加载到legit测试数据，请检查路径和文件格式！")
if len(rise_data) == 0:
    raise ValueError("没有加载到rise测试数据，请检查路径和文件格式！")

# Concatenate the two classes, keeping their original order
X_test = np.vstack((legit_data, rise_data))
y_test = np.hstack((legit_labels, rise_labels))

# Ensure the (samples, time steps, features) layout
X_test = reshape_data(X_test)

# Predict; flatten the (n_samples, 1) sigmoid output once so every metric
# below compares 1-D label vectors.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()

# Compute the confusion matrix a single time and reuse it for the plot.
# Fixing `labels` keeps it 2x2 even if one class is never predicted.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Evaluation metrics
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", cm)

# Accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(cm, cmap='Blues')
fig.colorbar(cax)
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(['legit', 'rise'])
ax.set_yticklabels(['legit', 'rise'])
# Write the count into each cell; white text on the darker cells.
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, str(count), ha='center', va='center',
            color='white' if count > cm.max() / 2 else 'black')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()
