# Pattern Recognition 24H1
#### Runze Ji, Jiashuo Tian, Ziqian Liu

In [13]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from itertools import islice
import numpy as np

TRAIN_FILES_COUNT = 1000
TRAIN_FILES_OFFSET = 0
EPOCHS = 20

TRAIN_FILES_PATH = '../../PR/train'
TEST_DATASET_PATH = '../../PR/test_dataset'
MODEL_PATH = '../../PR/model3.ptm'
LOGS_PATH = '../../PR/eval3.csv'

TRAIN_FILES = os.listdir(TRAIN_FILES_PATH)
TRAIN_FILES_END = TRAIN_FILES_COUNT + TRAIN_FILES_OFFSET
NUMBER_OF_EPOCHS = 0
print(f'[init] Found {len(TRAIN_FILES)} Training Files\n')
print(f'[init] Reading from Index-{TRAIN_FILES_OFFSET} to Index-{TRAIN_FILES_END-1}')
train_files_pb = tqdm(islice(TRAIN_FILES, TRAIN_FILES_OFFSET, TRAIN_FILES_END), '[preproc.loadCSV] Loading CSV Files...')

all_labels = []

for file in train_files_pb:
    file_path = os.path.join(TRAIN_FILES_PATH, file)
    data = pd.read_csv(file_path)
    all_labels.extend(data['type'].unique())

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# 打印标签和对应的编码
for label, encoded_label in zip(label_encoder.classes_, range(len(label_encoder.classes_))):
    print(f"Label: {label} --> Encoded Label: {encoded_label}")
    
X_all = []
y_all = []

train_files_pb = tqdm(islice(TRAIN_FILES, TRAIN_FILES_COUNT),'[preproc.transform] Transforming Data...', TRAIN_FILES_COUNT)

for file in train_files_pb:
    file_path = os.path.join(TRAIN_FILES_PATH, file)
    data = pd.read_csv(file_path)

    # 转换时间列，提取特征等
    data['time'] = pd.to_datetime(data['time'])
    data['hour'] = data['time'].dt.hour
    #data['day_of_week'] = data['time'].dt.dayofweek
    data['month'] = data['time'].dt.month

    # 使用转换后的标签
    data['type_encoded'] = label_encoder.transform(data['type'])

    X = data[['lat', 'lon', '速度', '方向', 'hour', 'month']]
    y = data['type_encoded']

    X_all.append(X)
    y_all.append(y)

# 将所有数据合并为一个大的 DataFrame
X = pd.concat(X_all, ignore_index=True)
y = pd.concat(y_all, ignore_index=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

def create_sequences(data, seq_length):
    xs = []
    ys = []
    for i in range(len(data)-seq_length):
        x = data[i:(i+seq_length), :-1]
        y = data[i+seq_length, -1]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# 假设我们有一个序列长度
seq_length = 10  # 这是一个假设值，具体需要根据任务调整


[init] Found 18329 Training Files

[init] Reading from Index-0 to Index-999


[preproc.loadCSV] Loading CSV Files...: 1000it [00:06, 166.41it/s]


Label: 刺网 --> Encoded Label: 0
Label: 围网 --> Encoded Label: 1
Label: 拖网 --> Encoded Label: 2


  data['time'] = pd.to_datetime(data['time'])
[preproc.transform] Transforming Data...: 100%|██████████| 1000/1000 [00:12<00:00, 77.17it/s]


#### Define dataset structure
* Create Datasets for training and verifying

In [9]:
from torch.utils.data import TensorDataset, DataLoader
import torch

# 假设 X_train, y_train, X_test, y_test 已经准备好了
# 将数据转换为 PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# 创建 Dataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# 创建 DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

#### Define Neural Network Structure, Loss Function, and  Optimizer

In [10]:
import torch
import torch.nn as nn

class FishingVesselLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(FishingVesselLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # LSTM输出大小为 (batch, seq_len, hidden_size)
        # 只取序列的最后一个时间步的输出
        _, (hn, _) = self.lstm(x)
        # hn的大小为 (num_layers, batch, hidden_size)，取最后一层的输出
        out = self.fc(hn[-1])
        return out

# 假设的参数
input_size = 6  # 特征数量
hidden_size = 128  # LSTM隐藏层的大小
num_layers = 2  # LSTM层的数量
num_classes = len(label_encoder.classes_)  # 输出的分类数

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'[torch.cuda] Availability: {torch.cuda.is_available()}')

model = FishingVesselLSTM(input_size, hidden_size, num_layers, num_classes).to(device)


import torch.optim as optim

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0015)

[torch.cuda] Availability: False


  from .autonotebook import tqdm as notebook_tqdm


#### Load Pre-trained Model

In [None]:
model = FishingVesselNet(num_features, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.001)

checkpoint = torch.load(MODEL_PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
NUMBER_OF_EPOCHS = checkpoint['epoch']
loss = checkpoint['loss']

##model.eval()
# - or -
model.train()
NUMBER_OF_EPOCHS

#### Train a single epoch and evaluate accuracy

In [11]:
# Train single epoch
train_single_pb = tqdm(train_loader)
train_single_pb.set_description(f'[torch.train.single] Training Single Epoch {NUMBER_OF_EPOCHS + 1}')

for sequences, labels in train_single_pb:  # 假设sequences的形状是(batch, seq_len, input_size)
    optimizer.zero_grad()
    outputs = model(sequences)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
NUMBER_OF_EPOCHS += 1
print(f'[torch.train.step] Epoch {NUMBER_OF_EPOCHS}, Loss: {loss.item()}, Total Number of Epochs: {NUMBER_OF_EPOCHS}')


# Test current accuracy
correct = 0
total = 0

test_loader_pb = tqdm(test_loader)
test_loader_pb.set_description('[torch.test] Testing Accuracy')

with torch.no_grad():
    for inputs, labels in test_loader_pb:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'\n[torch.test] Accuracy on test set: {100 * correct / total}%')

torch.save({
            'epoch': NUMBER_OF_EPOCHS,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, MODEL_PATH)

with open('../../PR/eval.csv', 'a') as eval_file:
    eval_file.writelines(f'{NUMBER_OF_EPOCHS},{format(100 * correct / total, ".2f")},{format(loss.item(), ".2f")}\n')
    eval_file.close()

[torch.train.single] Training Single Epoch 1:   0%|          | 0/21398 [00:00<?, ?it/s]


RuntimeError: size mismatch (got input: [3], target: [64])

#### Evaluate Current Accuracy

In [None]:
# Test current accuracy
correct = 0
total = 0

test_loader_pb = tqdm(test_loader)
test_loader_pb.set_description('[torch.test] Testing Accuracy')

with torch.no_grad():
    for inputs, labels in test_loader_pb:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'\n[torch.test] Accuracy on test set: {100 * correct / total}%')

#### Train Specified Number of Epochs and Evaluate Accuracy on each Epoch

In [None]:
# Train Specified Number of Epochs and Evaluate Accuracy on each Epoch
for epoch in range(EPOCHS):
    train_single_pb = tqdm(train_loader)
    train_single_pb.set_description(f'[torch.train.single] Training Single Epoch {NUMBER_OF_EPOCHS + 1}')

    for inputs, labels in train_single_pb:
        # 清除之前的梯度
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(inputs)
        
        # 计算损失
        loss = criterion(outputs, labels)
        
        # 后向传播和优化
        loss.backward()
        optimizer.step()
    NUMBER_OF_EPOCHS += 1
    print(f'[torch.train.step] Epoch {NUMBER_OF_EPOCHS}, Loss: {loss.item()}, Total Number of Epochs: {NUMBER_OF_EPOCHS}')

    # Test current accuracy
    correct = 0
    total = 0

    test_loader_pb = tqdm(test_loader, '[torch.test] Testing Accuracy')

    with torch.no_grad():
        for inputs, labels in test_loader_pb:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'\n[torch.test.result] Epoch:{NUMBER_OF_EPOCHS}, Loss:{loss.item()}, Accuracy:{100 * correct / total}%\n')

    torch.save({
                'epoch': NUMBER_OF_EPOCHS,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, MODEL_PATH)

    with open(LOGS_PATH, 'a') as eval_file:
        eval_file.writelines(f'{NUMBER_OF_EPOCHS},{format(100 * correct / total, ".2f")},{format(loss.item(), ".2f")}\n')
        eval_file.close()

#### Model Saving

In [None]:
torch.save({
            'epoch': NUMBER_OF_EPOCHS,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, MODEL_PATH)

#### Prediction

In [None]:
TEST_DATASET_PATH = '../../PR/test_dataset'
MODEL_PATH = '../../PR/model2.ptm'
TEST_FILES = os.listdir(TEST_DATASET_PATH)

print(f'[verify] Found {len(TEST_FILES)} Test Files\n')

X_verify_all = []

test_files_pb = tqdm(TEST_FILES,'[preproc.transform] Transforming Data...')

for file in test_files_pb:
    file_path = os.path.join(TEST_DATASET_PATH, file)
    data = pd.read_csv(file_path)

    # 转换时间列，提取特征等
    data['time'] = pd.to_datetime(data['time'])
    data['hour'] = data['time'].dt.hour
    #data['day_of_week'] = data['time'].dt.dayofweek
    data['month'] = data['time'].dt.month

    X_verify = data[['渔船ID', 'lat', 'lon', '速度', '方向', 'hour', 'month']]
    #X_test = data[['lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']]

    X_verify_all.append(X_verify)

# 将所有数据合并为一个大的 DataFrame
X_verify = pd.concat(X_verify_all, ignore_index=True)
X_verify

In [None]:
X_verify_tensor = torch.tensor(X_verify[['lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']].values, dtype=torch.float32)
class TestDataset(Dataset):
    def __init__(self, features):
        self.features = features
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx]
    
verify_dataset = TestDataset(X_verify_tensor)
verify_loader = DataLoader(dataset=verify_dataset, batch_size=64, shuffle=False)
    
# 实例化模型
num_features = X_verify.drop('渔船ID', axis=1).shape[1]
num_classes = 3

import torch.optim as optim
model = FishingVesselNet(num_features, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.001)
checkpoint = torch.load(MODEL_PATH)

model.eval()

In [None]:
predictions = []
verify_loader_pb = tqdm(verify_loader, '[torch.test] Testing Accuracy')
with torch.no_grad():
    for inputs in verify_loader_pb:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())

In [None]:
ids = X_verify['渔船ID'].values

# 创建一个DataFrame来存储预测结果
predictions_df = pd.DataFrame({
    '渔船ID': ids,
    'type': predictions
})

# 显示DataFrame的前几行以确认
predictions_df.head()

In [None]:
X_replaced = predictions_df[['渔船ID', 'type']].replace({0:"刺网", 1:"围网", 2:"拖网"})
X_replaced

In [None]:
X_final = X_replaced.drop_duplicates(subset='渔船ID', keep='first')
X_final

In [None]:
X_final.to_csv('../../PR/submissions/nn/submission_nn.csv', index=False)