In [1]:
import warnings
import pandas as pd
import datetime
import random
import re
import numpy as np
import math
from math import radians, cos, sin, asin, atan2, sqrt, degrees
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Prepare dataset

In [None]:
def haversine_distance(lon1, lat1, lon2, lat2):
    """Great-circle distance in metres between two points given in degrees.

    Args:
        lon1, lat1: longitude / latitude of the first point, in degrees.
        lon2, lat2: longitude / latitude of the second point, in degrees.

    Returns:
        Distance in metres, using an Earth radius of 6371.393 km.
    """
    earth_radius_km = 6371.393
    lon_a, lat_a, lon_b, lat_b = (radians(deg) for deg in (lon1, lat1, lon2, lat2))
    half_dlat = (lat_b - lat_a) / 2
    half_dlon = (lon_b - lon_a) / 2
    # Haversine formula: `a` is the squared half-chord length between the points.
    a = sin(half_dlat) ** 2 + cos(lat_a) * cos(lat_b) * sin(half_dlon) ** 2
    central_angle = 2 * asin(sqrt(a))
    return central_angle * earth_radius_km * 1000


def conut_gird_num(tracks_data, grid_distance):
    """Count how many grid cells of side ``grid_distance`` span the data's bounding box.

    The bounding box is taken from the min/max of the ``Lon`` and ``Lat`` columns.
    Its width is the mean of the bottom and top edge lengths and its height is the
    mean of the left and right edge lengths, all measured with
    ``haversine_distance`` (metres).

    Args:
        tracks_data: DataFrame with ``Lon`` and ``Lat`` columns (degrees).
        grid_distance: cell side length in metres; must be positive.

    Returns:
        ``(lon_grid_num, lat_grid_num)``: number of cells along longitude and
        latitude. Each is at least 1, so a bounding box smaller than one cell
        still yields a usable 1-cell grid instead of a zero divisor downstream.

    Raises:
        ValueError: if ``grid_distance`` is not positive.
    """
    if grid_distance <= 0:
        raise ValueError("grid_distance must be positive, got {!r}".format(grid_distance))

    Lon1, Lon2 = tracks_data['Lon'].min(), tracks_data['Lon'].max()
    Lat1, Lat2 = tracks_data['Lat'].min(), tracks_data['Lat'].max()

    # Edge lengths of the bounding box, in metres.
    low = haversine_distance(Lon1, Lat1, Lon2, Lat1)
    high = haversine_distance(Lon1, Lat2, Lon2, Lat2)
    left = haversine_distance(Lon1, Lat1, Lon1, Lat2)
    right = haversine_distance(Lon2, Lat1, Lon2, Lat2)

    # int() truncates towards zero, so clamp to 1: zero cells would make the
    # per-cell gap computed by callers a division by zero.
    lon_grid_num = max(1, int((low + high) / 2 / grid_distance))
    lat_grid_num = max(1, int((left + right) / 2 / grid_distance))
    print("before total:", lon_grid_num, '*', lat_grid_num, '=', lon_grid_num * lat_grid_num, 'grid')
    return lon_grid_num, lat_grid_num


def grid_process(tracks_data, grid_distance):
    """Assign every point a compact grid-cell id and store it in ``grid_ID``.

    The bounding box of the ``Lon``/``Lat`` columns is divided into
    ``lon_grid_num * lat_grid_num`` equal cells (counts come from
    ``conut_gird_num``). Each point gets the row-major id of its cell, and the
    ids that actually occur are then renumbered to ``0 .. n_used - 1`` in
    ascending order.

    Args:
        tracks_data: DataFrame with ``Lon`` and ``Lat`` columns. It is modified
            in place: a ``grid_ID`` column is added or overwritten.
        grid_distance: cell side length in metres.

    Returns:
        The same ``tracks_data`` object, with the ``grid_ID`` column set.
    """
    lon_grid_num, lat_grid_num = conut_gird_num(tracks_data, grid_distance)
    # Guard against a zero cell count so the gap computation below is defined.
    lon_grid_num, lat_grid_num = max(1, lon_grid_num), max(1, lat_grid_num)

    Lon1, Lon2 = tracks_data['Lon'].min(), tracks_data['Lon'].max()
    Lat1, Lat2 = tracks_data['Lat'].min(), tracks_data['Lat'].max()
    Lon_gap = (Lon2 - Lon1) / lon_grid_num
    Lat_gap = (Lat2 - Lat1) / lat_grid_num

    def _cell_index(values, origin, gap, cell_count):
        """Zero-based cell index along one axis, clamped to the last cell."""
        if gap == 0:
            # All points share the same coordinate on this axis: a single cell.
            return pd.Series(0, index=values.index)
        # Offsets are non-negative, so truncation equals floor. Points on the
        # max edge would compute index == cell_count; clamp them into the last
        # cell so they cannot collide with column 0 of the next row or create
        # an extra row.
        return ((values - origin) / gap).astype(int).clip(upper=cell_count - 1)

    col_idx = _cell_index(tracks_data['Lon'], Lon1, Lon_gap, lon_grid_num)
    row_idx = _cell_index(tracks_data['Lat'], Lat1, Lat_gap, lat_grid_num)
    raw_ids = row_idx * lon_grid_num + col_idx + 1  # row-major, 1-based

    # Renumber the occupied cells densely; a dict lookup avoids the
    # list.index() scan per row.
    sort_grid = sorted(raw_ids.unique().tolist())
    compact_id = {raw: new for new, raw in enumerate(sort_grid)}
    tracks_data['grid_ID'] = raw_ids.map(compact_id)
    print('after total:', len(sort_grid), 'grid')
    return tracks_data

# Forward slashes are accepted on both Windows and POSIX. The previous
# backslash form relied on invalid escape sequences ('\d', '\s') and only
# resolved on Windows.
raw_path = '../data/shenzhen/shenzhen-all.csv'
# Grid cell side length in metres (haversine_distance returns metres).
grid_distance = 111

# The raw file is tab-separated.
tracks_data = pd.read_csv(raw_path, sep='\t')
tracks_data = grid_process(tracks_data, grid_distance)

In [None]:
# Vocabulary of distinct GPS points, keyed by the string "<Lon>_<Lat>".
# Index 0 is reserved for padding, so real entries start at 1.
gps_keys = tracks_data.apply(lambda row: str(row['Lon']) + '_' + str(row['Lat']), axis=1)
all_gps = gps_keys.drop_duplicates().values.tolist()
gps2idx = dict(zip(all_gps, range(1, len(all_gps) + 1)))
gps2idx['pad'] = 0


# Vocabulary of distinct grid cells, numbered in order of first appearance.
# Index 0 is again reserved for padding.
all_grid = tracks_data['grid_ID'].drop_duplicates().values.tolist()
grid2idx = dict(zip(all_grid, range(1, len(all_grid) + 1)))
grid2idx['pad'] = 0

In [None]:
num_test = 0        # total number of held-out trajectories (filled in below)
split_ratio  = 0.6  # leading fraction of each user's trajectories used for training
valid_size = 0.5    # fraction used when the held-out indices are split into test / validation


# Collect, per user, the list of trajectories as (grid index sequence, GPS index sequence).
user_list = tracks_data['ObjectID'].drop_duplicates().values.tolist()
user_traj_dict = {key:[] for key in user_list}
# groupby(sort=False) keeps users and trajectories in order of first appearance
# and rows in their original order. Each group already holds exactly the rows
# needed, so no boolean mask over the full frame is rebuilt per trajectory.
for user_id, one_user_data in tqdm(tracks_data.groupby('ObjectID', sort=False)):
    for traj_id, one_traj_data in one_user_data.groupby('TrajNumber', sort=False):
        single_grid_data = [grid2idx[grid] for grid in one_traj_data['grid_ID'].values.tolist()]
        single_traj_data = [gps2idx[str(x)+'_'+str(y)] for x, y in zip(one_traj_data['Lon'].values.tolist(), one_traj_data['Lat'].values.tolist())]
        user_traj_dict[user_id].append((single_grid_data, single_traj_data))

user_traj_train, user_traj_test = {key:[] for key in user_list}, {key:[] for key in user_list}

# Per-user split in stored order: the first int(n * split_ratio) trajectories
# go to train, the remainder to the held-out set.
for key, trajs in user_traj_dict.items():
    cut = int(len(trajs) * split_ratio)
    user_traj_train[key] = trajs[:cut]
    user_traj_test[key] = trajs[cut:]
    num_test += len(trajs) - cut


print('test::', num_test)

# Prepare DataLoader

In [5]:
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

class myDataset(Dataset):
    """Flat list of trajectories taken from the train or held-out split.

    Reads the module-level ``user_traj_train`` / ``user_traj_test`` dicts, whose
    values are lists of ``(grid index sequence, GPS index sequence)`` pairs.
    The dict key of each trajectory is stored as its label.
    """

    def __init__(self, train):
        """Flatten the chosen split; ``train`` truthy selects ``user_traj_train``."""
        source = user_traj_train if train else user_traj_test
        self.x_seq, self.g_seq, self.label = [], [], []
        for user, trajectories in source.items():
            for grid_ids, gps_ids in trajectories:
                self.x_seq.append(gps_ids)
                self.g_seq.append(grid_ids)
                # NOTE(review): the raw dict key is used as the class label and is
                # later passed to torch.LongTensor in collate_fn - confirm the keys
                # are integer class ids.
                self.label.append(user)

    def __getitem__(self, index):
        """Return ``(gps_seq, grid_seq, label, length of gps_seq)`` for one trajectory."""
        gps_ids = self.x_seq[index]
        return gps_ids, self.g_seq[index], self.label[index], len(gps_ids)

    def __len__(self):
        """Number of trajectories in this split."""
        return len(self.x_seq)

def collate_fn(batch):
    """Turn a list of dataset items into padded LongTensors.

    Items are ``(gps_seq, grid_seq, label, length)``. The batch is ordered by
    descending length, and both sequences are right-padded with 0 up to the
    longest grid sequence in the batch.

    Returns:
        ``(x_contents, g_contents, labels, input_lengths)`` as LongTensors.
    """
    ordered = sorted(batch, key=lambda sample: sample[-1], reverse=True)
    x_contents, g_contents, labels, input_lengths = zip(*ordered)
    width = max(len(seq) for seq in g_contents)

    def _pad(sequences):
        # A non-positive pad count yields an empty list, so full-length
        # sequences pass through unchanged.
        return torch.LongTensor([seq + [0] * (width - len(seq)) for seq in sequences])

    return _pad(x_contents), _pad(g_contents), torch.LongTensor(labels), torch.LongTensor(input_lengths)


# Datasets over the train split and the held-out split.
train_dataset = myDataset(train=True)
test_dataset = myDataset(train=False)

# Shuffle the held-out indices with a fixed seed, then cut them into two
# parts: the first `split` indices are the test part, the rest validation.
np.random.seed(555)
indices = list(range(num_test))
np.random.shuffle(indices)
split = int(np.floor(num_test * valid_size))
test_idx = indices[:split]
valid_idx = indices[split:]

# Both samplers draw from test_dataset, restricted to their own index subset.
test_sampler = SubsetRandomSampler(test_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Prepare model

In [6]:
# Model sizes.
grid_embedding_dim = 128
n_hidden = 128
# One output class per key of user_traj_dict (i.e. per user).
n_class = len(user_traj_dict)
print(n_class)

71


In [7]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

class T3S(nn.Module):
    """Trajectory classifier combining an LSTM and a Transformer encoder over grid ids.

    Both branches consume the padded grid-index batch ``seq_g``. The LSTM's last
    hidden state and the Transformer's mean-pooled output are summed and passed
    through a linear layer to produce per-class log-probabilities.

    Relies on the module-level ``grid2idx``, ``n_hidden`` and ``n_class``.
    """

    def __init__(self):
        super(T3S, self).__init__()
        # LSTM branch: 250-d grid embedding -> unidirectional LSTM.
        # (Alternative kept for reference: embed GPS ids with len(gps2idx) instead.)
        self.emb1 = nn.Embedding(len(grid2idx), 250, padding_idx=0)
        self.lstm = nn.LSTM(input_size=250, hidden_size=n_hidden, batch_first=True, bidirectional=False)
        # Transformer branch: 128-d grid embedding -> 2 encoder layers.
        # The 128-d output is added to the LSTM state, so n_hidden must be 128.
        self.emb2 = nn.Embedding(len(grid2idx), 128, padding_idx=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=16)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, seq_x, seq_g, input_length):
        """Compute class log-probabilities for a batch.

        Args:
            seq_x: GPS index batch; currently unused by the model.
            seq_g: LongTensor (batch, seq) of grid indices, right-padded with 0.
            input_length: true lengths, sorted in descending order (as produced
                by ``collate_fn``), required by ``pack_padded_sequence``.

        Returns:
            Tensor (batch, n_class) of log-softmax scores.
        """
        # LSTM branch: packing makes the final hidden state ignore padding.
        lstm_in = self.emb1(seq_g)
        x_packed = pack_padded_sequence(lstm_in, input_length, batch_first=True)
        _, (h_n, _) = self.lstm(x_packed)
        lstm_output = h_n[-1, :, :]

        # Transformer branch. The encoder layer is built with the default layout
        # (seq, batch, emb), so the batch-first embeddings are transposed before
        # the call; otherwise self-attention would mix different samples of the
        # batch instead of the time steps of one trajectory.
        pad_mask = seq_g.eq(0)  # (batch, seq), True where padded
        grid_emb = self.emb2(seq_g).transpose(0, 1)
        encoded = self.transformer_encoder(grid_emb, src_key_padding_mask=pad_mask).transpose(0, 1)

        # Mean-pool over real positions only, so the result does not depend on
        # how much padding the batch happens to contain.
        keep = (~pad_mask).unsqueeze(-1).type_as(encoded)
        attn_output = (encoded * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)

        output = self.fc(lstm_output + attn_output)
        return F.log_softmax(output, dim=-1)

In [8]:
def accuracy_1(pred, targ):
    """Top-1 accuracy: fraction of rows whose highest-scoring class equals the target.

    Args:
        pred: tensor (batch, n_class) of class scores.
        targ: tensor (batch,) of target class indices.
    """
    top_class = pred.max(dim=1).indices
    n_correct = (top_class == targ).float().sum().item()
    return n_correct / targ.size(0)

def accuracy_5(pred,targ):
    """Top-5 accuracy: fraction of rows whose target is among the 5 highest scores.

    Args:
        pred: tensor (batch, n_class) of class scores.
        targ: tensor (batch,) of target class indices.

    With fewer than 5 classes every class is in the top-k, so k is capped at
    the number of classes instead of letting ``topk`` raise.
    """
    k = min(5, pred.size(1))
    topk_idx = torch.topk(pred, k=k, dim=1, largest=True, sorted=True)[1]
    # Compare each row's top-k indices against its target in one vectorised op.
    hits = (topk_idx == targ.unsqueeze(1)).any(dim=1)
    return hits.float().sum().item() / targ.size()[0]

In [9]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='../temp/checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): File the best model's state_dict is written to.
                            Default: '../temp/checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation loss.

        Saves a checkpoint on the first call and whenever the loss improves;
        otherwise increments the counter and sets ``early_stop`` once it
        reaches ``patience``.
        """
        # Work with the negated loss so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        import os  # local import: the notebook's import cell does not bring in os

        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        # Create the target directory if needed so the first save cannot fail
        # just because the folder has not been created yet.
        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [10]:
from sklearn.metrics import f1_score, precision_score, recall_score

def _run_epoch(model, dataloader, device, train):
    """Run one pass over ``dataloader``; update weights when ``train`` is True.

    Returns ``(mean loss, mean acc@1, mean acc@5, macro-P, macro-R, macro-F1)``.
    Loss and accuracies are averaged over batches; the macro scores are
    computed over all predictions of the pass.
    """
    loss_list, acc1_list, acc5_list, y_predict_list, y_true_list = [], [], [], [], []
    for seq_x, seq_g, y_true, input_length in dataloader:
        # seq_x is not used by the model and input_length must stay on the CPU
        # for pack_padded_sequence, so only these two tensors are moved.
        seq_g, y_true = seq_g.to(device), y_true.to(device)
        with torch.set_grad_enabled(train):
            y_predict = model(seq_x, seq_g, input_length)
            loss = F.nll_loss(y_predict, y_true)

        y_predict_list.extend(torch.max(y_predict, 1)[1].cpu().numpy().tolist())
        y_true_list.extend(y_true.cpu().numpy().tolist())
        acc1_list.append(accuracy_1(y_predict, y_true))
        acc5_list.append(accuracy_5(y_predict, y_true))
        loss_list.append(loss.item())

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return (np.mean(loss_list), np.mean(acc1_list), np.mean(acc5_list),
            precision_score(y_true_list, y_predict_list, average='macro'),
            recall_score(y_true_list, y_predict_list, average='macro'),
            f1_score(y_true_list, y_predict_list, average='macro'))


def train_model(model, patience, n_epochs):
    """Train ``model`` with early stopping on the validation loss.

    Uses the module-level ``optimizer``, ``train_dataset``, ``test_dataset``,
    ``valid_sampler`` and ``collate_fn``. After training, the weights of the
    epoch with the lowest validation loss are restored from
    '../temp/checkpoint.pt'.

    Args:
        model: network to train; batches are moved to the device of its parameters.
        patience: epochs without validation improvement before stopping.
        n_epochs: maximum number of epochs.

    Returns:
        ``(model, avg_train_losses, avg_valid_losses)`` with one loss per epoch run.
    """
    avg_train_losses = []
    avg_valid_losses = []
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    device = next(model.parameters()).device

    # The loaders are loop-invariant; shuffling and sampling still happen anew
    # every time a loader is iterated.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True, drop_last=False, collate_fn=collate_fn)
    vaild_dataloader = DataLoader(dataset=test_dataset, batch_size=16, sampler=valid_sampler, drop_last=False, collate_fn=collate_fn)

    for epoch in range(n_epochs):

        model.train()
        train_stats = _run_epoch(model, train_dataloader, device, train=True)
        print('Epoch: {}'.format(epoch))
        print('train_loss:{:.5f}  acc1:{:.4f}  acc5:{:.4f}  Macro-P:{:.4f}  Macro-R:{:.4f}  Macro-F1:{:.4f}'.format(*train_stats))

        model.eval()
        valid_stats = _run_epoch(model, vaild_dataloader, device, train=False)
        print('vaild_loss:{:.5f}  acc1:{:.4f}  acc5:{:.4f}  Macro-P:{:.4f}  Macro-R:{:.4f}  Macro-F1:{:.4f}'.format(*valid_stats))

        avg_train_losses.append(train_stats[0])
        avg_valid_losses.append(valid_stats[0])

        early_stopping(valid_stats[0], model)

        if early_stopping.early_stop:
            print('Early Stop!')
            break

    # Restore the best checkpoint once, after the loop. Doing it inside the loop
    # (after the break) meant an early stop returned the last, non-best weights
    # and every non-improving epoch silently reset the model.
    if avg_valid_losses:
        model.load_state_dict(torch.load('../temp/checkpoint.pt'))

    return  model, avg_train_losses, avg_valid_losses

In [None]:
import random

patience = 10   # early-stopping patience, in epochs
n_epochs = 60   # maximum epochs per run
times = 10      # number of independent runs, each with its own random seed
sum_acc1, sum_acc5, sum_p, sum_r, sum_f1 = 0, 0, 0, 0, 0

# NOTE(review): random.sample is not seeded here, so the set of seeds (and
# therefore the reported averages) differs between notebook runs.
for idx, seed in enumerate(random.sample(range(0, 1000), times)):

    # seed = 555
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    model = T3S().cuda()
    # train_model reads this module-level optimizer.
    optimizer = torch.optim.Adam(model.parameters(),lr=5e-3)
    model, train_loss, valid_loss = train_model(model, patience, n_epochs)

    # Evaluate the trained model on the test part of the held-out split.
    model.eval()
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=16, sampler=test_sampler, drop_last=False, collate_fn=collate_fn)
    loss_list, acc1_list, acc5_list, y_predict_list, y_true_list = [], [], [], [], []
    # No enumerate here: reusing `idx` for batches would shadow the run index.
    for seq_x, seq_g, y_true, input_length in test_dataloader:
        # seq_x, seq_g, y_true = seq_x.cuda(), seq_g.cuda(), y_true.cuda()
        seq_g, y_true = seq_g.cuda(), y_true.cuda()
        with torch.no_grad():
            y_predict = model(seq_x, seq_g, input_length)
            y_predict_list.extend(torch.max(y_predict, 1)[1].cpu().numpy().tolist())
            y_true_list.extend(y_true.cpu().numpy().tolist())
            loss = F.nll_loss(y_predict, y_true)
            loss_list.append(loss.item())
            acc1_list.append(accuracy_1(y_predict, y_true))
            acc5_list.append(accuracy_5(y_predict, y_true))
    p = precision_score(y_true_list, y_predict_list, average='macro')
    r = recall_score(y_true_list, y_predict_list, average='macro')
    f1 = f1_score( y_true_list, y_predict_list, average='macro')
    sum_acc1 += np.mean(acc1_list)
    sum_acc5 += np.mean(acc5_list)
    sum_p += p
    sum_r += r
    sum_f1 += f1
    print("Testset result:\n loss_test:{:.5f} \t acc1_test:{:.4f} \t  acc5_test:{:.4f} \t Macro-P:{:.4f} \t Macro-R:{:.4f} \t Macro-F1:{:.4f}".format(np.mean(loss_list), np.mean(acc1_list), np.mean(acc5_list), p, r ,f1))

# Average over the number of runs actually performed (`times`), not a
# hard-coded 5, which doubled every reported mean.
print('acc1:{:.4f}, acc5:{:.4f}, p:{:.4f}, r:{:.4f}, f1:{:.4f}'.format(sum_acc1/times, sum_acc5/times, sum_p/times, sum_r/times, sum_f1/times))