In [5]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb

# 第一步，创造数据集

In [2]:
rootdir = '/mnt/HDD4/lyp/Dataset/team/fornlptest' # 存放数据集的位置
train_iter, test_iter= IMDB(rootdir,split=('train', 'test'))

# 第二步 创建数据处理

In [8]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# 把 iterable 类型数据集转化为 map-style 

train_dataset = to_map_style_dataset(train_iter)
train_dataset = to_map_style_dataset(test_iter)

In [11]:
num_train = int(len(train_dataset) * 0.95)
num_train

23750

In [12]:
split_train, split_valid = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

In [13]:
type(split_train)

torch.utils.data.dataset.Subset

In [16]:
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(split_train), specials=["<unk>", "<BOS>", "<EOS>", "<PAD>"])

In [17]:
len(vocab)

96140

In [None]:
for a, b in split_train:
    print(b)
    break

from collections import Counter
from torchtext.vocab import Vocab

counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))

In [4]:
# 设置，当有未知 token 时候， 用 <unk> 的值代替
vocab.set_default_index(vocab["<unk>"])

In [5]:
len(vocab)

100686

In [6]:
# 检查 token 是否在 vocab 里面
vocab.__contains__('<BOS>')

True

In [8]:
vocab(['<EOS>'])

[2]

In [16]:
from collections import Counter
from torchtext.vocab import Vocab

counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line)) # 统计单词和它出现的次数

# 3. 创建 Pipeline

In [6]:
text_pipeline = lambda x: [vocab['BOS']] + vocab(tokenizer(x)) + [vocab['<EOS>']]
label_pipeline = lambda x: int(x) -1

In [7]:
# 把 string 转化为 index
text_pipeline('Even Ingmar Bergman, arguably their answer to good old boy John Ford')

[0, 65, 14518, 4443, 6, 4636, 75, 1478, 10, 59, 178, 437, 306, 1723, 2]

In [8]:
label_pipeline('10')

9

# 3. 设置 iterator

In [9]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


In [10]:
list(train_iter)

[]

In [3]:
import os
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
_, filename = os.path.split(url)

In [2]:
from torch.utils.data.dataset import random_split

a, b = random_split(range(10), [3, 7])

In [7]:
a

<torch.utils.data.dataset.Subset at 0x7f1904578be0>

In [6]:
print(list(b))

[9, 7, 5, 4, 6, 3, 0]


In [8]:
import os
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
os.path.split(url)

('http://ai.stanford.edu/~amaas/data/sentiment', 'aclImdb_v1.tar.gz')

In [10]:
root = '/home/lyp/team/personal_code/fornlptest/data'
os.path.abspath(root)

'/home/lyp/team/personal_code/fornlptest/data'

In [None]:
filename = 'aclImdb_v1.tar.gz'
root = '/home/lyp/team/personal_code/fornlptest/data'
path = '/home/lyp/team/personal_code/fornlptest/data/aclImdb_v1.tar.gz'

In [11]:
from torchtext.utils import download_from_url, extract_archive
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
data_dir = '/home/lyp/team/personal_code/fornlptest/data'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
dataset_tar = download_from_url(url, root=data_dir,
                                    hash_value=MD5, hash_type='md5')

100%|██████████| 84.1M/84.1M [08:30<00:00, 165kB/s]


In [12]:
dataset_tar

'/home/lyp/team/personal_code/fornlptest/data/aclImdb_v1.tar.gz'

In [14]:
extracted_files = extract_archive(dataset_tar)

In [13]:
os.path.dirname(dataset_tar)

'/home/lyp/team/personal_code/fornlptest/data'

In [15]:
type(extracted_files)

list

In [16]:
len(extracted_files)

100011

In [21]:
extracted_files[188]

'/home/lyp/team/personal_code/fornlptest/data/aclImdb/test/neg/206_2.txt'

In [22]:
from pathlib import Path

a = extracted_files[188]
Path(a).parts

('/',
 'home',
 'lyp',
 'team',
 'personal_code',
 'fornlptest',
 'data',
 'aclImdb',
 'test',
 'neg',
 '206_2.txt')

In [37]:
a = list(Path('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train').glob('*/*.txt'))

In [40]:
a[-1]

PosixPath('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/unsup/49589_0.txt')

In [29]:
b = glob('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/pos/*.txt')
c = glob('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/neg/*.txt')
len(b) + len(c)

25000

In [30]:
len(b)

12500

In [34]:
len(c)

12500

In [1]:
file_path = '/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/pos/1_7.txt'
with open(file_path) as f:
    data = f.read()

In [2]:
data

"If you like adult comedy cartoons, like South Park, then this is nearly a similar format about the small adventures of three teenage girls at Bromwell High. Keisha, Natella and Latrina have given exploding sweets and behaved like bitches, I think Keisha is a good leader. There are also small stories going on with the teachers of the school. There's the idiotic principal, Mr. Bip, the nervous Maths teacher and many others. The cast is also fantastic, Lenny Henry's Gina Yashere, EastEnders Chrissie Watts, Tracy-Ann Oberman, Smack The Pony's Doon Mackichan, Dead Ringers' Mark Perry and Blunder's Nina Conti. I didn't know this came from Canada, but it is very good. Very good!"

In [1]:
from pathlib import Path
data_path = '/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/pos/0_9.txt'

d = Path(data_path)

In [4]:
d.parts

('/',
 'home',
 'lyp',
 'team',
 'personal_code',
 'fornlptest',
 'data',
 'aclImdb',
 'train',
 'pos',
 '0_9.txt')

In [5]:
d

PosixPath('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/pos/0_9.txt')

In [6]:
'pos' in d

TypeError: argument of type 'PosixPath' is not iterable

In [10]:
p1 = Path('/home/lyp/team/personal_code/fornlptest/data/aclImdb/train/pos')

a = list(p1.glob('*.txt'))

In [12]:
type(a[0])

pathlib.PosixPath

In [13]:
type(p1)

pathlib.PosixPath

In [16]:
a[0].parts[-2]

'pos'

In [17]:
for split in ['train', 'test']:
    print(split)

train
test


In [24]:
datas = {}
data_dir = Path('/home/lyp/team/personal_code/fornlptest/data/aclImdb')
for split in ['train', 'test']:
    split_data = []
    
    # 2. 排除 unsup 的影响
    for label in ['neg', 'pos']:
        
        # 3. 读入 dir 中的数据
        for file_path in list((data_dir / split / label).glob('*.txt')):
            
            with file_path.open() as f:
                sentence = f.read()
            
            split_data.append([label, sentence])

    datas[split] = split_data

In [26]:
datas.keys()

dict_keys(['train', 'test'])

In [27]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('basic_english')

In [29]:
def yield_tokens(data_dir):
    for _, text in data_dir:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(datas['train']), specials=['<unk>', '<bos>', '<eos>', '<pad>'])

In [30]:
vocab.set_default_index(vocab['<unk>'])

In [31]:
text_pipeline = lambda x: [vocab['BOS']] + vocab(tokenizer(x)) + [vocab['<EOS>']]
label_pipeline = lambda x: int(x) -1
text_pipeline('Even Ingmar Bergman, arguably their answer to good old boy John Ford')

[0, 65, 14518, 4443, 6, 4636, 75, 1478, 10, 59, 178, 437, 306, 1723, 0]

In [15]:
import torch

#torch.save(vocab, '/home/lyp/team/personal_code/fornlptest/data/v.pth')

In [18]:
voca = torch.load('/home/lyp/team/personal_code/fornlptest/data/v.pth')

In [19]:
len(voca)

100686

In [17]:
text_pipeline = lambda x: [voca['BOS']] + voca(tokenizer(x)) + [voca['<EOS>']]
text_pipeline('Even Ingmar Bergman, arguably their answer to good old boy John Ford')

NameError: name 'voca' is not defined

In [37]:
p2 = Path('/home/lyp/team/personal_code/fornlptest/data/v2.pth')
p2.is_file()

False

In [41]:
label = 'pos'
label = 0 if label == 'neg' else 1

In [42]:
label

1

In [2]:
datas = [[0, 2], [1, 3],[4, 100]]
for d1, d2 in datas:
    print(d1)

0
1
4


In [5]:
import torch
torch.tensor([0], dtype=torch.int64)

tensor([0])

In [8]:
from torch.nn.utils.rnn import pad_sequence

torch.randn(3).shape

torch.Size([3])

In [9]:
deflist = [torch.randn(3),torch.randn(5), torch.randn(7)]
deflist

[tensor([ 0.5388, -0.4979, -0.7980]),
 tensor([0.4048, 0.3887, 0.2158, 0.2000, 0.1697]),
 tensor([ 0.5574,  0.3855,  2.7263,  0.5070,  1.9087, -0.6883,  0.5938])]

In [14]:
pad_sequence(deflist, batch_first=True)

tensor([[ 0.5388, -0.4979, -0.7980,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.4048,  0.3887,  0.2158,  0.2000,  0.1697,  0.0000,  0.0000],
        [ 0.5574,  0.3855,  2.7263,  0.5070,  1.9087, -0.6883,  0.5938]])

In [21]:
from typing import Tuple
('a', 'b') == Tuple

False

In [23]:
from pathlib import Path
p = Path('/home/lyp/team/personal_code/fornlptest/data/aclImdb_v1.tar.gz')
p.is_file()

True

In [24]:
data = [0, 'I love China']

In [25]:
a, b = data

In [26]:
a

0

In [27]:
b

'I love China'

In [5]:
from torchtext.vocab import vocab
from collections import OrderedDict, Counter

# counter = Counter(['neg', 'pos'])
# sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
# ordered_dict = OrderedDict(sorted_by_freq_tuples)
# v1 = vocab(ordered_dict)

In [6]:
tokens = ['e', 'd', 'c', 'b', 'a']
v2 = vocab(OrderedDict([(token, 1) for token in tokens]))

In [3]:
v2.lookup_token(3)

'b'

In [8]:
' '.join(v2.lookup_tokens([2,1,3]))

'c d b'

In [2]:
import torch

logits = torch.randn(10, 3)
hypothesis = logits.topk(1, dim=1).indices
hypothesis.shape

torch.Size([10, 1])

In [4]:
h = hypothesis.squeeze(1)
h.tolist()

[0, 0, 2, 0, 1, 0, 2, 0, 0, 0]