This repository has been archived by the owner on Jul 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 258
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7aafdfa
commit b348344
Showing
8 changed files
with
144 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
NUM:dist How far is it from Denver to Aspen ? | ||
LOC:city What county is Modesto , California in ? | ||
HUM:desc Who was Galileo ? | ||
DESC:def What is an atom ? | ||
NUM:date When did Hawaii become a state ? | ||
NUM:dist How tall is the Sears Building ? | ||
HUM:gr George Bush purchased a small interest in which baseball team ? | ||
ENTY:plant What is Australia 's national flower ? | ||
DESC:reason Why does the moon turn orange ? | ||
DESC:def What is autism ? | ||
LOC:city What city had a world fair in 1900 ? | ||
HUM:ind What person 's head is on a dime ? | ||
NUM:weight What is the average weight of a Yellow Labrador ? | ||
HUM:ind Who was the first man to fly across the Pacific Ocean ? | ||
NUM:date When did Idaho become a state ? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
DESC:manner How did serfdom develop in and then leave Russia ? | ||
ENTY:cremat What films featured the character Popeye Doyle ? | ||
DESC:manner How can I find a list of celebrities ' real names ? | ||
ENTY:animal What fowl grabs the spotlight after the Chinese Year of the Monkey ? | ||
ABBR:exp What is the full form of .com ? | ||
HUM:ind What contemptible scoundrel stole the cork from my lunch ? | ||
HUM:gr What team did baseball 's St. Louis Browns become ? | ||
HUM:title What is the oldest profession ? | ||
DESC:def What are liver enzymes ? | ||
HUM:ind Name the scar-faced bounty hunter of The Old West . | ||
NUM:date When was Ozzy Osbourne born ? | ||
DESC:reason Why do heavier objects travel downhill faster ? | ||
HUM:ind Who was The Pride of the Yankees ? | ||
HUM:ind Who killed Gandhi ? | ||
ENTY:event What is considered the costliest disaster the insurance industry has ever faced ? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import mock | ||
|
||
from torchnlp.datasets import trec_dataset | ||
from tests.datasets.utils import urlretrieve_side_effect | ||
|
||
directory = 'tests/_test_data/trec' | ||
|
||
|
||
@mock.patch("urllib.request.urlretrieve")
def test_penn_treebank_dataset_row(mock_urlretrieve):
    """Load both TREC splits and verify the first two parsed training rows."""
    # NOTE(review): despite its name, this test exercises ``trec_dataset``; the name looks
    # copy-pasted from a Penn Treebank test -- confirm before renaming.
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # The rows expected at the head of the training split.
    expected_head = [
        {
            'label_fine': 'manner',
            'label': 'DESC',
            'text': 'How did serfdom develop in and then leave Russia ?'
        },
        {
            'label_fine': 'cremat',
            'label': 'ENTY',
            'text': 'What films featured the character Popeye Doyle ?'
        },
    ]

    # Requesting both splits yields a (train, test) pair.
    loaded = trec_dataset(directory=directory, test=True, train=True, check_file=None)
    train, test = loaded

    # Neither split may come back empty.
    assert len(train) > 0
    assert len(test) > 0

    # Check that rows are parsed correctly.
    assert train[:2] == expected_head
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import os | ||
|
||
from torchnlp.utils import download_urls | ||
from torchnlp.datasets.dataset import Dataset | ||
|
||
|
||
def trec_dataset(directory='data/trec/',
                 train=False,
                 test=False,
                 train_filename='train_5500.label',
                 test_filename='TREC_10.label',
                 check_file='train_5500.label',
                 urls=[
                     'http://cogcomp.org/Data/QA/QC/train_5500.label',
                     'http://cogcomp.org/Data/QA/QC/TREC_10.label'
                 ]):
    """
    Load the Text REtrieval Conference (TREC) Question Classification dataset.

    TREC dataset contains 5500 labeled questions in training set and another 500 for test set. The
    dataset has 6 labels, 50 level-2 labels. Average length of each sentence is 10, vocabulary size
    of 8700.

    More details:
    https://nlp.stanford.edu/courses/cs224n/2004/may-steinberg-project.pdf
    http://cogcomp.org/Data/QA/QC/
    http://www.aclweb.org/anthology/C02-1150

    Citation:
    Xin Li, Dan Roth, Learning Question Classifiers. COLING'02, Aug., 2002.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        test_filename (str, optional): The filename of the test split.
        check_file (str, optional): Check this file exists if download was successful.
        urls (list of str, optional): URLs to download.

    Returns:
        A single ``Dataset`` when exactly one split is requested; otherwise a :class:`tuple`
        holding one ``Dataset`` per requested split, training split first and test split second
        (an empty tuple when no split is requested). Every example is a :class:`dict` with the
        keys ``'label'`` (coarse label), ``'label_fine'`` (level-2 label) and ``'text'``.

    Example:
        >>> from torchnlp.datasets import trec_dataset
        >>> train = trec_dataset(train=True)
        >>> train[:2]
        [{
          'label_fine': 'manner',
          'label': 'DESC',
          'text': 'How did serfdom develop in and then leave Russia ?'
        }, {
          'label_fine': 'cremat',
          'label': 'ENTY',
          'text': 'What films featured the character Popeye Doyle ?'
        }]
    """
    download_urls(directory=directory, urls=urls, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (test, test_filename)]
    split_filenames = [filename for (requested, filename) in splits if requested]
    for filename in split_filenames:
        full_path = os.path.join(directory, filename)
        examples = []
        # Read bytes so the stray byte can be patched before decoding; the context manager
        # closes the file handle even if a line fails to parse.
        with open(full_path, 'rb') as label_file:
            for line in label_file:
                # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
                label, _, text = line.replace(b'\xf0', b' ').strip().decode().partition(' ')
                # Each line starts with "COARSE:fine" followed by the question text.
                label, _, label_fine = label.partition(':')
                examples.append({'label_fine': label_fine, 'label': label, 'text': text})
        ret.append(Dataset(examples))

    # A single requested split is returned bare rather than wrapped in a tuple.
    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters