Skip to content

Commit

Permalink
Better filenames
Browse files Browse the repository at this point in the history
  • Loading branch information
NirantK committed Jun 14, 2018
1 parent 39ad40d commit b55822d
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Scratchpad.ipynb → Part-10 Bonus Content.ipynb
Expand Up @@ -183,7 +183,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down
298 changes: 298 additions & 0 deletions Playground.ipynb
@@ -0,0 +1,298 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torchtext import data\n",
"from torchtext import datasets\n",
"from torchtext.vocab import GloVe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.data.dataset import Dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import logging\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import spacy\n",
"import torch\n",
"from joblib import Memory\n",
"from torchtext import data\n",
"from sklearn.model_selection import KFold"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"NLP = spacy.load('en')\n",
"MAX_CHARS = 20000\n",
"LOGGER = logging.getLogger(\"imdb_dataset\")\n",
"MEMORY = Memory(cachedir=\"cache/\", verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def tokenizer(review):\n",
" review = re.sub(\n",
" r\"[\\*\\\"“”\\n\\\\\\+\\-\\/\\=\\(\\)‘•:\\[\\]\\|’\\!;]\", \" \", str(review))\n",
" review = re.sub(r\"[ ]+\", \" \", review)\n",
" review = re.sub(r\"\\!+\", \"!\", review)\n",
" review = re.sub(r\"\\,+\", \",\", review)\n",
" review = re.sub(r\"\\?+\", \"?\", review)\n",
" if (len(review) > MAX_CHARS):\n",
" review = review[:MAX_CHARS]\n",
" return [x.text for x in NLP.tokenizer(review) if x.text != \" \"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This',\n",
" 'is',\n",
" 'an',\n",
" 'amazing',\n",
" 'review',\n",
" ',',\n",
" 'but',\n",
" 'ca',\n",
" \"n't\",\n",
" '54help',\n",
" 'it']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer(\"This is an amazing review, but can't 54help!it\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def prepare_csv(train_csv_path=\"data/train.csv\", test_csv_path=\"data/test.csv\", VAL_RATIO = 0.2, seed=37):\n",
" \n",
" df_train = pd.read_csv(train_csv_path)\n",
" df_train[\"text\"] = df_train.text.str.replace(\"\\n\", \" \")\n",
" idx = np.arange(df_train.shape[0])\n",
" np.random.seed(seed)\n",
" np.random.shuffle(idx)\n",
" val_size = int(len(idx) * VAL_RATIO)\n",
" df_train.iloc[idx[val_size:], :].to_csv(\n",
" \"cache/dataset_train.csv\", index=False)\n",
" df_train.iloc[idx[:val_size], :].to_csv(\n",
" \"cache/dataset_val.csv\", index=False)\n",
" \n",
" # repeat this for test\n",
" df_test = pd.read_csv(test_csv_path)\n",
" df_test[\"text\"] = df_test.text.str.replace(\"\\n\", \" \")\n",
" df_test.to_csv(\"cache/dataset_test.csv\", index=False)\n",
"\n",
"prepare_csv() "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"@MEMORY.cache\n",
"def read_files(fix_length=100, lower=False, vectors=None):\n",
" if vectors is not None:\n",
" # pretrain vectors only support all lower case\n",
" lower = True\n",
" LOGGER.debug(\"Preparing CSV files...\")\n",
" prepare_csv()\n",
" comment = data.Field(\n",
" sequential=True,\n",
" fix_length=fix_length,\n",
" tokenize=tokenizer,\n",
" pad_first=True,\n",
" tensor_type=torch.cuda.LongTensor,\n",
" lower=lower\n",
" )\n",
" LOGGER.debug(\"Reading train csv file...\")\n",
" train = data.TabularDataset(\n",
" path='cache/dataset_train.csv', format='csv', skip_header=True,\n",
" fields=[\n",
"# ('id', None),\n",
" ('text', review),\n",
" ('label', data.Field(\n",
" use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)),\n",
" ])\n",
" LOGGER.debug(\"Reading test csv file...\")\n",
" test = data.TabularDataset(\n",
" path='cache/dataset_test.csv', format='csv', skip_header=True,\n",
" fields=[\n",
"# ('id', None),\n",
" ('text', review)\n",
" ])\n",
" LOGGER.debug(\"Building vocabulary...\")\n",
" review.build_vocab(\n",
" train, test,\n",
" max_size=20000,\n",
" min_freq=50,\n",
" vectors=vectors\n",
" )\n",
" LOGGER.debug(\"Done preparing the datasets\")\n",
"\n",
" return train.examples, test.examples, review"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def get_dataset(fix_length=100, lower=False, vectors=None, n_folds=3, seed=37):\n",
" train_exs, test_exs, review = read_files(\n",
" fix_length=fix_length, lower=lower, vectors=vectors)\n",
"\n",
" kf = KFold(n_splits=n_folds, random_state=seed)\n",
"\n",
" fields = [\n",
"# ('id', None),\n",
" ('text', review),\n",
" ('label', data.Field(\n",
" use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)),\n",
" ]\n",
"\n",
" def iter_folds():\n",
" train_exs_arr = np.array(train_exs)\n",
" for train_idx, val_idx in kf.split(train_exs_arr):\n",
" yield (\n",
" data.Dataset(train_exs_arr[train_idx], fields),\n",
" data.Dataset(train_exs_arr[val_idx], fields),\n",
" )\n",
"\n",
" test = data.Dataset(test_exs, fields[:2])\n",
" return iter_folds(), test"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):\n",
" dataset_iter = data.Iterator(\n",
" dataset, batch_size=batch_size, device=0,\n",
" train=train, shuffle=shuffle, repeat=repeat,\n",
" sort=False\n",
" )\n",
" return dataset_iter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'self' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-11-d2ae5e6566b8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m for examples in get_iterator(\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrepeat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m ):\n\u001b[0;32m 5\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mexamples\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcomment_text\u001b[0m \u001b[1;31m# (fix_length, batch_size) Tensor\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'self' is not defined"
]
}
],
"source": [
"for examples in get_iterator(\n",
" self.train_dataset, batch_size, train=True,\n",
" shuffle=True, repeat=False\n",
" ):\n",
" x = examples.comment_text # (fix_length, batch_size) Tensor\n",
" y = torch.stack([\n",
" examples.toxic, examples.severe_toxic, \n",
" examples.obscene,\n",
" examples.threat, examples.insult, \n",
" examples.identity_hate\n",
" ], dim=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "fastAI",
"language": "python",
"name": "fastai"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit b55822d

Please sign in to comment.