Better filenames

NirantK · Jun 14, 2018 · b55822d · b55822d
1 parent 39ad40d
commit b55822d
Show file tree

Hide file tree

Showing 2 changed files with 299 additions and 1 deletion.
diff --git a/Scratchpad.ipynb → Part-10 Bonus Content.ipynb b/Scratchpad.ipynb → Part-10 Bonus Content.ipynb
@@ -183,7 +183,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.4"
+   "version": "3.6.5"
   }
  },
  "nbformat": 4,

diff --git a/Playground.ipynb b/Playground.ipynb
@@ -0,0 +1,298 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from torchtext import data\n",
+    "from torchtext import datasets\n",
+    "from torchtext.vocab import GloVe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data.dataset import Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import logging\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import spacy\n",
+    "import torch\n",
+    "from joblib import Memory\n",
+    "from torchtext import data\n",
+    "from sklearn.model_selection import KFold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NLP = spacy.load('en')\n",
+    "MAX_CHARS = 20000\n",
+    "LOGGER = logging.getLogger(\"imdb_dataset\")\n",
+    "MEMORY = Memory(cachedir=\"cache/\", verbose=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenizer(review):\n",
+    "    review = re.sub(\n",
+    "        r\"[\\*\\\"“”\\n\\\\…\\+\\-\\/\\=\\(\\)‘•:\\[\\]\\|’\\!;]\", \" \", str(review))\n",
+    "    review = re.sub(r\"[ ]+\", \" \", review)\n",
+    "    review = re.sub(r\"\\!+\", \"!\", review)\n",
+    "    review = re.sub(r\"\\,+\", \",\", review)\n",
+    "    review = re.sub(r\"\\?+\", \"?\", review)\n",
+    "    if (len(review) > MAX_CHARS):\n",
+    "        review = review[:MAX_CHARS]\n",
+    "    return [x.text for x in NLP.tokenizer(review) if x.text != \" \"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['This',\n",
+       " 'is',\n",
+       " 'an',\n",
+       " 'amazing',\n",
+       " 'review',\n",
+       " ',',\n",
+       " 'but',\n",
+       " 'ca',\n",
+       " \"n't\",\n",
+       " '54help',\n",
+       " 'it']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer(\"This is an amazing review, but can't 54help!it\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_csv(train_csv_path=\"data/train.csv\", test_csv_path=\"data/test.csv\", VAL_RATIO = 0.2, seed=37):\n",
+    "    \n",
+    "    df_train = pd.read_csv(train_csv_path)\n",
+    "    df_train[\"text\"] = df_train.text.str.replace(\"\\n\", \" \")\n",
+    "    idx = np.arange(df_train.shape[0])\n",
+    "    np.random.seed(seed)\n",
+    "    np.random.shuffle(idx)\n",
+    "    val_size = int(len(idx) * VAL_RATIO)\n",
+    "    df_train.iloc[idx[val_size:], :].to_csv(\n",
+    "        \"cache/dataset_train.csv\", index=False)\n",
+    "    df_train.iloc[idx[:val_size], :].to_csv(\n",
+    "        \"cache/dataset_val.csv\", index=False)\n",
+    "    \n",
+    "    # repeat this for test\n",
+    "    df_test = pd.read_csv(test_csv_path)\n",
+    "    df_test[\"text\"] = df_test.text.str.replace(\"\\n\", \" \")\n",
+    "    df_test.to_csv(\"cache/dataset_test.csv\", index=False)\n",
+    "\n",
+    "prepare_csv() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@MEMORY.cache\n",
+    "def read_files(fix_length=100, lower=False, vectors=None):\n",
+    "    if vectors is not None:\n",
+    "        # pretrain vectors only support all lower case\n",
+    "        lower = True\n",
+    "    LOGGER.debug(\"Preparing CSV files...\")\n",
+    "    prepare_csv()\n",
+    "    comment = data.Field(\n",
+    "        sequential=True,\n",
+    "        fix_length=fix_length,\n",
+    "        tokenize=tokenizer,\n",
+    "        pad_first=True,\n",
+    "        tensor_type=torch.cuda.LongTensor,\n",
+    "        lower=lower\n",
+    "    )\n",
+    "    LOGGER.debug(\"Reading train csv file...\")\n",
+    "    train = data.TabularDataset(\n",
+    "        path='cache/dataset_train.csv', format='csv', skip_header=True,\n",
+    "        fields=[\n",
+    "#             ('id', None),\n",
+    "            ('text', review),\n",
+    "            ('label', data.Field(\n",
+    "                use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)),\n",
+    "        ])\n",
+    "    LOGGER.debug(\"Reading test csv file...\")\n",
+    "    test = data.TabularDataset(\n",
+    "        path='cache/dataset_test.csv', format='csv', skip_header=True,\n",
+    "        fields=[\n",
+    "#             ('id', None),\n",
+    "            ('text', review)\n",
+    "        ])\n",
+    "    LOGGER.debug(\"Building vocabulary...\")\n",
+    "    review.build_vocab(\n",
+    "        train, test,\n",
+    "        max_size=20000,\n",
+    "        min_freq=50,\n",
+    "        vectors=vectors\n",
+    "    )\n",
+    "    LOGGER.debug(\"Done preparing the datasets\")\n",
+    "\n",
+    "    return train.examples, test.examples, review"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_dataset(fix_length=100, lower=False, vectors=None, n_folds=3, seed=37):\n",
+    "    train_exs, test_exs, review = read_files(\n",
+    "        fix_length=fix_length, lower=lower, vectors=vectors)\n",
+    "\n",
+    "    kf = KFold(n_splits=n_folds, random_state=seed)\n",
+    "\n",
+    "    fields = [\n",
+    "#         ('id', None),\n",
+    "        ('text', review),\n",
+    "        ('label', data.Field(\n",
+    "            use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)),\n",
+    "   ]\n",
+    "\n",
+    "    def iter_folds():\n",
+    "        train_exs_arr = np.array(train_exs)\n",
+    "        for train_idx, val_idx in kf.split(train_exs_arr):\n",
+    "            yield (\n",
+    "                data.Dataset(train_exs_arr[train_idx], fields),\n",
+    "                data.Dataset(train_exs_arr[val_idx], fields),\n",
+    "            )\n",
+    "\n",
+    "    test = data.Dataset(test_exs, fields[:2])\n",
+    "    return iter_folds(), test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):\n",
+    "    dataset_iter = data.Iterator(\n",
+    "        dataset, batch_size=batch_size, device=0,\n",
+    "        train=train, shuffle=shuffle, repeat=repeat,\n",
+    "        sort=False\n",
+    "    )\n",
+    "    return dataset_iter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'self' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-11-d2ae5e6566b8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m for examples in get_iterator(\n\u001b[1;32m----> 2\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m             \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrepeat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m         ):\n\u001b[0;32m      5\u001b[0m     \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mexamples\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcomment_text\u001b[0m \u001b[1;31m# (fix_length, batch_size) Tensor\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'self' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "for examples in get_iterator(\n",
+    "            self.train_dataset, batch_size, train=True,\n",
+    "            shuffle=True, repeat=False\n",
+    "        ):\n",
+    "    x = examples.comment_text # (fix_length, batch_size) Tensor\n",
+    "    y = torch.stack([\n",
+    "        examples.toxic, examples.severe_toxic, \n",
+    "        examples.obscene,\n",
+    "        examples.threat, examples.insult, \n",
+    "        examples.identity_hate\n",
+    "    ], dim=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "fastAI",
+   "language": "python",
+   "name": "fastai"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}