Working recurrent neural network
WillKoehrsen committed Oct 14, 2018
1 parent 97cf918 commit af9e7c9
Showing 8 changed files with 3,200 additions and 1,123 deletions.
1,461 changes: 554 additions & 907 deletions data/found_tech_patents.ndjson


2 changes: 1 addition & 1 deletion notebooks/Book Recommendation System.ipynb
@@ -3394,7 +3394,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
246 changes: 246 additions & 0 deletions notebooks/Download Abstracts.ipynb
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Downloading all Wikipedia Articles \n",
"\n",
"This notebook implements the downloading of all Wikipedia articles. I kept the actual download out of the main notebook because of the lengthy output. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find Files to Download"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 27 files to download.\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from timeit import default_timer as timer\n",
"import os\n",
"\n",
"base_url = 'https://dumps.wikimedia.org/enwiki/'\n",
"index = requests.get(base_url).text\n",
"soup_index = BeautifulSoup(index, 'html.parser')\n",
"\n",
"# Find the links that are dates of dumps\n",
"dumps = [a['href'] for a in soup_index.find_all('a') if \n",
" a.text == '20181001/']\n",
"\n",
"dumps_url = base_url + dumps[0]\n",
"\n",
"# Retrieve the html\n",
"dump_html = requests.get(dumps_url).text\n",
"\n",
"# Convert to a soup\n",
"soup_dump = BeautifulSoup(dump_html, 'html.parser')\n",
"\n",
"files = []\n",
"for file in soup_dump.find_all('li', {'class': 'file'}):\n",
" text = file.text\n",
" if 'abstract' in text:\n",
" files.append((text.split()[0], text.split()[1:]))\n",
" \n",
"files_to_download = [file[0] for file in files if file[0] != 'enwiki-20181001-abstract.xml.gz']\n",
"print(f'There are {len(files_to_download)} files to download.')"
]
},
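{
"cell_type": "markdown",
"metadata": {},
"source": [
"The dump date (`20181001/`) is hardcoded in the cell above. As an untested sketch (my own assumption, not the approach used in this project), the most recent dated dump could instead be selected programmatically from the same index page:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: date-named dump directories look like 'YYYYMMDD/', so a\n",
"# lexicographic sort also sorts them chronologically. Note the newest\n",
"# dump may still be in progress and missing some files.\n",
"dates = [a.get('href', '') for a in soup_index.find_all('a')]\n",
"dates = [d for d in dates if d.endswith('/') and d[:-1].isdigit()]\n",
"latest_dump = sorted(dates)[-1]\n",
"print(f'Most recent dump: {latest_dump}')"
]
},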
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['enwiki-20181001-abstract1.xml.gz',\n",
" 'enwiki-20181001-abstract2.xml.gz',\n",
" 'enwiki-20181001-abstract3.xml.gz',\n",
" 'enwiki-20181001-abstract4.xml.gz',\n",
" 'enwiki-20181001-abstract5.xml.gz',\n",
" 'enwiki-20181001-abstract6.xml.gz',\n",
" 'enwiki-20181001-abstract7.xml.gz',\n",
" 'enwiki-20181001-abstract8.xml.gz',\n",
" 'enwiki-20181001-abstract9.xml.gz',\n",
" 'enwiki-20181001-abstract10.xml.gz',\n",
" 'enwiki-20181001-abstract11.xml.gz',\n",
" 'enwiki-20181001-abstract12.xml.gz',\n",
" 'enwiki-20181001-abstract13.xml.gz',\n",
" 'enwiki-20181001-abstract14.xml.gz',\n",
" 'enwiki-20181001-abstract15.xml.gz',\n",
" 'enwiki-20181001-abstract16.xml.gz',\n",
" 'enwiki-20181001-abstract17.xml.gz',\n",
" 'enwiki-20181001-abstract18.xml.gz',\n",
" 'enwiki-20181001-abstract19.xml.gz',\n",
" 'enwiki-20181001-abstract20.xml.gz',\n",
" 'enwiki-20181001-abstract21.xml.gz',\n",
" 'enwiki-20181001-abstract22.xml.gz',\n",
" 'enwiki-20181001-abstract23.xml.gz',\n",
" 'enwiki-20181001-abstract24.xml.gz',\n",
" 'enwiki-20181001-abstract25.xml.gz',\n",
" 'enwiki-20181001-abstract26.xml.gz',\n",
" 'enwiki-20181001-abstract27.xml.gz']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files_to_download"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Files Using Keras\n",
"\n",
"Files will be saved in `/.keras/datasets`."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract1.xml.gz\n",
"102514688/102514678 [==============================] - 49s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract2.xml.gz\n",
"51953664/51952869 [==============================] - 25s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract3.xml.gz\n",
"37773312/37770207 [==============================] - 18s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract4.xml.gz\n",
"29802496/29798632 [==============================] - 14s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract5.xml.gz\n",
"24551424/24544524 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract6.xml.gz\n",
"27664384/27657748 [==============================] - 13s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract7.xml.gz\n",
"23977984/23972617 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract8.xml.gz\n",
"21225472/21219401 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract9.xml.gz\n",
"22937600/22935433 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract10.xml.gz\n",
"22396928/22396363 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract11.xml.gz\n",
"24543232/24536724 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract12.xml.gz\n",
"24576000/24574622 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract13.xml.gz\n",
"22495232/22491546 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract14.xml.gz\n",
"19849216/19848294 [==============================] - 9s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract15.xml.gz\n",
"23339008/23333169 [==============================] - 11s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract16.xml.gz\n",
"21733376/21726546 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract17.xml.gz\n",
"21233664/21226386 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract18.xml.gz\n",
"20799488/20796096 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract19.xml.gz\n",
"22077440/22072387 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract20.xml.gz\n",
"22421504/22413378 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract21.xml.gz\n",
"20611072/20605711 [==============================] - 10s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract22.xml.gz\n",
"17702912/17697401 [==============================] - 8s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract23.xml.gz\n",
"18251776/18245686 [==============================] - 8s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract24.xml.gz\n",
"19136512/19136386 [==============================] - 9s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract25.xml.gz\n",
"19701760/19701290 [==============================] - 9s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract26.xml.gz\n",
"17891328/17890611 [==============================] - 8s 0us/step\n",
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract27.xml.gz\n",
"18423808/18416852 [==============================] - 8s 0us/step\n",
"336 total seconds elapsed.\n"
]
}
],
"source": [
"from keras.utils import get_file\n",
"\n",
"data_paths = []\n",
"\n",
"start = timer()\n",
"for file in files_to_download:\n",
" data_paths.append(get_file(file, dumps_url + file))\n",
" \n",
"end = timer()\n",
"print(f'{round(end - start)} total seconds elapsed.')"
]
},
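{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (an added snippet, not part of the original run), we can confirm the files landed in the Keras cache directory:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify the downloads in the Keras cache (~/.keras/datasets by default)\n",
"keras_home = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')\n",
"abstract_files = [f for f in os.listdir(keras_home) if 'abstract' in f]\n",
"print(f'{len(abstract_files)} abstract files on disk.')"
]
},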
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The total download time was just over 2 hours. That's not bad for all of Wikipedia (at leas the English articles).\n",
"\n",
"This process could also be done in parallel using multithreading or multiprocessing. However, I have run into issues running parallel jobs donwloading files because the code was making too many requests to the server."
]
},
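{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of what a parallel version might look like. The use of `concurrent.futures.ThreadPoolExecutor` and the `max_workers=4` cap are my assumptions for limiting the request rate, not settings tested against the Wikimedia servers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from concurrent.futures import ThreadPoolExecutor\n",
"\n",
"def download(file):\n",
"    # Each call writes to its own target file in ~/.keras/datasets\n",
"    return get_file(file, dumps_url + file)\n",
"\n",
"# A small worker pool keeps the number of simultaneous requests low,\n",
"# which helps avoid overwhelming the server (max_workers=4 is a guess)\n",
"with ThreadPoolExecutor(max_workers=4) as executor:\n",
"    parallel_paths = list(executor.map(download, files_to_download))"
]
}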
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:tensorflow_p36]",
"language": "python",
"name": "conda-env-tensorflow_p36-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 2 additions & 2 deletions notebooks/Download Only.ipynb
@@ -240,9 +240,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow_p36)",
"display_name": "Python [conda env:tensorflow_p36]",
"language": "python",
"name": "conda_tensorflow_p36"
"name": "conda-env-tensorflow_p36-py"
},
"language_info": {
"codemirror_mode": {
