Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
97cf918
commit af9e7c9
Showing
8 changed files
with
3,200 additions
and
1,123 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,246 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Downloading all Wikipedia Articles \n", | ||
"\n", | ||
"This notebook implements the downloading of all Wikipedia articles. I kept the actual download out of the main notebook because of the lengthy output. " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Find Files to Download" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"There are 27 files to download.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"from timeit import default_timer as timer\n", | ||
"import os\n", | ||
"\n", | ||
"base_url = 'https://dumps.wikimedia.org/enwiki/'\n", | ||
"index = requests.get(base_url).text\n", | ||
"soup_index = BeautifulSoup(index, 'html.parser')\n", | ||
"\n", | ||
"# Find the links that are dates of dumps\n", | ||
"dumps = [a['href'] for a in soup_index.find_all('a') if \n", | ||
" a.text == '20181001/']\n", | ||
"\n", | ||
"dumps_url = base_url + dumps[0]\n", | ||
"\n", | ||
"# Retrieve the html\n", | ||
"dump_html = requests.get(dumps_url).text\n", | ||
"\n", | ||
"# Convert to a soup\n", | ||
"soup_dump = BeautifulSoup(dump_html, 'html.parser')\n", | ||
"\n", | ||
"files = []\n", | ||
"for file in soup_dump.find_all('li', {'class': 'file'}):\n", | ||
" text = file.text\n", | ||
" if 'abstract' in text:\n", | ||
" files.append((text.split()[0], text.split()[1:]))\n", | ||
" \n", | ||
"files_to_download = [file[0] for file in files if file[0] != 'enwiki-20181001-abstract.xml.gz']\n", | ||
"print(f'There are {len(files_to_download)} files to download.')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['enwiki-20181001-abstract1.xml.gz',\n", | ||
" 'enwiki-20181001-abstract2.xml.gz',\n", | ||
" 'enwiki-20181001-abstract3.xml.gz',\n", | ||
" 'enwiki-20181001-abstract4.xml.gz',\n", | ||
" 'enwiki-20181001-abstract5.xml.gz',\n", | ||
" 'enwiki-20181001-abstract6.xml.gz',\n", | ||
" 'enwiki-20181001-abstract7.xml.gz',\n", | ||
" 'enwiki-20181001-abstract8.xml.gz',\n", | ||
" 'enwiki-20181001-abstract9.xml.gz',\n", | ||
" 'enwiki-20181001-abstract10.xml.gz',\n", | ||
" 'enwiki-20181001-abstract11.xml.gz',\n", | ||
" 'enwiki-20181001-abstract12.xml.gz',\n", | ||
" 'enwiki-20181001-abstract13.xml.gz',\n", | ||
" 'enwiki-20181001-abstract14.xml.gz',\n", | ||
" 'enwiki-20181001-abstract15.xml.gz',\n", | ||
" 'enwiki-20181001-abstract16.xml.gz',\n", | ||
" 'enwiki-20181001-abstract17.xml.gz',\n", | ||
" 'enwiki-20181001-abstract18.xml.gz',\n", | ||
" 'enwiki-20181001-abstract19.xml.gz',\n", | ||
" 'enwiki-20181001-abstract20.xml.gz',\n", | ||
" 'enwiki-20181001-abstract21.xml.gz',\n", | ||
" 'enwiki-20181001-abstract22.xml.gz',\n", | ||
" 'enwiki-20181001-abstract23.xml.gz',\n", | ||
" 'enwiki-20181001-abstract24.xml.gz',\n", | ||
" 'enwiki-20181001-abstract25.xml.gz',\n", | ||
" 'enwiki-20181001-abstract26.xml.gz',\n", | ||
" 'enwiki-20181001-abstract27.xml.gz']" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"files_to_download" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Download Files Using Keras\n", | ||
"\n", | ||
"Files will be saved in `/.keras/datasets`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Using TensorFlow backend.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract1.xml.gz\n", | ||
"102514688/102514678 [==============================] - 49s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract2.xml.gz\n", | ||
"51953664/51952869 [==============================] - 25s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract3.xml.gz\n", | ||
"37773312/37770207 [==============================] - 18s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract4.xml.gz\n", | ||
"29802496/29798632 [==============================] - 14s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract5.xml.gz\n", | ||
"24551424/24544524 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract6.xml.gz\n", | ||
"27664384/27657748 [==============================] - 13s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract7.xml.gz\n", | ||
"23977984/23972617 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract8.xml.gz\n", | ||
"21225472/21219401 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract9.xml.gz\n", | ||
"22937600/22935433 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract10.xml.gz\n", | ||
"22396928/22396363 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract11.xml.gz\n", | ||
"24543232/24536724 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract12.xml.gz\n", | ||
"24576000/24574622 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract13.xml.gz\n", | ||
"22495232/22491546 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract14.xml.gz\n", | ||
"19849216/19848294 [==============================] - 9s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract15.xml.gz\n", | ||
"23339008/23333169 [==============================] - 11s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract16.xml.gz\n", | ||
"21733376/21726546 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract17.xml.gz\n", | ||
"21233664/21226386 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract18.xml.gz\n", | ||
"20799488/20796096 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract19.xml.gz\n", | ||
"22077440/22072387 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract20.xml.gz\n", | ||
"22421504/22413378 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract21.xml.gz\n", | ||
"20611072/20605711 [==============================] - 10s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract22.xml.gz\n", | ||
"17702912/17697401 [==============================] - 8s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract23.xml.gz\n", | ||
"18251776/18245686 [==============================] - 8s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract24.xml.gz\n", | ||
"19136512/19136386 [==============================] - 9s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract25.xml.gz\n", | ||
"19701760/19701290 [==============================] - 9s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract26.xml.gz\n", | ||
"17891328/17890611 [==============================] - 8s 0us/step\n", | ||
"Downloading data from https://dumps.wikimedia.org/enwiki/20181001/enwiki-20181001-abstract27.xml.gz\n", | ||
"18423808/18416852 [==============================] - 8s 0us/step\n", | ||
"336 total seconds elapsed.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from keras.utils import get_file\n", | ||
"\n", | ||
"data_paths = []\n", | ||
"\n", | ||
"start = timer()\n", | ||
"for file in files_to_download:\n", | ||
" data_paths.append(get_file(file, dumps_url + file))\n", | ||
" \n", | ||
"end = timer()\n", | ||
"print(f'{round(end - start)} total seconds elapsed.')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The total download time was just over 2 hours. That's not bad for all of Wikipedia (at leas the English articles).\n", | ||
"\n", | ||
"This process could also be done in parallel using multithreading or multiprocessing. However, I have run into issues running parallel jobs donwloading files because the code was making too many requests to the server." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
" " | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [conda env:tensorflow_p36]", | ||
"language": "python", | ||
"name": "conda-env-tensorflow_p36-py" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.