#!/bin/bash
# downloads files from fileplanet by a span of numeric IDs
# USAGE:
# $ bash download_pages_and_files_from_fileplanet.sh 1 123
# would try to download all files and their download pages
# with the IDs 1 to 123
# Files will be downloaded to ./www.fileplanet.com/NUMERICID/download/
# Random thoughts:
# we can go with http://www.fileplanet.com/NUMERICID/download/
# For the numeric ID we can use either the 012345 or the 12345 format.
# We will be using the one without the leading zeros, since that is how Fileplanet links internally.
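# Optional sanity check (an addition, not part of the original script): everything
# below builds directory names, log names and URLs from $1 and $2, so bail out
# early if the two numeric ID arguments are missing or not numbers.
if [ $# -ne 2 ] || ! [[ "$1" =~ ^[0-9]+$ && "$2" =~ ^[0-9]+$ ]]; then
    echo "USAGE: bash $0 FIRSTID LASTID (both numeric)" >&2
    exit 1
fi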
| echo "You will be downloading $1 to $2, you rock!" | |
| echo "Let's go!" | |
| mkdir logs | |
| mkdir "$1-$2" | |
| cd $1-$2/ | |
for i in $(seq $1 $2)
do
    echo "Trying to download $i"
    downloadpageurl="www.fileplanet.com/${i}/download/"
    # fileplanet returns a "302 Found" for non-existing IDs,
    # redirecting to "Location: /error/error.shtml?aspxerrorpath=/autodownload.aspx"
    # we don't want those files, so "--max-redirect=0"
    wget -nv -a "pages_$1_$2.log" --force-directories --max-redirect=0 "http://${downloadpageurl}"
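    # Purely as an illustration (not in the original script): given the 302-to-error
    # behaviour described above, a single ID can be probed by hand without saving
    # anything, e.g.:
    #   wget --spider --max-redirect=0 http://www.fileplanet.com/12345/download/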
    # extract the session download link to the actual file we want
    # the URL is enclosed by single quotes. The second grep will get everything
    # from http until the last '. The rev/cut will remove the trailing '.
    linktowget=$(grep default-file-download-link "${downloadpageurl}index.html" 2>/dev/null | grep -Eo "http.*'" | rev | cut -c 2- | rev)
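    # The same extraction could also be done in a single sed call (an equivalent
    # sketch, assuming the URL really is the last single-quoted http link on that
    # line); kept commented out so the original pipeline above stays authoritative:
    # linktowget=$(grep default-file-download-link "${downloadpageurl}index.html" 2>/dev/null | sed -n "s/.*'\(http[^']*\)'.*/\1/p")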
    if [ -z "${linktowget}" ]; then
        echo "No download link found."
    else
        echo "Download link found, downloading ${linktowget}"
        # download the file to the same directory as its download page HTML
        wget -nv -a "files_$1_$2.log" --directory-prefix="${downloadpageurl}" --referer="${downloadpageurl}" "${linktowget}" || echo "ERROR! If you see more than one, please Ctrl-C, check the files log and tell Schbirid!"
    fi
| echo "-----" | |
| done | |
| echo "Downloading finished! Yay!" | |
| echo -n "Counting files: " | |
| numberoffiles=$(ls -1 www.fileplanet.com/ | wc -l) | |
| echo $numberoffiles | |
| echo -n "Getting the size: " | |
| sizeofchunk=$(du -hs www.fileplanet.com/ | sed 's/\twww.*//') | |
| echo $sizeofchunk | |
cd ..
# just a handy local backup
cp "$1-$2"/*.log logs/
echo "TARring!"
tar -cf "$1-$2.tar" "$1-$2"/ && echo "TARring was a success. Now removing the directory." && rm -r "$1-$2"/
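# Added comment: this grep prints any ERROR lines collected in the two logs for
# this chunk; the success message is only printed if grep finds nothing.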
| grep "ERROR" logs/*$1_$2.log || echo "Done. YAAAY!" | |
| echo "Here is copy'n'pastable meta for the wiki:" | |
| echo "|-" | |
| echo "| $1-$2" | |
| echo "| Done, locally" | |
| echo "| $numberoffiles" | |
| echo "| $sizeofchunk" | |
| echo "| insert nick name here" | |
| echo "|-" | |
| echo "And here is the s3cmd line:" | |
| echo "s3cmd --add-header x-archive-auto-make-bucket:1 --add-header \"x-archive-meta-description:Files from Fileplanet (www.fileplanet.com), all files from the ID range $1 to $2.\" put logs/*_$1_$2.log $1-$2.tar s3://FileplanetFiles_$1-$2/" |