# Fetching Pypi data
This notebook fetches every package from the pypi server (~74000 packages!), extracts the setup.py and any file or folder with the word 'requirements' in it. 

In [8]:
import xmlrpclib
import os
# only one api server so we'll use the deutschland mirror for downloading
client = xmlrpclib.ServerProxy('https://pypi.python.org/pypi')
packages = client.list_packages()



In [5]:
import tarfile, re, requests, csv, json
from base64 import b64encode
# from kglib.utils.HelperFunctions import ensure_dir
                
def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)
    
def _save_file(pathname, member, tar_file):
    try:
        content = tar_file.extractfile(member).read()
    except:
        return
    
    outfilename = '{}{}'.format(pathname, os.path.basename(member.name))
    ensure_dir(outfilename)
    with open(outfilename, 'w') as outfile:
        outfile.write(content)
    return
                

def _extract_files(package_file, name):
    try:
        tar_file = tarfile.open(fileobj=package_file)
    except:
        return
    for member in tar_file.getmembers():
        if 'setup.py' in member.name or 'requirements' in member.name:
            _save_file(name, member, tar_file)
        #    content = tar_file.extractfile(member).read()
        #    with open('{}{}'.format(name, os.path.basename(member.name)), 'w') as outfile:
        #        outfile.write(content)
        #elif 'requirements' in member.name:
        #    content = tar_file.extractfile(member).read()
        #    with open('{}{}'.format(name, os.path.basename(member.name)), 'w') as outfile:
        #        outfile.write(content)
                
                
def extract_package_latest(name, client=xmlrpclib.ServerProxy('http://pypi.python.org/pypi')):
    for release in client.package_releases(name):
        outdir = 'packages/{}-{}/'.format(name, release)
        doc = client.release_urls(name, release)
        print doc
        if doc:
            url = None
            for d in doc:
                print d 
                if d['python_version'] == 'source' and d['url'].endswith('gz'):
                    url = d['url']
            if url:
                req = requests.get(url)
                if req.status_code != 200:
                    print "Could not download file %s" % req.status_code
                else:
                    #print(outdir)
                    ensure_dir('{}'.format(outdir))
                    with open('/tmp/temp_tar', 'w') as tar_file:
                        tar_file.write(req.content)
                    with open('/tmp/temp_tar', 'r') as tar_file:
                        return _extract_files(tar_file, name=outdir)

In [7]:
for i, package in enumerate(packages):
    if i % 100 == 0:
        print('Extracting package {} / {}'.format(i+1, len(packages)))
    #print(package)
    
    extract_package_latest(package, client)

Extracting package 1 / 94191
[{'has_sig': False, 'upload_time': <DateTime '20130711T00:07:26' at 1069b0ab8>, 'comment_text': '', 'python_version': 'source', 'url': 'https://pypi.python.org/packages/5e/d3/1cec3136c2f208a7529b82b53372da3bf82eb931be5434622c541d08de9d/d-0.2.2.tar.gz', 'md5_digest': 'a174b212b921cf03ca033f823701145b', 'downloads': 3971, 'filename': 'd-0.2.2.tar.gz', 'packagetype': 'sdist', 'path': '5e/d3/1cec3136c2f208a7529b82b53372da3bf82eb931be5434622c541d08de9d/d-0.2.2.tar.gz', 'size': 27780}]
{'has_sig': False, 'upload_time': <DateTime '20130711T00:07:26' at 1069b0ab8>, 'comment_text': '', 'python_version': 'source', 'url': 'https://pypi.python.org/packages/5e/d3/1cec3136c2f208a7529b82b53372da3bf82eb931be5434622c541d08de9d/d-0.2.2.tar.gz', 'md5_digest': 'a174b212b921cf03ca033f823701145b', 'downloads': 3971, 'filename': 'd-0.2.2.tar.gz', 'packagetype': 'sdist', 'path': '5e/d3/1cec3136c2f208a7529b82b53372da3bf82eb931be5434622c541d08de9d/d-0.2.2.tar.gz', 'size': 27780}
[]


We now have the setup.py and requirements files for every pypi package. I use my own fork of [this repository](https://github.com/landscapeio/requirements-detector) to find the requirements for every package with the following script:

```bash
for p in packages/*
do
  echo $p
  detect-requirements $p
  echo ''
done
```

I will parse the output of this using a different notebook.