In [1]:
# github api helper function

import requests
import time
import itertools

from requests.auth import HTTPBasicAuth


def make_api_call(url):
    auth = ('ThePrecious', 'XXX')
    headers = {"Accept": "application/vnd.github.v3.star+json"}
    response = requests.get(url, auth=auth, headers=headers)
    return response

def get_dict_subset(d):
    return dict(itertools.islice(d.iteritems(), 0, 10))

In [2]:
## get fork details for a sample repo

url = "https://api.github.com/repos/twbs/bootstrap/forks"
response = make_api_call(url)
total_pages = int(response.headers['Link'].split(',')[1].split('page=')[1].split('>;')[0])
base_url = response.headers['Link'].split(',')[1].split('?')[0][2:]

print total_pages
print base_url

1673
https://api.github.com/repositories/2126244/forks


## Get the required fork and user details page by page and store them in a dictionary

In [3]:

# Function to get the location of the user
def get_user_location(login):
    url = "https://api.github.com/users/%s" % login
    response = make_api_call(url)
    return response.json()['location']
    
fork_details = {}
total_pages = 25 # Temp; comment this line for all pages

# Get the login, forked date and location of all those who have forked the repo
for i in range(1, total_pages):
    url = base_url + "?page=%s" %i
    response = make_api_call(url);
    result = response.json()

    for obj in result:
        f_date = obj['created_at']
        owner_id = obj['owner']['id']
        login = obj['owner']['login']
        location = get_user_location(login)
        if location is None:
            location = "NOLOCATION"
        fork_details[owner_id] = [login, f_date, location] 
        
    

    ## get fields related to rate limit from response header
    rate_limit_remain = int(response.headers['X-RateLimit-Remaining'])
    rate_limit_reset = int(response.headers['X-RateLimit-Reset'])

    ## if <=100 sleep for 100 seconds
    if rate_limit_remain <= 100:
        diff = rate_limit_reset - int(time.time())
        if diff < 0: 
            diff = 100
        time.sleep(diff)
    
print len(fork_details)
print get_dict_subset(fork_details)

720
{29728800: [u'songafeng', u'2017-06-28T07:22:07Z', 'NOLOCATION'], 5212162: [u'bulinan', u'2017-06-28T05:37:45Z', u'Beijing'], 29624324: [u'HongDanni', u'2017-06-23T12:34:14Z', 'NOLOCATION'], 23920648: [u'coolzpl', u'2017-06-16T11:41:59Z', 'NOLOCATION'], 13555746: [u'zhangao15', u'2017-07-02T09:52:55Z', 'NOLOCATION'], 12838927: [u'xrj3000', u'2017-07-02T01:19:59Z', 'NOLOCATION'], 20639760: [u'bessii', u'2017-06-26T19:11:18Z', u'Buea | Cameroon'], 25606164: [u'Zlth', u'2017-06-28T01:50:52Z', 'NOLOCATION'], 1794074: [u'ciela', u'2017-06-19T11:21:57Z', u'Tokyo'], 11409435: [u'yangyangxie', u'2017-06-28T05:49:02Z', 'NOLOCATION']}


### Parse and bin the count according to forked date

In [4]:
fork_counts = {}

for k,v in fork_details.iteritems():
    d = v[1]
    d = d.split('T')[0]
    if fork_counts.has_key(d):
        fork_counts[d] += 1
    else:
        fork_counts[d] = 1
        
print get_dict_subset(fork_counts)
#fork_counts

{u'2017-06-28': 50, u'2017-06-29': 42, u'2017-06-22': 53, u'2017-06-23': 29, u'2017-06-20': 32, u'2017-06-21': 32, u'2017-07-04': 47, u'2017-07-03': 32, u'2017-07-02': 23, u'2017-07-01': 24}


### Parse and bin the count according to user's city (location from which it was forked)

In [5]:
fork_city_counts = {}
nolocation = 0
fork_state_counts = {}
fork_country_counts = {}

for k, v in fork_details.iteritems():
    city = v[2]
    city = city.split(',')[0]
    """ 
    state = v[2]
    state = state.split(',')[1]
    country = v[2]
    country = country.split(',')[2]
    """

    if fork_city_counts.has_key(city):
        fork_city_counts[city] += 1
    else:
        fork_city_counts[city] = 1
    
    """  
    if fork_state_counts.has_key(state):
        fork_state_counts[state] += 1
    else:
        fork_state_counts[state] = 1
    
    if fork_country_counts.has_key(country):
        fork_country_counts[country] += 1
    else:
        fork_country_counts[country] = 1 
    """
        
print get_dict_subset(fork_city_counts)

{u'New Delhi': 1, u'Chennai': 1, u'San Francisco Bay area': 1, u'Sacramento': 1, u'\u5317\u4eac': 2, u'Amman ': 1, u'Guatemala': 1, u'Kent': 1, u'Argentina': 1, u'Shenzhen': 2}



### Get the latitude, longitude of the cities using geopy

In [10]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()

fork_city_counts.pop('NOLOCATION')

locations = []
for c in fork_city_counts.keys():
    location = geolocator.geocode(c)
    if location:
        locations.append((location.latitude, location.longitude))
    
print locations[0:10]

[(37.7884969, -122.3558473), (31.9515694, 35.9239625), (51.2474823, 0.7105077), (-34.9964963, -64.9672817), (28.6138967, 77.2159562), (22.5442673, 114.0545327), (13.0801721, 80.2838331), (38.5815719, -121.4943996), (39.9059631, 116.391248), (15.6356088, -89.8988087)]


### Visualize the fork locations on a map (using folium)

<img src="forks_map.png">

In [11]:
from IPython.display import display, HTML
import folium

# Create an empty map with focus on SF
SF_COORDINATES = (37.76, -122.45)
m = folium.Map(location=SF_COORDINATES, zoom_start=5)

marker_cluster = folium.MarkerCluster().add_to(m)

for loc in locations:
    folium.Marker(
        location=loc,
        icon=folium.Icon(color='green', icon='ok-sign'),
    ).add_to(marker_cluster)


m.save('forks_map.html')

In [12]:
HTML('<iframe width="800" height="350" frameborder="0" scrolling="no" marginheight="10" marginwidth="10" src="forks_map.html"></iframe>')
