## Pull request level analysis
One part of the analysis carried out for both of our research questions treats each pull request, rather than the whole repository, as a separate data point. Here, for each pull request, we calculate the time-to-merge metric and list the timezones of the participants in that pull request.

In [14]:
import json
from datetime import datetime
from timezonefinder import TimezoneFinder
import spacy
# spaCy's small English pipeline; get_timezone() uses its named-entity
# recognizer to pick place names (GPE entities) out of free text.
nlp = spacy.load("en_core_web_sm")
# Shared TimezoneFinder instance; query_timezone() uses it to turn a
# latitude/longitude pair into a timezone.
tf = TimezoneFinder()

def query_timezone(location):
    """Resolve a free-text place name to a timezone via Nominatim.

    The location is geocoded with the OpenStreetMap Nominatim search API and
    the coordinates of the first hit are handed to the module-level
    ``TimezoneFinder`` instance ``tf``.

    The lookup is best-effort: network errors, HTTP error statuses and
    malformed responses are reported and turned into ``None`` instead of
    aborting the surrounding data-collection loop.

    Args:
        location: Place name to search for (e.g. a city or country).

    Returns:
        Whatever ``tf.timezone_at`` returns for the first search hit, or
        ``None`` when the request fails, the body is not a non-empty JSON
        list, or the first hit carries no coordinates.
    """
    url = 'https://nominatim.openstreetmap.org/search'
    params = {'q': location, 'format': 'json'}
    headers = {
        'User-Agent': 's.u.kaul@student.rug.nl'
    }

    try:
        # A timeout keeps one unresponsive lookup from hanging the whole run.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        response_data = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        print(f"Timezone lookup failed for {location!r}: {err}")
        return None

    if not isinstance(response_data, list) or not response_data:
        return None

    # .get() makes the "no coordinates" case reachable instead of a KeyError.
    first_hit = response_data[0]
    lat, lon = first_hit.get('lat'), first_hit.get('lon')
    if lat is None or lon is None:
        return None
    return tf.timezone_at(lat=float(lat), lng=float(lon))

def get_timezone(text):
    """Guess a timezone from the place names mentioned in *text*.

    Runs the module-level spaCy pipeline ``nlp`` over the text, collects the
    text of every entity labelled ``GPE`` and looks up the space-joined names
    with ``query_timezone``.

    Returns:
        The result of ``query_timezone``, or ``None`` (without querying) when
        no ``GPE`` entity is found.
    """
    place_names = []
    for entity in nlp(text).ents:
        if entity.label_ == 'GPE':
            place_names.append(entity.text)

    if not place_names:
        return None
    return query_timezone(' '.join(place_names))

def time_difference(d1, d2, unit='hours'):
    """Return the elapsed time from *d1* to *d2* in the requested unit.

    Both timestamps are strings formatted as ``%Y-%m-%dT%H:%M:%SZ``. The
    result is computed as ``d2 - d1``, so it is negative when *d2* precedes
    *d1*.

    Args:
        d1: Start timestamp string.
        d2: End timestamp string.
        unit: ``'hours'`` (default) or ``'minutes'``.

    Returns:
        The difference as a float in the requested unit.

    Raises:
        ValueError: If a timestamp does not match the format, or *unit* is
            neither ``'hours'`` nor ``'minutes'``.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    # Timestamps are parsed before the unit is checked, as a malformed
    # timestamp takes precedence over an invalid unit.
    start = datetime.strptime(d1, timestamp_format)
    end = datetime.strptime(d2, timestamp_format)
    elapsed_seconds = (end - start).total_seconds()

    if unit not in ('hours', 'minutes'):
        raise ValueError("Invalid unit. Please use 'hours' or 'minutes'.")

    seconds_per_unit = 3600 if unit == 'hours' else 60
    return elapsed_seconds / seconds_per_unit
        
# GitHub API token sent in the Authorization header of the GitHub requests
# made later in this notebook; replace the placeholder before running.
_TOKEN = "<your-token>"

We reuse the pull request and collaborator datasets already extracted in the data collection section.

In [15]:
# Pull requests grouped per repository; the analysis below reads the
# "full_name" and "pull_requests" fields of each entry.
with open("data/4_pull_requests.json", "r") as f1:
    pr_data = json.load(f1)
# Per-repository user records; each entry is matched on "name" and its
# "users_location" list supplies the login/location/bio/company fields.
with open("data/4_collaborators_and_users_with_timezone.json") as f2:
    user_location_data = json.load(f2)

Each pull request from each repository is analysed to derive the people who participated in it (and thereby the timezones they participate from), i.e. the author of the pull request, its reviewers, and the people who commented on it. For a participant who is not a collaborator of the repository the pull request belongs to, we do not have explicit location information, so we try to derive their timezone by querying for their location as well. (Note: this logic is replicated from the data collection notebook, and the implementation is discussed in Section 5.)

In [None]:
import requests
import time

def check_rate_limit(headers):
    """Print the current GitHub rate-limit status for the ``core`` resource.

    Requests ``https://api.github.com/rate_limit`` with the given headers and
    prints the limit, used, remaining and reset values reported under
    ``resources.core``. Returns nothing.

    Args:
        headers: Request headers to send with the call.
    """
    core = requests.get(
        'https://api.github.com/rate_limit', headers=headers
    ).json()['resources']['core']

    print("Rate Limit Info:")
    for label, key in (
        ("Limit", 'limit'),
        ("Used", 'used'),
        ("Remaining", 'remaining'),
        ("Reset", 'reset'),
    ):
        print(f"{label}: {core[key]}")

# One record per analysed pull request: repository name, a count of
# participants per timezone, and the time to merge in hours.
rq3_data = []

# Request headers shared by every GitHub API call in this cell.
_GITHUB_HEADERS = {
    'Authorization': f'token {_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}


def _github_get_json(url, max_attempts=3):
    """Fetch *url* from the GitHub API and return the decoded JSON body.

    A 403 response is treated as rate limiting: the current limits are
    printed, the call sleeps five minutes and the request is retried, for at
    most *max_attempts* requests in total. Any other non-200 response is
    reported and not retried.

    Returns:
        The decoded JSON payload, or ``None`` if no request succeeded. Callers
        must handle ``None`` so that data from an earlier request is never
        reused by accident.
    """
    for _ in range(max_attempts):
        response = requests.get(url, headers=_GITHUB_HEADERS)
        if response.status_code == 200:
            return response.json()
        if response.status_code != 403:
            print(f'Error code {response.status_code}: {response.reason} for respository')
            return None
        print("Rate limited. Waiting 5 minutes...")
        check_rate_limit(headers=_GITHUB_HEADERS)
        time.sleep(300)
    return None


def _collaborator_location(entry):
    """Return the first non-None of an entry's location, bio and company."""
    for field in ("location", "bio", "company"):
        value = entry.get(field)
        if value is not None:
            return value
    return None


def _resolve_user_timezone(user):
    """Return the first timezone found in a user's location, bio or company."""
    for field in ("location", "bio", "company"):
        text = user.get(field)
        if text is None:
            continue
        timezone = get_timezone(text)
        if timezone is not None:
            return timezone
    return None


for repo in pr_data:
    # Keep merged pull requests with a known, non-bot author; the rest have no
    # usable time-to-merge or no human author to locate.
    considered_prs = [
        pr for pr in repo["pull_requests"]
        if pr["created_at"] and pr["merged_at"]
        and pr["user"] is not None
        and "[bot]" not in pr["user"]["login"]
    ]

    # The collaborator records depend only on the repository, so they are
    # looked up once here rather than once per pull request.
    repo_user_data = next(
        (u["users_location"] for u in user_location_data
         if u["name"] == repo["full_name"]),
        [],
    )
    # login -> entries for that login (entries without a login are ignored).
    collaborators = {}
    for entry in repo_user_data:
        if "login" in entry:
            collaborators.setdefault(entry["login"], []).append(entry)

    for pr in considered_prs:
        print(f"Starting for repo {repo['full_name']}")

        author = pr["user"]["login"]
        reviewers = [reviewer["login"] for reviewer in pr["requested_reviewers"]]

        # NOTE(review): a single request is made, so only the comments GitHub
        # returns in its first response are seen -- confirm whether
        # pagination matters for this dataset.
        comments = _github_get_json(pr["comments_url"])
        if comments is None:
            # Without the comments the participant list would be wrong, so
            # skip the pull request rather than record misleading data.
            print("Could not fetch comments; skipping pull request")
            continue
        commentators = [
            comment["user"]["login"] for comment in comments
            if comment.get("user") is not None
        ]

        print(f"authors {[author]} reviewers {reviewers} commentators {commentators}")
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        pr_participants = list(dict.fromkeys([author] + reviewers + commentators))

        if len(pr_participants) < 2:
            print("Not enough participants")
            continue
        print(f"Enough participants {len(pr_participants)}")

        pr_timezones = []
        for participant in pr_participants:
            if participant in collaborators:
                # NOTE(review): these values are appended as-is; presumably
                # the *_with_timezone.json file already stores timezones in
                # these fields -- confirm against the data collection step.
                for entry in collaborators[participant]:
                    location = _collaborator_location(entry)
                    if location is not None:
                        pr_timezones.append(location)
                continue

            # Not a known collaborator: fetch the profile and try to derive a
            # timezone from it. A failed fetch contributes no timezone.
            user = _github_get_json(f'https://api.github.com/users/{participant}')
            if user is None:
                continue
            timezone = _resolve_user_timezone(user)
            if timezone is not None:
                pr_timezones.append(timezone)

        # Count how many located participants fall in each timezone.
        pr_timezones_map = {}
        for timezone in pr_timezones:
            pr_timezones_map[timezone] = pr_timezones_map.get(timezone, 0) + 1

        if pr_timezones_map:
            rq3_data.append({
                "repository": repo["full_name"],
                "participants_timezones": pr_timezones_map,
                "time_to_merge": time_difference(pr['created_at'], pr['merged_at'], 'hours')
            })

            # Rewritten after every record so an interrupted run keeps the
            # results gathered so far.
            with open("data/pr_participation.json", "w+") as f:
                json.dump(rq3_data, f)
                
    

From this list of participants and the time-to-merge value for each PR, we weed out the PRs for which we could find the location of only one participant, since collaboration requires at least two participants.

In [None]:
# Load the per-pull-request records from data/pr_participation.json.
with open("data/pr_participation.json", "r") as source_file:
    pr_timezone_merge_data = json.load(source_file)

# Drop records whose timezone map is exactly one timezone counted once, i.e.
# only a single participant could be located; keep everything else.
pr_timezone_merge_data_filtered = [
    record
    for record in pr_timezone_merge_data
    if list(record["participants_timezones"].values()) != [1]
]

with open("data/pr_participation_filtered.json", "w") as target_file:
    json.dump(pr_timezone_merge_data_filtered, target_file)
