# Microtask 0

## Aim:
- Get a basic understanding of perceval and the github api data it fetches
- Get comfortable analyzing said data: 
    - Total number of commits
    - Number of commits in master 
    - Number of issues and pull-requests
    - Number of open issues and closed issues

# Collecting the data
data is collected from the progit project on github. Specifically, the repositories progit2-ru and progit2-zh were used.


### Getting the data

In [2]:
github_url = "https://github.com/"
owner = "progit"
repos_used = ["progit2-ru", "progit2-zh"]
repo_urls = [github_url + owner + "/" + repo_used for repo_used in repos_used]
auth_token = "f4d98f2cef0a0b4873b723fe3ee4574765e8b37f" # Please enter your github token here

['https://github.com/progit/progit2-ru', 'https://github.com/progit/progit2-zh']


In [3]:
for repo, repo_url in zip(repos_used, repo_urls):
    print(repo, repo_url)
    if repo == repos_used[0]:
        !perceval git --json-line $repo_url > progit.json

    else:
        !perceval git --json-line $repo_url >> progit.json

    !perceval github -t $auth_token --json-line --sleep-for-rate --category pull_request $owner $repo >> progit.json

    !perceval github -t $auth_token --json-line --sleep-for-rate --category issue $owner $repo >> progit.json

progit2-ru https://github.com/progit/progit2-ru
/bin/sh: 1: perceval: not found
/bin/sh: 1: perceval: not found
/bin/sh: 1: perceval: not found
progit2-zh https://github.com/progit/progit2-zh
/bin/sh: 1: perceval: not found
/bin/sh: 1: perceval: not found
/bin/sh: 1: perceval: not found


In [1]:
import json
import datetime

In [4]:
class Code_Changes:
               
    
    def __init__(self, path_to_file):
        
        self.clean_data = defaultdict(list)
        
        with open(path_to_file, 'r') as raw_data:
            for line in raw_data:
                line = json.loads(line)

                clean_line = dict()
                if line['category'] == "commit":
                    clean_line = self._clean_commit(line)
                    
                elif line['category'] == "issue":
                    clean_line = self._clean_issue(line)

                elif line['category'] == "pull_request":
                    clean_line = self._clean_pr(line)

                self.clean_data[line['category']].append(clean_line)        
    
    
    def number_of_repos(self):
        return len(repos_used)
    
    def total_commits(self):
        return len(self.clean_data['commit'])
    
    def total_commits_per_repo(self):
        commits_per_repo = {el:0 for el in repo_urls}
        
        for commit in self.clean_data['commit']:
            
            commits_per_repo[commit['repo']] += 1
    
        return commits_per_repo
    
    def count_from_to(self, start=None, end=None, type_of_date="author_date", empty=True, merge=True):
        # commit_list has elements of a specific category
        category = "commit"
        commit_list = self.clean_data[category]
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d") if start is not None else datetime.datetime.min
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d") if end is not None else datetime.datetime.max
        
        required_commit_set = set()
        for elem in commit_list:
            if start_date <= self._clean_date(elem[type_of_date]) <= end_date:
                if (empty) or (not empty and elem['files_action'] != 0):
                    if (merge) or (not merge and elem['merge'] == False):

                        required_commit_set.add(elem['hash'])
        return len(required_commit_set)
                    
    
    # private methods to clean data ---------------------------
    
    @staticmethod
    def _clean_date(date_long_format):
        datetimeobj = datetime.datetime.strptime(date_long_format, "%a %b %d %H:%M:%S %Y %z")
        datetimeobj = datetimeobj.replace(tzinfo=None)
    
        return datetimeobj
    
    @staticmethod                
    def _clean_commit(line):
        repo_name = line['origin']
        line_data = line['data']
        cleaned_line = {
            'repo': repo_name,
            'hash': line_data['commit'],
            'category': "commit",
            'commit': line_data['Commit'],
            'author': line_data['Author'],
            'author_date': line_data['AuthorDate'],
            'commit_date': line_data['CommitDate'],
            'files_no': len(line_data['files'])
        }
        
        actions = 0
        
        cleaned_line['files_action'] = actions
        cleaned_line['merge'] = 'Merge' in line_data
        
        for file in line_data['files']:
            if 'action' in file:
                actions += 1
                cleaned_line['files_action'] = actions
                cleaned_line['merge'] = 'Merge' in line_data
        return cleaned_line
    
    @staticmethod
    def _clean_issue(line):
        repo_name = line['origin']
        line_data = line['data']
        cleaned_line ={
            'repo': repo_name,
            'hash': line_data['id'],
            'category': "issue",
            'author': line_data['user']['login'],
            'created_date': line_data['created_at'],
            'current_status': line_data['state']   
        }
        
        return cleaned_line
    
    @staticmethod
    def _clean_pr(line):
        repo_name = line['origin']
        line_data = line['data']
        cleaned_line ={
            'repo': repo_name,
            'hash': line_data['id'],
            'category': "pull_request",
            'author': line_data['user']['login'],
            'created_date': line_data['created_at'],
            'current_status': line_data['state']   
        }
        
        return cleaned_line
    

# Analyzing the data using the Class

In [7]:
commits_data = Code_Changes('./progit.json')

## Total number of commits 

In [8]:
print("The total number of commits in all repos is: ", commits_data.total_commits())
print("The number of commits repo-wise is ", commits_data.total_commits_per_repo())

The total number of commits in all repos is:  3024
The number of commits repo-wise is  {'https://github.com/progit/progit2-ru': 1292, 'https://github.com/progit/progit2-zh': 1732}


## Total number of commits between dates

In [9]:
print("Code changes count all period:", commits_data.count_from_to())
print("Code changes count from 2018-01-01 to 2018-07-01:",
      commits_data.count_from_to(start="2018-01-01", end="2019-07-01"))
print("Code changes count from 2018-01-01 to 2019-07-01 (no merge commits):",
      commits_data.count_from_to(start="2018-01-01", end="2018-07-01", merge=False))


Code changes count all period: 2402
Code changes count from 2018-01-01 to 2018-07-01: 77


KeyError: 'merge'

# Analyzing the json file directly

In [None]:
github_data = defaultdict(list)

count = 0
with open('trial.json', 'r') as github_data_file:
    for line in github_data_file:
        data_line = json.loads(line)
        count += 1

        category = data_line['category']
        data = data_line['data']
        github_data[category].append(data)
            
print(len(github_data['commit']))

## Total number of commits in the master branch


In [None]:
master_commits = set()

for elem in github_data['commit']:
    if 'HEAD -> refs/heads/master' in elem['refs']:
        master_commits.add(elem['commit'])       
        for parent in elem['parents']:
            
            if parent not in master_commits:
                master_commits.add(parent)
                
print(len(master_commits))
        

## Total number of non empty commits

In [None]:
num_empty_commits = 0

for commit in github_data['commits']:
    for file in commit['files']:
        if 'action' in file:
            num_empty_commits += 1
            break
            
print(num_empty_commits)
            

## Total number of non - merge commits

In [None]:
count = 0

for commit in github_data['commit']:
    if 'Merge' not in commit:
        count += 1
        
print("Number of non-merge commits is: %d" %count)

# Pull Requests and Issues

## Total number of pull requests and issues

In [None]:
total_issues = len(github_data["issue"])

print("The number of issues is {0}".format(total_issues))
print("The number of pull requests is {0}".format(len(github_data["pull_request"])))

## Total number of open and closed issues

In [None]:
num_open_issues = 0
for issue in github_data["issue"]:
    if issue['state'] == "open":
        num_open_issues += 1
        
print("The number of open issues is ", num_open_issues)
print("The number of closed issues is ", total_issues - num_open_issues)