# Pandas + Python + GitHub API




In [1]:
#Importing Pandas 
import pandas as pd
#Importing Requests
import requests

In [7]:
class GitHubRepo:
    """Rank the contributors of Google's most-forked public GitHub repos.

    The intended pipeline is: ``get_url_response`` -> ``get_sorted`` ->
    ``get_top_records`` -> ``prepare_table`` -> ``get_contributor_url`` ->
    ``get_commit_chart``.  The code is written to run on both Python 2 and
    Python 3.
    """

    PAGE = 50  # Highest page number requested; assumes no listing is longer
    TOP_REPO = 5  # Number of top repositories to keep
    USER_ID = ''  # GitHub username/email id
    PASS = ''  # GitHub password
    # Google's repository listing, one page per request.
    API = 'https://api.github.com/users/google/repos?page={}'
    URL = 'https://api.github.com/users/google/repos'

    def __init__(self, username, password):
        """Store the credentials used for authenticated API requests."""
        self.USER_ID = username
        self.PASS = password

    def check_valid_input(self):
        """Probe the API with the stored credentials.

        Returns:
            ``True`` when the API answers 200, otherwise a human-readable
            error message (``str``).  Callers must therefore compare the
            result against ``True`` rather than rely on truthiness.
        """
        r = requests.get(self.URL, auth=(self.USER_ID, self.PASS))
        code = r.status_code
        if code == 200:
            return True
        if code == 404:
            return "API URL is incorrect!"
        return "Please provide valid GitHub Credentials"

    def print_data(self):
        """Print the stored username and password (debug helper).

        NOTE(review): this echoes the password in clear text; avoid calling
        it anywhere the output may be logged or shared.
        """
        print("{} {}".format(self.USER_ID, self.PASS))

    def get_url_response(self):
        """Download every page of the repository listing.

        Pages ``1..PAGE`` (inclusive) are requested in order; paging stops
        early at the first empty page.

        Returns:
            A ``pandas.DataFrame`` with one row per repository (empty when
            the first page has no entries).

        Raises:
            requests.HTTPError: if any page request returns an HTTP error
                status (bad credentials, rate limiting, ...), instead of
                trying to parse the error payload as repository data.
        """
        pages = []
        # + 1 so that page number PAGE itself is requested as well.
        for page_number in range(1, self.PAGE + 1):
            url = self.API.format(page_number)
            print("Accessing Page Number :  {}".format(page_number))

            response = requests.get(url, auth=(self.USER_ID, self.PASS))
            response.raise_for_status()
            page = pd.DataFrame(response.json())
            if page.empty:
                break
            pages.append(page)

        if not pages:
            return pd.DataFrame()
        # Concatenate once at the end rather than re-copying on every page.
        return pd.concat(pages, axis=0, ignore_index=True)

    def get_sorted(self, combined_json, sort_by_param):
        """Return ``combined_json`` sorted by ``sort_by_param``, descending."""
        return combined_json.sort_values(by=sort_by_param, ascending=False)

    def get_top_records(self, sorted_repos):
        """Return the first ``TOP_REPO`` rows with a fresh 0..n-1 index."""
        selected_repo = sorted_repos.iloc[:self.TOP_REPO]
        selected_repo = selected_repo.reset_index(drop=True)
        print(selected_repo.shape)
        return selected_repo

    def prepare_table(self, selected_repo):
        """Project the repository frame onto the columns used downstream.

        Returns:
            A new DataFrame with columns ``repo_name``, ``forks_count`` and
            ``contributer_url`` (spelling kept for existing callers) and a
            0..n-1 index.
        """
        data = selected_repo[['name', 'forks_count', 'contributors_url']]
        data = data.rename(columns={
            'name': 'repo_name',
            'contributors_url': 'contributer_url',
        })
        # reset_index returns a new frame; the result has to be used.
        return data.reset_index(drop=True)

    def get_contributor_url(self, data):
        """Fetch the contributor list of every repository in ``data``.

        Args:
            data: frame produced by ``prepare_table`` (needs the columns
                ``repo_name`` and ``contributer_url``).

        Returns:
            A DataFrame with columns ``Repo_Name``, ``Committer_Name`` and
            ``Commit_Count``, one row per (repository, contributor) pair.
        """
        key = ['Repo_Name', 'Committer_Name', 'Commit_Count']
        frames = []

        for url, repo in zip(data['contributer_url'], data['repo_name']):
            # NOTE(review): pd.read_json fetches the URL without the stored
            # credentials, so these requests are unauthenticated.
            contri_json = pd.read_json(url)
            if contri_json.empty:
                continue
            # Build a fresh frame per repository.  Reusing one frame across
            # iterations aligns later repositories to the first one's index,
            # which silently truncates or NaN-pads their contributors.
            frames.append(pd.DataFrame({
                'Repo_Name': repo,
                'Committer_Name': contri_json['login'],
                'Commit_Count': contri_json['contributions'],
            }, columns=key))

        if not frames:
            return pd.DataFrame(columns=key)
        return pd.concat(frames, ignore_index=True)

    def get_commit_chart(self, commit_data):
        """Sort by commit count (descending) and group by repository.

        Returns:
            A pandas GroupBy object keyed on ``Repo_Name``.  Rows keep their
            sorted order inside each group, so ``.head(n)`` on the result
            yields the ``n`` top committers of every repository.
        """
        ranked = commit_data.sort_values(by=['Commit_Count'], ascending=False)
        return ranked.groupby('Repo_Name')

### Execution

In [None]:
import getpass
import sys

# Script entry point: prompt for credentials, validate them against the
# GitHub API, then print the top three committers of each of the five
# most-forked Google repositories.
if __name__ == "__main__":
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    username = read_line("Enter GitHub UserName/EmailID \t")
    # getpass keeps the password from being echoed to the terminal.
    password = getpass.getpass("Enter GitHub Password  \t")
    obj = GitHubRepo(username, password)

    # check_valid_input returns True on success and an error message (a
    # non-empty, hence truthy, string) otherwise, so test identity with True.
    status = obj.check_valid_input()
    if status is not True:
        print("{} \nPlease try again...".format(status))
        # sys.exit must be *called*; a bare reference to it does nothing.
        sys.exit(1)

    # Download the complete repository listing.
    data = obj.get_url_response()

    # Sort the data set on the 'forks_count' field.
    sortdata = obj.get_sorted(data, 'forks_count')

    # Keep the top 5 sorted records.
    selected_repo = obj.get_top_records(sortdata)

    data_table = obj.prepare_table(selected_repo)

    commit_data = obj.get_contributor_url(data_table)

    final_result = obj.get_commit_chart(commit_data)
    # Print the top committers of every repository with their commit counts.
    print(final_result.head(3))

## Output should look something like this

In [None]:
              Repo_Name Committer_Name  Commit_Count
60                  guava        cpovirk        1144.0
45   material-design-lite     addyosmani         697.0
30               protobuf         jskeet         569.0
46   material-design-lite          surma         555.0
31               protobuf        xfxyjwf         530.0
61                  guava        kluever         496.0
47   material-design-lite         sgomes         473.0
62                  guava       cgdecker         421.0
32               protobuf          pherl         368.0
15                iosched    PaulRashidi         145.0
16                iosched   freewheelnat         129.0
17                iosched         tjohns          50.0
0   material-design-icons       jestelle          37.0
1   material-design-icons       shyndman          36.0
2   material-design-icons     addyosmani          12.0

Unnamed: 0,Repo_Name,Committer_Name,Commit_Count
60,guava,cpovirk,1144.0
45,material-design-lite,addyosmani,697.0
30,protobuf,jskeet,569.0
46,material-design-lite,surma,555.0
31,protobuf,xfxyjwf,530.0
61,guava,kluever,496.0
47,material-design-lite,sgomes,473.0
62,guava,cgdecker,421.0
32,protobuf,pherl,368.0
15,iosched,PaulRashidi,145.0


### Point to note here is

#### Given that top committers are those who can commit to the master repo, not the merge committers, the dataset is filtered based on top contributors and their total commits.