# Pandas + Python + GitHub API
## Index
### 1: Logic One
### 2: Logic Two
### 3: Implementation with results
### 4: Modular Approach for same



## 1: Logic One:
### This logic does not need any authentication for accessing data. Reading all github pages from google repositories. Combined the complete dataset from all pages and sort it according to the parameter fork_count. Then select top 5 repositories. 

#### Syntax:  json_response =pd.read_json(url)
#### Drawback: GitHub API "https://api.github.com/" can allow to hit only 60 Request in one hour for particular IP Address

## 2: Logic Two:
### To solve the above problem we use "Requests" instead of "read_json" a Pandas inbuilt function, which needs authentication for accessing the GitHub API. Once we get the HTML data then convert it into Pandas Dataframe. 

#### Syntax: requests.get(url,auth=(USER_ID,PASS)).content

## 3: Implementation
### For second half of the problem, find the top 3 committe and their total commits for each selected Repo.

#### Given that top committers are those you can commit to master Repo not the merge committer. So filtering out dataset based on top contributors and their total commits.

In [1]:
#Importing Pandas 
import pandas as pd
#Importing Requests
import requests

In [35]:
PAGE=50 # Assuming total pages for any repo is not more than 50
TOPREPO=5 # selecting top 5 repositories 

api = 'https://api.github.com/users/google/repos?page={}'
combined_json=pd.DataFrame()
for page_number in range(1,PAGE):
    url = api.format(page_number)
    print "Accessing Page Number : ", page_number
    
    #Reading JSON response through Pandas function call
    json_response =pd.read_json(url)
    if not (json_response.empty):
        #Combinig the data from all pages
        combined_json=pd.concat([combined_json, json_response], axis=0,ignore_index=True)
        #print len(combined_json['forks_count'])
    else:
        break
    
#Sorting the Dataset
sorted_forks=combined_json.sort_values(by='forks_count',ascending=False)

#Selecting TOP 5 Repositories 
selected_repo=sorted_forks[:TOPREPO]
selected_repo=selected_repo.reset_index(drop=True)
#print selected_repo['forks_count']
print selected_repo.describe
print selected_repo.shape
#print "Repo Name : ", selected_repo['name'], "\t Fork Count : ", selected_repo['forks_count']


Accessing Page Number :  1
Accessing Page Number :  2
Accessing Page Number :  3
Accessing Page Number :  4
Accessing Page Number :  5
Accessing Page Number :  6
Accessing Page Number :  7
Accessing Page Number :  8
Accessing Page Number :  9
Accessing Page Number :  10
Accessing Page Number :  11
Accessing Page Number :  12
Accessing Page Number :  13
Accessing Page Number :  14
Accessing Page Number :  15
Accessing Page Number :  16
Accessing Page Number :  17
Accessing Page Number :  18
Accessing Page Number :  19
Accessing Page Number :  20
Accessing Page Number :  21
Accessing Page Number :  22
Accessing Page Number :  23
Accessing Page Number :  24


HTTPError: HTTP Error 403: Forbidden

In [14]:
print "Total number of Forks for the selected Repositories"
print selected_repo['forks_count']
print selected_repo['name']
print selected_repo.shape
selected_repo=selected_repo.reset_index(drop=True)

Total number of Forks for the selected Repositories
0    5298
1    4823
2    4441
3    4355
4    3388
Name: forks_count, dtype: int64
0    material-design-icons
1                  iosched
2                 protobuf
3     material-design-lite
4                    guava
Name: name, dtype: object
(5, 69)


In [15]:
data=pd.DataFrame(index=None)
data['repo_name']=selected_repo['name']
data['forks_count']=selected_repo['forks_count']
data['contributer_url']=selected_repo['contributors_url']
data.reset_index(drop=True)


Unnamed: 0,repo_name,forks_count,contributer_url
0,material-design-icons,5298,https://api.github.com/repos/google/material-d...
1,iosched,4823,https://api.github.com/repos/google/iosched/co...
2,protobuf,4441,https://api.github.com/repos/google/protobuf/c...
3,material-design-lite,4355,https://api.github.com/repos/google/material-d...
4,guava,3388,https://api.github.com/repos/google/guava/cont...


In [17]:
contri_url=data['contributer_url']
repo_name=data['repo_name']
key=['Repo_Name','Committer_Name','Commit_Count']
commit_data=pd.DataFrame(columns=key,index=None)
#commit_data=pd.DataFrame()
temp_data=pd.DataFrame(columns=key,index=None)

for url,repo in zip(contri_url,repo_name): 
    contri_json =pd.read_json(url)
    #contri_json.apply(func(repo_name)) 
    temp_data['Committer_Name']=contri_json['login']
    temp_data['Repo_Name']= repo
    temp_data['Commit_Count']=contri_json['contributions']
    commit_data=pd.concat([commit_data,temp_data],ignore_index=True)


### Top 3 commits with committer name and respective Repositories


In [18]:
#### Used Groupby and sort on selected data
commit_data.sort_values(by=['Commit_Count'],ascending=False).groupby('Repo_Name').head(3)

Unnamed: 0,Repo_Name,Committer_Name,Commit_Count
60,guava,cpovirk,1138.0
45,material-design-lite,addyosmani,697.0
30,protobuf,jskeet,567.0
46,material-design-lite,surma,555.0
31,protobuf,xfxyjwf,500.0
61,guava,kluever,493.0
47,material-design-lite,sgomes,473.0
62,guava,cgdecker,418.0
32,protobuf,pherl,358.0
15,iosched,PaulRashidi,145.0


## 4: Modular Approach

In [26]:
class GitHubRepo:
    PAGE=50 # Assuming total pages for any repo is not more than 50
    TOP_REPO=5 # selecting top 5 repositories 
    USER_ID='' #Github Username/email_id
    PASS=''  #Github Password
    API='https://api.github.com/users/google/repos?page={}' #Google Github API url; page-wise
    URL='https://api.github.com/users/google/repos'
    
    def __init__(self,username,password):
        self.USER_ID=username
        self.PASS=password
        #self.sort_by_param=sort_by_param
    
    def check_valid_input(self):
        r=requests.get(self.URL,auth=(self.USER_ID,self.PASS))
        code=r.status_code
        if code == 200:
            return True 
        elif code == 404:
            return  "API URL is incorrect!"
        else:
            return "Please provide valid GitHub Credentials"
                
            
    def print_data(self):
        print self.USER_ID, self.PASS
    
    def get_url_response(self):
        combined_json=pd.DataFrame()
        for page_number in range(1,self.PAGE):
            url = self.API.format(page_number)
            print "Accessing Page Number : ", page_number

            #Getting response through request function call
            urlData = requests.get(url,auth=(self.USER_ID,self.PASS)).content
            #Converting response into Pandas DataFrame format
            json_response =pd.read_json(urlData)
            if not (json_response.empty):
                #Combinig the data from all pages
                combined_json=pd.concat([combined_json, json_response], axis=0,ignore_index=True)
                #print len(combined_json['forks_count'])
            else:
                break
    
        return combined_json
    
    def get_sorted(self,combined_json,sort_by_param):
        sorted_repos=combined_json.sort_values(by=sort_by_param,ascending=False)
        return sorted_repos
    
    def get_top_records(self,sorted_repos):
        #Selecting TOP 5 Repositories 
        selected_repo=sorted_repos[:self.TOP_REPO]
        selected_repo=selected_repo.reset_index(drop=True)
        print selected_repo.shape
        #print type(selected_repo), selected_repo
        return selected_repo
        
    def prepare_table(self,selected_repo):
        data=pd.DataFrame(index=None)
        data['repo_name']=selected_repo['name']
        data['forks_count']=selected_repo['forks_count']
        data['contributer_url']=selected_repo['contributors_url']
        data.reset_index(drop=True)
        return data
    
    def get_contributor_url(self,data):
        contri_url=data['contributer_url']
        repo_name=data['repo_name']
        key=['Repo_Name','Committer_Name','Commit_Count']
        commit_data=pd.DataFrame(columns=key,index=None)
        #commit_data=pd.DataFrame()
        temp_data=pd.DataFrame(columns=key,index=None)

        for url,repo in zip(contri_url,repo_name): 
            contri_json =pd.read_json(url)
            #contri_json.apply(func(repo_name)) 
            temp_data['Committer_Name']=contri_json['login']
            temp_data['Repo_Name']= repo
            temp_data['Commit_Count']=contri_json['contributions']
            commit_data=pd.concat([commit_data,temp_data],ignore_index=True)
        return commit_data
    
    def get_commit_chart(self,commit_data):
        #### Used Groupby and sort on selected data
        result=commit_data.sort_values(by=['Commit_Count'],ascending=False).groupby('Repo_Name')
        return result

### Execution

In [27]:
import sys
if __name__=="__main__":
    username=raw_input("Enter GitHub UserName/EmailID \t")
    password=raw_input("Enter GitHub Password  \t")
    #print user_name,password
    obj=GitHubRepo(username,password)
    status=obj.check_valid_input()
    #print status
    if status != True:
        print status, "\nPlease try again..."
        sys.exit
    else:
        data=obj.get_url_response()
        sortdata=obj.get_sorted(data,'forks_count')
        selected_repo=obj.get_top_records(sortdata)
        #print type(selected_repo), selected_repo.shape
        data_table=obj.prepare_table(selected_repo)
        #print type(data_table),data_table
        commit_data=obj.get_contributor_url(data_table)
        final_result=obj.get_commit_chart(commit_data)
        #printing TOP 3 commitee and their commit counts
        print final_result.head(3)

Enter GitHub UserName/EmailID 	git_username
Enter GitHub Password  	git_password
Please provide valid GitHub Credentials 
Please try again...
