# Projet

Par Ulysse ARNAUD et Ryane ABBACHE

On utilisera l'API de GitHub.

In [None]:
# Install the pymongo driver (with its `srv` extra) into the notebook environment.
!pip install 'pymongo[srv]'

In [None]:
from pymongo import MongoClient
import pandas

# Connection target handed to MongoClient.
# NOTE(review): "mongo" is presumably a hostname provided by the local
# environment (e.g. a container name) — confirm.
endpointMongo = "mongo"
# Alternative target: a hosted replica-set URI. Credentials are redacted here and
# must not be committed; NOTE(review): the password that used to be written on
# this line should be rotated.
# endpointMongo = "mongodb://<user>:<password>@cluster0-shard-00-00.nxr70.mongodb.net:27017,cluster0-shard-00-01.nxr70.mongodb.net:27017,cluster0-shard-00-02.nxr70.mongodb.net:27017/myFirstDatabase?ssl=true&replicaSet=atlas-a985fp-shard-0&authSource=admin&w=majority"
client = MongoClient(endpointMongo)
# Every collection used below lives in the 'projet' database.
database = client['projet']

# Uncomment to run the serverStatus command against the server.
# database.command("serverStatus")

#### Déclaration des endpoints et colonnes

In [None]:
# Registry of GitHub REST endpoints, addressed by dotted path (e.g. 'users.get').
# A leaf is either a ready-to-use URL or a callable building the URL from a login.
# Each top-level key must appear only once: a second 'users' entry would silently
# replace the whole nested mapping and break every 'users.*' lookup.
endpoints = {
    'users': {
        'all': 'https://api.github.com/users',
        'get': lambda user: 'https://api.github.com/users/' + user,
        'following': lambda user: 'https://api.github.com/users/' + user + '/following',
        'repos': lambda user: 'https://api.github.com/users/' + user + '/repos',
    },
}

In [None]:
# Fields to keep for each kind of document, keyed by the name passed to
# initializeDataframe. Each value is a set, so the order below is irrelevant.
columns = {
    'users': {
        'id',
        'node_id',
        'gravatar_id',
        'login',
        'name',
        'company',
        'type',
        'site_admin',
    },
    'repos': {
        'id',
        'node_id',
        'name',
        'full_name',
        'owner',
        'description',
        'homepage',
        'language',
        'license',
        'default_branch',
        'size',
        'private',
        'fork',
        'archived',
        'disabled',
    },
}

#### Initialisation de la base de données

In [None]:
# Modules qui seront utiles par la suite :
#   - get permettant la récupération de données depuis une API
#   - types permettant de faire des comparaisons sur les types de Python

import types
from requests import get
from requests.auth import HTTPBasicAuth

In [None]:
# Handles on the two collections that store the data pulled from the GitHub API.

collection_users = database['users']
collection_repos = database['repositories']  # kept at hand, just in case

In [None]:
# getEndpoint: resolve a dotted path against the `endpoints` registry
# example:
# getEndpoint('users.get', 'mexanga')
def getEndpoint(endpoint, *args):
    """Return the value registered under the dotted path *endpoint*.

    The path is walked one segment at a time through `endpoints`. The lookup is
    best-effort: a segment that cannot be resolved is skipped, so an unknown
    path yields the deepest value reached, or None when nothing matched.

    Args:
        endpoint: Dotted path such as 'users.repos'.
        *args: Forwarded to the resolved entry when it is a callable URL builder.

    Returns:
        The resolved value (normally a URL string), or None.
    """
    resolved = None

    for segment in endpoint.split('.'):
        # Until a segment has been resolved, lookups start at the registry root.
        container = endpoints if resolved is None else resolved
        try:
            resolved = container[segment]
        except (LookupError, TypeError):
            # Unknown key, or the current value cannot be indexed by a string:
            # keep what was resolved so far and move on to the next segment.
            continue

    # URL builders are stored as callables taking the path arguments.
    if callable(resolved):
        resolved = resolved(*args)

    return resolved

# requestEndpoint: call a registered endpoint and return the decoded JSON
# example:
# requestEndpoint('users.get', 'mexanga')
def requestEndpoint(endpoint, *argument):
    """Fetch a registered endpoint with HTTP basic auth and decode its JSON body.

    Credentials are taken from the GITHUB_USER and GITHUB_TOKEN environment
    variables when they are set, and fall back to the values historically
    hard-coded in this notebook otherwise.

    Args:
        endpoint: Dotted path understood by getEndpoint (e.g. 'users.repos').
        *argument: Arguments forwarded to the endpoint's URL builder.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    import os  # local import so this cell does not depend on another cell's imports

    url = getEndpoint(endpoint, *argument)

    # NOTE(review): the fallback token below is committed in plain text and
    # should be revoked; set GITHUB_USER / GITHUB_TOKEN instead.
    username = os.environ.get('GITHUB_USER', 'mexanga')
    token = os.environ.get('GITHUB_TOKEN', '8eabe1626601b2987c2ae360582a94be11e9a3de')

    # Without a timeout a stalled connection would block the notebook forever.
    response = get(url, auth=HTTPBasicAuth(username, token), timeout=30)
    return response.json()

# readAndSetColumnsInJson: load an endpoint's JSON and keep only the wanted columns
def readAndSetColumnsInJson(endpoint, columns, *args):
    """Read the JSON served by *endpoint* into a dataframe restricted to *columns*.

    Args:
        endpoint: Dotted path understood by getEndpoint.
        columns: Collection of column names to keep.
        *args: Arguments forwarded to the endpoint's URL builder.

    Returns:
        The dataframe with every column outside *columns* removed.
    """
    # Unpack args so a URL builder receives the login itself rather than a tuple.
    url = getEndpoint(endpoint, *args)
    datas = pandas.read_json(url, orient="records")
    return readDataframeAndSetColumns(datas, columns)

# readDataframeAndSetColumns: restrict an existing dataframe to the given columns
def readDataframeAndSetColumns(dataframe, columns):
    """Hand *dataframe* to setColumnsOfDataframe and return what it returns."""
    return setColumnsOfDataframe(dataframe, columns)

# setColumnsOfDataframe: drop every column that is not in the wanted set
def setColumnsOfDataframe(dataframe,columns):
    """Remove, in place, each column of *dataframe* whose label is not in *columns*.

    Returns the same dataframe object that was passed in.
    """
    unwanted = [label for label in dataframe.columns if label not in columns]
    for label in unwanted:
        dataframe.drop(label, axis=1, inplace=True)
    return dataframe

def dictToDataframe(entries):
    """Build a pandas DataFrame from *entries* (anything the DataFrame constructor accepts)."""
    # The class is spelled `DataFrame`; `pandas.Dataframe` raises AttributeError.
    return pandas.DataFrame(entries)

def initializeDataframe(name, *args, **kwargs):
    """Load the dataframe for *name* from the endpoint given as a keyword.

    Args:
        name: Key into the module-level `columns` mapping.
        *args: Arguments forwarded to the endpoint's URL builder.
        **kwargs: Only `endpoint` (a dotted endpoint path) is read.

    Returns:
        An empty dict when no endpoint is given; otherwise the dataframe with
        'id' renamed to '_id' and duplicate '_id' rows removed (last one kept).
    """
    endpoint = kwargs.get('endpoint')
    if endpoint is None:
        return {}

    frame = readAndSetColumnsInJson(endpoint, columns[name], *args)
    frame.rename(columns={'id': '_id'}, inplace=True)
    return frame.drop_duplicates(subset='_id', keep='last')

def mergeDataframe(name, dataframe):
    """Replace the content of collection *name* with the rows of *dataframe*.

    Args:
        name: Name of the target collection in `database`.
        dataframe: Dataframe whose rows become the collection's documents.
    """
    target = database[name]
    # The collection is wiped first to avoid errors on re-runs, even though
    # this is not the best practice.
    target.drop()
    records = dataframe.to_dict(orient="records")
    # One bulk insert instead of one round-trip per row; insert_many rejects an
    # empty list, hence the guard.
    if records:
        target.insert_many(records)

###  Utilisateurs

#### Initialisation de la collection

In [None]:
def initializeUsersCollection():
    """Load the 'users.all' endpoint and store the result in the 'users' collection."""
    mergeDataframe('users', initializeDataframe('users', endpoint="users.all"))

In [None]:
# Run the users import when this cell is executed.
initializeUsersCollection()

#### Récupération de tous les utilisateurs

In [None]:
def getAllUsers(**kwargs):
    """Return the documents of the 'users' collection.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.
        limit: When an int, caps the number of documents returned.

    Returns:
        A cursor, or a list when `to_list` is requested.
    """
    as_list = kwargs.get('to_list', False)
    max_count = kwargs.get('limit', False)

    cursor = database['users'].find()

    # Exact type check on purpose: the default False is a bool and must not be
    # treated as a limit.
    if type(max_count) == int:
        cursor = cursor.limit(max_count)

    return list(cursor) if as_list == True else cursor

In [None]:
# Example: at most three users, returned as a list.
getAllUsers(to_list=True,limit=3)

#### Récupération d'un utilisateur via le pseudo

In [None]:
def getUserByPseudo(pseudo, **kwargs):
    """Find the documents of the 'users' collection whose 'login' equals *pseudo*.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.

    Returns:
        A cursor over the matches, or a list when `to_list` is requested.
    """
    as_list = kwargs.get('to_list', False)

    query = {'login': pseudo}
    matches = database['users'].find(query)

    return list(matches) if as_list == True else matches

In [None]:
# Example: look up the login 'defunkt', returned as a list.
getUserByPseudo('defunkt', to_list=True)

###  Répertoires

#### Initialisation de la collection

In [None]:
def initializeReposCollection(**kwargs):
    """Fill the 'repositories' collection from the GitHub API.

    Keyword Args:
        pseudo: Optional login. When given, only that user's repositories are
            loaded through the dataframe pipeline, which replaces the
            collection. When omitted, the repositories of every stored user
            are fetched and inserted, each tagged with its user's `_id` under
            the 'user_id' key.
    """
    # pymongo is already a dependency of this notebook.
    from pymongo.errors import PyMongoError

    pseudo = kwargs.get('pseudo')

    if pseudo is not None:
        repos = initializeDataframe('repos', pseudo, endpoint="users.repos")
        mergeDataframe('repositories', repos)
        return

    # Insert into the real collection: initializeDataframe() without an endpoint
    # returns a plain dict, which has no insert_one().
    # NOTE(review): nothing is dropped first, so re-running appends the same
    # repositories again — decide whether the collection should be reset here.
    collection = database['repositories']

    for user in getAllUsers():
        repos = getReposOfUsersFromAPI(user['login'])
        if not isinstance(repos, list):
            # Not a list of repositories (presumably an API error payload):
            # skip this user rather than iterating over the wrong structure.
            continue
        for repo in repos:
            if not isinstance(repo, dict):
                continue
            repo['user_id'] = user['_id']
            try:
                collection.insert_one(repo)
            except PyMongoError:
                # Best effort: one rejected document must not abort the import.
                continue
        
def getReposOfUsersFromAPI(pseudo):
    """Return the decoded JSON of the 'users.repos' endpoint for the login *pseudo*."""
    payload = requestEndpoint('users.repos', pseudo)
    return payload

In [None]:
# Run the repository import with no `pseudo`, i.e. the branch that walks every stored user.
initializeReposCollection()

#### Tous les répertoires

In [None]:
def getAllRepos(**kwargs):
    """Return every document of the 'repositories' collection.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.

    Returns:
        A cursor, or a list when `to_list` is requested.
    """
    as_list = kwargs.get('to_list', False)

    everything = database['repositories'].find({})

    return list(everything) if as_list == True else everything

In [None]:
# Example: every stored repository, returned as a list.
getAllRepos(to_list=True)

#### Répertoires d'un utilisateur

In [None]:
# TODO: make sure repositories are reliably linked to their user
# TODO: use .aggregate() to select on "users" (by login) and "repositories" in one query?
def getReposOfUserByPseudo(pseudo, **kwargs):
    """Return the repositories whose 'user_id' is the `_id` of the user *pseudo*.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.

    Returns:
        A cursor over the repositories (a list when `to_list` is requested),
        or an empty list when no user has that login.
    """
    as_list = kwargs.get('to_list', False)

    # getUserByPseudo returns a cursor unless a list is requested; a list is
    # needed to read the `_id` of the first match.
    matches = getUserByPseudo(pseudo, to_list=True)
    if not matches:
        # Unknown login: querying with a missing id would match unrelated documents.
        return []

    repos = database['repositories'].find({'user_id': matches[0]['_id']})

    if as_list == True:
        repos = list(repos)

    return repos

In [None]:
# Example call for the login 'mexanga', asking for a list.
getReposOfUserByPseudo('mexanga', to_list=True)

#### Nombre de répertoires par utilisateur

In [None]:
# TODO: make sure repositories are reliably linked to their user
# TODO: for a nicer display, show the login rather than the user id
def getNumberOfReposByUsers(**kwargs):
    """Count the documents of the 'repositories' collection per 'user_id'.

    Keyword Args:
        to_list: When equal to True, the result cursor is consumed into a list.

    Returns:
        A cursor (or list) of documents shaped like {'_id': <user_id>, 'count': <n>}.
    """
    as_list = kwargs.get('to_list', False)

    # A single $group does the counting. Grouping once by user and then again
    # by the resulting _id would see one document per user and always count 1.
    pipeline = [
        {"$group": {"_id": "$user_id", "count": {"$sum": 1}}},
    ]
    numbersOfReposByUser = database['repositories'].aggregate(pipeline)

    if as_list == True:
        numbersOfReposByUser = list(numbersOfReposByUser)

    return numbersOfReposByUser

In [None]:
# Example: per-user repository counts, returned as a list.
getNumberOfReposByUsers(to_list=True)

#### Répertoires les plus populaires

In [None]:
def getFamousRepositories(**kwargs):
    """Return the repositories sorted by 'views' in descending order.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.
        limit: When an int, caps the number of documents returned.

    Returns:
        A cursor, or a list when `to_list` is requested.
    """
    as_list = kwargs.get('to_list', False)
    max_count = kwargs.get('limit', False)

    # NOTE(review): 'views' is not among the fields listed in columns['repos'];
    # confirm that the stored documents actually carry it.
    ranked = database['repositories'].find().sort('views', -1)

    # Exact type check on purpose: the default False is a bool, not a limit.
    if type(max_count) == int:
        ranked = ranked.limit(max_count)

    return list(ranked) if as_list == True else ranked

In [None]:
# Example: at most 100 repositories in the function's sort order, returned as a list.
getFamousRepositories(to_list=True,limit=100)

#### Répertoires les plus populaires d'une personne

In [None]:
# TODO: the query itself still has to be written
def getFamousRepositoriesOfUser(**kwargs):
    """Placeholder for the "most popular repositories of one user" query.

    Keyword Args:
        to_list: When equal to True, a list is returned.
        limit: When an int, meant to cap the number of documents returned.

    Returns:
        While no query is implemented: an empty list when `to_list` is
        requested, otherwise None.
    """
    as_list = kwargs.get('to_list', False)
    max_count = kwargs.get('limit', False)

    # TODO: build the cursor from database['repositories'] here.
    result = None

    if result is None:
        # No query yet: calling .limit() or list() on None would raise.
        return [] if as_list == True else None

    if type(max_count) == int:
        result = result.limit(max_count)

    if as_list == True:
        result = list(result)

    return result

In [None]:
# Example call requesting a list result.
getFamousRepositoriesOfUser(to_list=True)

#### Répertoires populaires par langage

In [None]:
# TODO: the query itself still has to be written
def getFamousReposByLanguage(**kwargs):
    """Placeholder for the "popular repositories per language" query.

    Keyword Args:
        to_list: When equal to True, a list is returned.
        limit: When an int, meant to cap the number of documents returned.

    Returns:
        While no query is implemented: an empty list when `to_list` is
        requested, otherwise None.
    """
    as_list = kwargs.get('to_list', False)
    max_count = kwargs.get('limit', False)

    # TODO: build the cursor from database['repositories'] here.
    result = None

    if result is None:
        # No query yet: calling .limit() or list() on None would raise.
        return [] if as_list == True else None

    if type(max_count) == int:
        result = result.limit(max_count)

    if as_list == True:
        result = list(result)

    return result

In [None]:
# Example call requesting a list result.
getFamousReposByLanguage(to_list=True)

#### Répertoires populaires d'un langage

In [None]:
def getFamousReposOfLanguage(language, **kwargs):
    """Return the repositories written in *language*, most popular first.

    Args:
        language: Value matched against the 'language' field.

    Keyword Args:
        to_list: When equal to True, the cursor is consumed into a list.
        limit: When an int, caps the number of documents returned.

    Returns:
        A cursor, or a list when `to_list` is requested.
    """
    as_list = kwargs.get('to_list', False)
    max_count = kwargs.get('limit', False)

    # Same ordering as getFamousRepositories, so that `limit` keeps the top
    # entries instead of an arbitrary subset.
    result = database['repositories'].find({
        'language': language
    }).sort('views', -1)

    # Exact type check on purpose: the default False is a bool, not a limit.
    if type(max_count) == int:
        result = result.limit(max_count)

    if as_list == True:
        result = list(result)

    return result

In [None]:
# Example: repositories whose language is 'PHP', returned as a list.
getFamousReposOfLanguage('PHP', to_list=True)