# Scraping des données des premiéres 6 mois de 2023

In [4]:
import datetime
import requests
import csv
import pandas as pd
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [8]:
# URL de base de GitHub
base_url = 'https://api.github.com'

# Nom d'utilisateur de GitHub
username = 'your username in github'

# Jeton d'accès personnel 
access_token = 'your token'

# date de début
start_date_str = input("Enter the start date (YYYY-MM-DD): ")
start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()

# date de fin
end_date_str = input("Enter the end date (YYYY-MM-DD): ")
end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

# nombre de repositories par jour
repositories_per_day = int(input("Enter the number of repositories per day: "))

# nom du fichier csv où on va stocker les données extraites
output_file = 'repositories'+ start_date_str + '_' + end_date_str + '_' + str(repositories_per_day) + 'PerDay' + '.csv'

# calcul de nombre de jours
days_to_scrape = (end_date - start_date).days

# Initialisation de la liste de repositories 
repositories = []

# méchanism de retry pour les requêtes d'API
retry_strategy = requests.packages.urllib3.util.retry.Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# Authentification en utilisant le jeton d'accès personnel
headers = {'Authorization': f'token {access_token}'}

# Répétition de l'opération chaque jour
current_date = start_date
while current_date <= end_date:
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Récupérer les repositories en utilisant la pagination
    page = 1
    repositories_fetched = 0
    while repositories_fetched < repositories_per_day:
        # Création d'URL de l'API pour récupérer les repositories créés le jour même et la page spécifique.
        url = f'{base_url}/search/repositories?q=created:{formatted_date}&sort=stars&order=desc&per_page=100&page={page}'

        # Effectuation de la demande d'API avec une logique de répétition
        response = http.get(url, headers=headers)
        data = response.json()

        if 'items' in data:
            # l'extraction des informations relatives au repositories de la réponse
            for item in data['items']:
                repository = {
                    'name': item['name'],
                    'url': item['html_url'],
                    'description': item['description'],
                    'stars': item['stargazers_count'],
                    'created_at': item['created_at'],
                    'language': item.get('language', ''),
                    'forks': item['forks'],
                    'watchers': item['watchers'],
                    'open_issues': item['open_issues'],
                    'owner': item['owner']['login']
                }

                repositories.append(repository)
                repositories_fetched += 1

                if repositories_fetched >= repositories_per_day:
                    break

        page += 1

        if 'next' not in response.links:
            break

    current_date += timedelta(days=1)

# Ecrire les données extraites dans un fichier CSV
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['name', 'url', 'description', 'stars', 'created_at', 'language', 'forks', 'watchers', 'open_issues', 'owner']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(repositories)

print('Repositories scraped and saved successfully!')

Repositories scraped and saved successfully!


In [5]:
df = pd.read_csv('repositories2023-01-01_2023-06-30_100PerDay.csv')
df.head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
0,learn-javascript,https://github.com/sumn2u/learn-javascript,A book that teaches JavaScript,635,2023-01-01T15:16:26Z,HTML,32,635,1,sumn2u
1,NetCodeTop,https://github.com/bianchenglequ/NetCodeTop,收集GitHub上有关.Net、.NetCore有趣、有用、热门的开源项目。,589,2023-01-01T16:52:06Z,,107,589,1,bianchenglequ
2,LinksHub,https://github.com/rupali-codes/LinksHub,LinksHub aims to provide developers with acces...,359,2023-01-01T18:55:44Z,TypeScript,305,359,99,rupali-codes
3,90DaysOfDevOps,https://github.com/LondheShubham153/90DaysOfDe...,This repository is a Challenge for the DevOps ...,352,2023-01-01T11:41:21Z,Python,2274,352,54,LondheShubham153
4,JavaScriptCodingChallenges,https://github.com/jahidulislamzim/JavaScriptC...,Hello JavaScript code newbie! In this reposito...,221,2023-01-01T14:04:36Z,,41,221,0,jahidulislamzim


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18100 entries, 0 to 18099
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         18099 non-null  object
 1   url          18100 non-null  object
 2   description  15239 non-null  object
 3   stars        18100 non-null  int64 
 4   created_at   18100 non-null  object
 5   language     15600 non-null  object
 6   forks        18100 non-null  int64 
 7   watchers     18100 non-null  int64 
 8   open_issues  18100 non-null  int64 
 9   owner        18100 non-null  object
dtypes: int64(4), object(6)
memory usage: 1.4+ MB
