In [1]:
import requests
import os
import json
import pymongo
from dotenv import load_dotenv
from pprint import pprint
import datetime
from dateutil import parser
from github3 import GitHub
import github3
import time
import pandas as pd
import re


In [2]:
load_dotenv()
# GitHub personal access token, read from the environment (.env) rather than hard-coded.
api_key = os.getenv("github_api_key_2")
# Send the Authorization header only when a token is actually configured:
# interpolating a missing token would produce the literal, invalid header "token None".
headers = {"Authorization": f"token {api_key}"} if api_key else {}

# scraping

In [3]:
def get_user_details(user_name):
    """Fetch a GitHub user's public profile and the repositories they starred.

    Args:
        user_name: GitHub login of the user to look up.

    Returns:
        A one-element list holding a dict with the profile fields ``login``,
        ``name``, ``bio``, ``company``, ``location``, ``email``,
        ``public_repos``, ``following_count``, ``followers_count``,
        ``created_at``, ``avatar_url`` and ``profile_url``, plus
        ``user_starred_repo``: a comma-separated string of starred repository
        full names, or ``None`` when none were returned.

    Raises:
        requests.HTTPError: if the profile request does not succeed.
    """
    user_url = f'https://api.github.com/users/{user_name}'
    response = requests.get(user_url, headers=headers)
    # Surface the real HTTP failure instead of a KeyError on the error payload.
    response.raise_for_status()
    user_json = response.json()

    starred_url = user_json['starred_url'].replace('{/owner}{/repo}', '')
    # Use the same auth headers as the profile request; this call used to be
    # sent anonymously. No paging parameters are sent, so only the first page
    # of the starred list is read.
    response = requests.get(starred_url, headers=headers)
    star_list = []
    if response.status_code == 200:
        star_list = [repo['full_name'] for repo in response.json()]
    else:
        print("Failed to fetch starred repositories. Status code:", response.status_code)

    return [{
        'login': user_json['login'],
        'name': user_json['name'],
        'bio': user_json['bio'],
        'company': user_json['company'],
        'location': user_json['location'],
        'email': user_json['email'],
        'public_repos': user_json['public_repos'],
        'following_count': user_json['following'],
        'followers_count': user_json['followers'],
        'created_at': user_json['created_at'],
        'avatar_url': user_json['avatar_url'],
        'profile_url': user_json['html_url'],
        # An empty list is stored as None, not as an empty string.
        'user_starred_repo': ','.join(star_list) if star_list else None,
    }]

#get_user_details('navinds')


In [4]:
import re
import requests
import time
from bs4 import BeautifulSoup

def _clean_readme_text(text):
    """Reduce README text to space-separated words without URLs, punctuation or digits."""
    text = re.sub(r'http\S+', '', text)        # drop URLs
    text = re.sub(r'#+|\*{1,3}', '', text)     # drop heading / emphasis markers
    text = re.sub(r'[^\w\s]', ' ', text)       # every other punctuation mark becomes a space
    text = re.sub(r'\d', '', text)             # drop digits
    # Earlier revisions also applied a `\[.*?\]` substitution and a per-token
    # "(...) - word" filter after this point. Both ran after all brackets,
    # parentheses and dashes had already been replaced by spaces, so they could
    # never match; they were removed without changing the output.
    return re.sub(r'\s+', ' ', text).strip()   # collapse whitespace runs


def scrape_repository_readme(repo_full_name, max_chars=3000):
    """Scrape and clean the README shown on a repository's GitHub page.

    Args:
        repo_full_name: Repository in ``owner/name`` form.
        max_chars: Maximum length of the returned text (default 3000).

    Returns:
        The cleaned README text, truncated to ``max_chars`` characters, or
        ``None`` when the page could not be fetched or contains no
        ``<article class="markdown-body">`` element.
    """
    repo_url = f"https://github.com/{repo_full_name}"
    try:
        # A timeout keeps one unresponsive page from stalling a whole crawl.
        response = requests.get(repo_url, timeout=30)
    except requests.RequestException as exc:
        # Treat connection problems like any other failed fetch for this repo.
        print(f"Failed to fetch repository: {repo_full_name} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch repository: {repo_full_name}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    readme_section = soup.find('article', class_='markdown-body')
    if not readme_section:
        print(f"README not found for repository: {repo_full_name}")
        return None

    cleaned_text = _clean_readme_text(readme_section.get_text(separator=' '))
    return cleaned_text[:max_chars]

def get_all_repos(user_name):
    """Return every public repository of ``user_name`` from the GitHub API.

    Pages of 100 repositories are requested until an empty page is returned.
    When the API reports an exhausted rate limit (HTTP 403 with
    ``X-RateLimit-Remaining: 0``) the function sleeps until the advertised
    reset time and retries the same page.

    Args:
        user_name: GitHub login whose repositories are listed.

    Returns:
        A list of repository dicts as returned by the API.

    Raises:
        requests.HTTPError: for any other unsuccessful response. Previously an
            error payload (a dict) was appended to the result and the loop
            never terminated.
    """
    user_repos_url = f"https://api.github.com/users/{user_name}/repos"
    repos = []
    page = 1
    per_page = 100

    while True:
        params = {'page': page, 'per_page': per_page}
        # Authenticated like the other API calls in this notebook.
        response = requests.get(user_repos_url, headers=headers, params=params)

        rate_limited = (
            response.status_code == 403
            and response.headers.get('X-RateLimit-Remaining') == '0'
        )
        if rate_limited:
            reset_header = response.headers.get('X-RateLimit-Reset')
            if reset_header is not None:
                # Never negative: time.sleep() rejects negative values.
                sleep_time = max(int(reset_header) - int(time.time()) + 5, 0)
            else:
                # No reset time advertised; wait a fixed interval before retrying.
                sleep_time = 60
            time.sleep(sleep_time)
            continue

        response.raise_for_status()
        page_items = response.json()
        if not page_items:
            break

        repos.extend(page_items)
        page += 1

    return repos

def get_all_readme_details(user_name):
    """Collect the cleaned README text of every repository owned by ``user_name``.

    Repositories for which ``scrape_repository_readme`` returns ``None`` are
    left out.

    Returns:
        A list of dicts with the keys ``login``, ``repos_name``, ``repo_url``
        and ``readme``.
    """
    collected = []
    for repository in get_all_repos(user_name):
        readme_text = scrape_repository_readme(repository['full_name'])
        if readme_text is None:
            continue
        collected.append({
            'login': user_name,
            'repos_name': repository["name"],
            'repo_url': repository["html_url"],
            'readme': readme_text,
        })
    return collected

# Example usage:
#readme_details = get_all_readme_details('armjscom')



In [5]:
def fetch_repository_details(repo):
    """Fetch the commit count and language breakdown of one repository.

    Args:
        repo: Repository dict from the GitHub API; its ``commits_url`` and
            ``languages_url`` entries are used.

    Returns:
        A tuple ``(commits_count, languages_used, languages_list)`` where
        ``languages_used`` is a comma-separated ``"Language (bytes)"`` string
        and ``languages_list`` holds the language names. A failed request
        leaves its part at ``0`` / ``""`` / ``[]``.
    """
    commits_count = 0
    commits_url = repo['commits_url'].replace('{/sha}', '')
    # Counting the items of a single response only counted the first page of
    # commits. Requesting one commit per page makes the page number of the
    # rel="last" pagination link equal to the total number of commits.
    response = requests.get(commits_url, headers=headers, params={'per_page': 1})
    if response.status_code == 200:
        last_url = response.links.get('last', {}).get('url', '')
        page_match = re.search(r'[?&]page=(\d+)', last_url)
        if page_match:
            commits_count = int(page_match.group(1))
        else:
            # No "last" link: everything fitted on this single page.
            commits_count = len(response.json())

    language_url = repo['languages_url']
    response = requests.get(language_url, headers=headers)
    languages_used = ""
    languages_list = []
    if response.status_code == 200:
        languages_data = response.json()
        languages_list = list(languages_data.keys())
        languages_used = ','.join(
            f"{lang} ({languages_data[lang]})" for lang in languages_list
        )

    return commits_count, languages_used, languages_list

def get_all_repository_details(user_name):
    """Build one summary record per repository of ``user_name``.

    The user's profile is fetched to obtain ``repos_url``, which is then paged
    through 100 repositories at a time until an empty page is returned. Each
    repository is enriched through ``fetch_repository_details``.

    Args:
        user_name: GitHub login whose repositories are summarised.

    Returns:
        A list of dicts, one per repository, with identification, size,
        timestamps, language and counter fields.

    Raises:
        requests.HTTPError: if the profile request or a page request fails.
            Without this check an error payload was processed as if it were
            data and failed later with an unrelated KeyError/TypeError.
    """
    user_url = f'https://api.github.com/users/{user_name}'
    response = requests.get(user_url, headers=headers)
    response.raise_for_status()
    repos_url = response.json()['repos_url']

    repository_data = []
    page = 1
    per_page = 100

    while True:
        params = {'page': page, 'per_page': per_page}
        response = requests.get(repos_url, headers=headers, params=params)
        response.raise_for_status()
        page_repos = response.json()

        # An empty page marks the end of the listing.
        if not page_repos:
            break

        for repo in page_repos:
            commits_count, languages_used, languages_list = fetch_repository_details(repo)

            repository_data.append({
                'login': repo['owner']['login'],
                'repo_id': repo['id'],
                'repos_name': repo["name"],
                'Language_used': repo['language'],
                'repo_url': repo["html_url"],
                'pushed_at': repo['pushed_at'],
                'size': repo['size'],
                'repos_description': repo["description"],
                'repo_created_at': repo["created_at"],
                'languages_with_count': languages_used,
                'languages_list': ' '.join(languages_list),
                'forks_count': repo["forks_count"],
                'open_issues_count': repo["open_issues"],
                'updated_at': repo['updated_at'],
                'Stargazers': repo['stargazers_count'],
                'Watchers_Counts': repo['watchers_count'],
                'commit_count': commits_count
            })

        page += 1

    return repository_data

# repository_details=get_all_repository_details('krishnaik06')
# repository_details


In [6]:
from urllib.parse import quote_plus

load_dotenv()
# Atlas credentials come from the environment (.env); nothing is hard-coded.
mongo_atlas_user_name = os.getenv("MONGO_ATLAS_USER_NAME")
mongo_atlas_password = os.getenv("MONGO_ATLAS_PASSWORD")
if not mongo_atlas_user_name or not mongo_atlas_password:
    # Fail here with a clear message instead of connecting as "None:None".
    raise RuntimeError(
        "MONGO_ATLAS_USER_NAME and MONGO_ATLAS_PASSWORD must be set (e.g. in .env)"
    )
# Percent-escape the credentials: characters such as '@', ':' or '/' in a
# password would otherwise corrupt the connection URI.
client = pymongo.MongoClient(
    f"mongodb+srv://{quote_plus(mongo_atlas_user_name)}:{quote_plus(mongo_atlas_password)}"
    "@cluster0.ehfepgy.mongodb.net/?retryWrites=true&w=majority"
)
db = client["github"]
# One document per scraped user.
collection = db["github_user_details"]

In [7]:
def get_all_detailsof_user(user_name):
    """Scrape profile, repository and README data for a user and insert it as one document.

    The document has the keys ``user_data``, ``repository_data`` and
    ``readme_data``. Any exception is caught and reported with ``print``;
    nothing is returned.

    Note: a later cell defines another function with this same name, which
    replaces this definition once that cell has been run.
    """
    try:
        document = {
            'user_data': get_user_details(user_name),
            'repository_data': get_all_repository_details(user_name),
            'readme_data': get_all_readme_details(user_name),
        }
        collection.insert_one(document)
        print(f"Data inserted for user: {user_name}")
    except Exception as error:
        print(f"Failed to process user {user_name}. Error: {error}")
#get_all_detailsof_user('Python-Repository-Hub')

In [7]:
def get_all_detailsof_user(user_name):
    """Refresh the stored ``readme_data`` of an existing user document.

    The README details are scraped again and written to the document whose
    ``user_data.login`` equals ``user_name``. Any exception is caught and
    reported with ``print``; nothing is returned.

    Args:
        user_name: GitHub login of the user whose README data is refreshed.
    """
    try:
        readme = get_all_readme_details(user_name)

        result = collection.update_one(
            {'user_data.login': user_name},
            {'$set': {'readme_data': readme}}
        )

        # Only report success when a document was actually matched; before,
        # "updated" was printed even if no document existed for the user.
        if result.matched_count:
            print(f"Readme data updated for user: {user_name}")
        else:
            print(f"No stored document found for user: {user_name}; nothing updated")
    except Exception as e:
        print(f"Failed to process user {user_name}. Error: {e}")


#get_all_detailsof_user('charliecalvert')

In [None]:
def update_all_detailsof_user(user_name):
    """Re-scrape all data for a user and upsert it into the collection.

    Profile, repository and README data are collected and written with a
    single ``update_one(..., upsert=True)``. Any exception is caught and
    reported with ``print``; nothing is returned.

    Args:
        user_name: GitHub login of the user to refresh.
    """
    try:
        user = get_user_details(user_name)
        repository = get_all_repository_details(user_name)
        readme = get_all_readme_details(user_name)

        collection.update_one(
            # Match on the login stored inside ``user_data`` (the key the
            # README-refresh path in this notebook also filters on). The
            # previous ``user_name`` filter never matched documents written by
            # ``insert_one``, so the upsert created a second document.
            {'user_data.login': user_name},
            {'$set': {
                # Keep writing ``user_name`` so upserted documents retain the
                # top-level field the old filter used to add.
                'user_name': user_name,
                'user_data': user,
                'repository_data': repository,
                'readme_data': readme,
            }},
            upsert=True)

        print(f"Data updated for user: {user_name}")
    except Exception as e:
        print(f"Failed to update user {user_name}. Error: {e}")
#update_all_detailsof_user('Shobana1310')

In [9]:
import os
import github3
import logging
from dotenv import load_dotenv
import asyncio
import httpx

load_dotenv()

# Write progress and errors of the collection run to a log file.
logging.basicConfig(filename='data_collection.log', level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Tokens are read from github_api_key_1 .. github_api_key_5. Unset variables
# are skipped so that key rotation can never select a missing (None) token.
api_keys = [
    key
    for key in (os.getenv(f"github_api_key_{i}") for i in range(1, 6))
    if key
]
if not api_keys:
    raise RuntimeError(
        "No GitHub API key configured: set github_api_key_1 .. github_api_key_5 (e.g. in .env)"
    )
# Index into api_keys of the token currently in use.
current_key_index = 0
# Serialises key rotation between coroutines.
api_keys_lock = asyncio.Lock()

def initialize_github_client():
    """Return a github3 client logged in with the token at ``current_key_index``."""
    active_token = api_keys[current_key_index]
    return github3.login(token=active_token)

# Module-level client; the coroutines below rebind it through ``global gh``
# whenever the API key is rotated.
gh = initialize_github_client()

async def switch_github_client():
    """Advance to the next API key (wrapping around) and return a client for it.

    The rotation happens while holding ``api_keys_lock`` and updates the
    module-level ``current_key_index``.
    """
    global current_key_index
    async with api_keys_lock:
        next_index = (current_key_index + 1) % len(api_keys)
        current_key_index = next_index
        rotated_client = github3.login(token=api_keys[next_index])
        logging.info(f"Switched to API key index {current_key_index}")
        return rotated_client

async def fetch_user_details(client, user_name, retries=3, backoff_factor=0.3):
    """Check that a GitHub user exists, then collect and store that user's data.

    The profile endpoint is requested up to ``retries`` times. A 404 ends the
    attempt, an exhausted rate limit rotates the API key, and any other
    non-200 status is retried after an exponential back-off. On success the
    synchronous ``get_all_detailsof_user`` is run for the user.

    Args:
        client: Async HTTP client exposing ``await client.get(url, headers=...)``.
        user_name: GitHub login to process.
        retries: Maximum number of profile requests.
        backoff_factor: Base delay in seconds; attempt ``n`` waits
            ``backoff_factor * 2 ** n``.

    Returns:
        None. Exceptions are logged, not raised.
    """
    global gh
    try:
        url = f"https://api.github.com/users/{user_name}"
        headers = {
            'Authorization': f'token {api_keys[current_key_index]}',
            'Accept': 'application/vnd.github.v3+json'
        }
        for attempt in range(retries):
            response = await client.get(url, headers=headers)
            if response.status_code == 404:
                logging.warning(f"User {user_name} not found.")
                return
            elif response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers and int(response.headers['X-RateLimit-Remaining']) == 0:
                logging.warning("Rate limit exceeded. Switching API key and retrying...")
                gh = await switch_github_client()
                headers['Authorization'] = f'token {api_keys[current_key_index]}'
                await asyncio.sleep(backoff_factor * (2 ** attempt))
                continue
            elif response.status_code != 200:
                logging.error(f"Error fetching user {user_name}: {response.status_code}")
                await asyncio.sleep(backoff_factor * (2 ** attempt))
                continue
            # The profile response only serves as an existence check; the
            # collection itself is synchronous (blocking HTTP and database
            # calls), so run it in a worker thread to keep the event loop and
            # the other users of the batch moving.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, get_all_detailsof_user, user_name)
            logging.info(f"Processed user: {user_name}")
            return
        # Every attempt was consumed without a usable response.
        logging.error(f"Giving up on user {user_name} after {retries} attempts.")
    except Exception as e:
        logging.error(f"Exception while fetching user {user_name}: {e}")

async def process_user_batch(usernames):
    """Run ``fetch_user_details`` concurrently for every name in ``usernames``.

    All requests of the batch share one HTTP/2-enabled ``httpx.AsyncClient``,
    which is closed when the batch has finished.
    """
    async with httpx.AsyncClient(http2=True) as http_client:
        pending = []
        for name in usernames:
            pending.append(fetch_user_details(http_client, name))
        await asyncio.gather(*pending)

async def proactive_rate_limit_check(threshold=10, interval=5):
    """Periodically rotate the API key before the current one runs out.

    Runs until cancelled. On every pass the remaining request budget of the
    module-level ``gh`` client is read; when it is below ``threshold`` the
    client is replaced through ``switch_github_client``.

    Args:
        threshold: Rotate when fewer than this many requests remain
            (default 10).
        interval: Seconds to wait between checks (default 5).
    """
    global gh
    loop = asyncio.get_running_loop()
    while True:
        try:
            # gh.rate_limit() is a synchronous call (presumably an HTTP
            # request made by github3 — confirm); run it in a worker thread so
            # it cannot stall the event loop.
            rate_limit = (await loop.run_in_executor(None, gh.rate_limit))['rate']
            remaining = rate_limit['remaining']
        except asyncio.CancelledError:
            # Cancellation must always propagate so the task can be stopped.
            raise
        except Exception as e:
            # A failed check should not silently end the watcher.
            logging.error(f"Rate limit check failed: {e}")
        else:
            if remaining < threshold:
                logging.info("Proactively switching API key due to low remaining requests.")
                gh = await switch_github_client()
        await asyncio.sleep(interval)

async def push_users_to_mongodb(usernames, batch_size):
    """Process ``usernames`` in consecutive batches of ``batch_size``.

    Each batch is handed to ``process_user_batch`` and awaited before the next
    one starts; a log line is written after every batch.

    Args:
        usernames: Sequence of GitHub logins.
        batch_size: Number of users processed concurrently per batch.

    Raises:
        ValueError: if ``batch_size`` is not positive. A negative value used
            to skip every user silently.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    batch_starts = range(0, len(usernames), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        await process_user_batch(usernames[start:start + batch_size])
        logging.info(f"Batch {batch_number} processed.")

async def main():
    """Process the configured users while a background task watches the rate limit.

    The watcher task is always cancelled and awaited when processing ends,
    including when processing raises.
    """
    usernames = ["nskumara"]

    batch_size = 3  # Adjust the batch size as necessary
    logging.info(f"Processing users: {usernames}")

    rate_limit_task = asyncio.create_task(proactive_rate_limit_check())
    try:
        await push_users_to_mongodb(usernames, batch_size)
    finally:
        rate_limit_task.cancel()
        try:
            # Wait for the cancellation to be delivered so the task does not
            # outlive main().
            await rate_limit_task
        except asyncio.CancelledError:
            pass
        except Exception as e:
            # The watcher had already failed on its own; report it instead of
            # masking the outcome of the processing above.
            logging.error(f"Rate limit watcher failed: {e}")


def run():
    """Start ``main()`` on whichever event loop is appropriate.

    Inside an already running loop (for example a notebook kernel) ``main()``
    is scheduled and the resulting task is returned. Otherwise ``main()`` is
    run to completion and ``None`` is returned.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No usable current loop in this thread.
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if loop.is_running():
        # Return the task so the caller can keep a reference to it; the loop
        # itself only holds a weak reference to scheduled tasks.
        return asyncio.ensure_future(main())

    # Only loop acquisition is guarded above: a RuntimeError raised by main()
    # itself now propagates instead of triggering a second full run.
    loop.run_until_complete(main())
    return None


# Keep the scheduled task (None when main() already ran to completion).
main_task = run()


README not found for repository: nskumara/AlgoAprioriPython
README not found for repository: nskumara/Assignment3_CV
README not found for repository: nskumara/Assignment_2_Flex
README not found for repository: nskumara/Assignment_2_Float
README not found for repository: nskumara/Disaster_Preparedness
README not found for repository: nskumara/Disaster_Preparedness2
README not found for repository: nskumara/Disaster_Preparedness_in_Japan
README not found for repository: nskumara/Leadership_Project_Group_18
README not found for repository: nskumara/Sri_Lanka2
README not found for repository: nskumara/Web_Final_Project
README not found for repository: nskumara/Web_Final_Project_1
README not found for repository: nskumara/Welcome_to_Sri_Lanka
Readme data updated for user: nskumara


In [None]:
# NOTE(review): this rebinds the module-level ``api_key`` and ``gh`` names that
# earlier cells also assign — confirm that replacing them here is intended.
api_key =os.getenv("github_api_key_3")
gh = github3.login(token=api_key)

def collect_usernames_by_keyword(keyword, batch_size):
    """Return the logins of the first users found by a GitHub user search.

    Iteration over ``gh.search_users(keyword)`` stops once ``batch_size``
    logins have been collected. The limit is checked after each login is
    added.
    """
    found = []
    for position, user in enumerate(gh.search_users(keyword), start=1):
        found.append(user.login)
        if position >= batch_size:
            break
    return found

# Search term and the maximum number of usernames to collect for it.
keyword = 'Data scientist'  
batch_size = 500
usernames = collect_usernames_by_keyword(keyword, batch_size)
# Print the collected logins, one per line.
for user_name in usernames:
    print(user_name)
  
    

