# In [None]:  (Jupyter notebook cell marker left over from the export)
import requests
import csv
import time

# Define your GitHub username and personal access token.
# NOTE: replace the placeholders before running; do not commit a real token.
username = 'ADD_YOUR_USERNAME_HERE'
token = 'ADD_YOUR_ACCESS_TOKEN_HERE'

# GitHub API URL to fetch starred repositories
url = f'https://api.github.com/users/{username}/starred'

# Headers for GitHub API request
headers = {
    'Authorization': f'token {token}',
    'Accept': 'application/vnd.github.v3+json'
}

# Function to get the content of the repository
def get_repo_content(repo_full_name):
    """Return the top-level contents listing of a GitHub repository.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form, as
            interpolated into the ``/repos/{repo_full_name}/contents`` URL.

    Returns:
        The decoded JSON body of the contents endpoint when it answers with
        HTTP 200; an empty list on any other status code, on a network
        error, or when the body is not valid JSON.
    """
    print(f"Fetching content for repository: {repo_full_name}")
    content_url = f'https://api.github.com/repos/{repo_full_name}/contents'

    try:
        # A timeout keeps one stalled connection from hanging the whole export.
        content_response = requests.get(content_url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to get contents for {repo_full_name}: {e}")
        return []

    if content_response.status_code != 200:
        print(f"Failed to get contents for {repo_full_name}, status code: {content_response.status_code}")
        return []

    try:
        return content_response.json()
    except ValueError as e:
        # Treat an undecodable body like any other failed listing.
        print(f"Failed to get contents for {repo_full_name}: {e}")
        return []

# Function to split content into chunks, ensuring no split occurs mid-line
def split_content(content, chunk_size=5000):
    """Split text into chunks of at most ``chunk_size`` characters on line boundaries.

    Lines are never cut: a single line longer than ``chunk_size`` becomes a
    chunk of its own and is the only case where a chunk exceeds the limit.
    Line endings are preserved, so ``"".join(result) == content``.

    Args:
        content: The text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list for empty content.
    """
    chunks = []
    current_lines = []
    current_length = 0

    for line in content.splitlines(keepends=True):  # Preserve line breaks
        # Flush only when something has been collected, so an oversized line
        # arriving at an empty buffer does not produce an empty chunk.
        if current_lines and current_length + len(line) > chunk_size:
            chunks.append("".join(current_lines))
            current_lines = []
            current_length = 0
        current_lines.append(line)
        current_length += len(line)

    if current_lines:
        chunks.append("".join(current_lines))

    return chunks

# Function to check if the file is text or binary (like PDF)
def is_text_file(content_type, file_extension):
    """Decide whether a downloaded file should be treated as text.

    Args:
        content_type: Value of the response's Content-Type header.
        file_extension: The file's extension, with or without the leading
            dot ('pdf' or '.pdf'); a full file name is accepted as well.

    Returns:
        False when the extension is one of the known binary types; otherwise
        True only if the content type mentions 'text' or 'json'.
    """
    binary_file_extensions = {'pdf', 'png', 'jpg', 'jpeg', 'gif', 'zip', 'exe', 'tar', 'gz', 'rar'}

    # Keep only what follows the last dot so that 'pdf', '.pdf' and
    # 'report.pdf' are all recognised as the same extension.
    extension = file_extension.lower().rsplit('.', 1)[-1]
    if extension in binary_file_extensions:
        return False

    # Media types are case-insensitive, so compare in lower case.
    normalized_type = content_type.lower()
    return 'text' in normalized_type or 'json' in normalized_type

# Function to fetch starred repositories with pagination and save them to a CSV
def fetch_starred_repos():
    """Export the root-level text files of every starred repository to a CSV.

    Pages through the starred-repositories endpoint at ``url`` until an empty
    page is returned. For each repository, the top-level contents are listed
    with get_repo_content; every entry of type 'file' is downloaded, filtered
    with is_text_file, split with split_content and written to
    'starred_repos.csv' as one row per chunk.

    Returns:
        None. If a page of starred repositories cannot be fetched, the
        function stops early and the CSV keeps whatever was written so far.
    """
    # Seconds to wait on any single HTTP request before giving up.
    request_timeout = 30
    page = 1

    print("Opening CSV file for writing...")
    with open('starred_repos.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, escapechar='\\')
        writer.writerow(['Repo Name', 'Repo URL', 'File Path', 'Chunk Number', 'File Content'])

        while True:
            print(f"Fetching page {page} of starred repositories...")
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    params={'page': page},
                    timeout=request_timeout,
                )
            except requests.RequestException as e:
                print(f"Failed to fetch starred repositories on page {page}: {e}")
                return

            if response.status_code != 200:
                print(f"Failed to fetch starred repositories on page {page}: {response.status_code}")
                return

            starred_repos = response.json()
            print(f"Processing {len(starred_repos)} repositories on page {page}...")

            # An empty page marks the end of the listing.
            if not starred_repos:
                print(f"No more repositories to fetch on page {page}.")
                break

            for repo in starred_repos:
                repo_name = repo['name']
                repo_url = repo['html_url']
                repo_full_name = repo['full_name']

                # Fetch repository content
                for item in get_repo_content(repo_full_name):
                    # Only handle files, not directories
                    if item['type'] != 'file':
                        continue

                    file_path = item['path']
                    file_content_url = item['download_url']
                    if not file_content_url:
                        print(f"Failed to fetch content for {file_path}: no download URL")
                        continue

                    # Fetch the file content
                    try:
                        file_content_response = requests.get(file_content_url, timeout=request_timeout)
                    except requests.RequestException as e:
                        print(f"Failed to fetch content for {file_path}: {e}")
                        continue

                    # Do not store an error page as if it were the file's content.
                    if file_content_response.status_code != 200:
                        print(f"Failed to fetch content for {file_path}, status code: {file_content_response.status_code}")
                        continue

                    file_content_type = file_content_response.headers.get('Content-Type', '')
                    # Pass the extension WITH its leading dot: is_text_file's
                    # binary list is written as '.pdf', '.png', ...
                    file_name = file_path.rsplit('/', 1)[-1]
                    file_extension = ('.' + file_name.rsplit('.', 1)[-1]) if '.' in file_name else ''

                    # Skip binary files like PDFs
                    if not is_text_file(file_content_type, file_extension):
                        print(f"Skipping binary file: {file_path} (type: {file_content_type})")
                        continue

                    # Split the file content into chunks and write them to the CSV
                    chunks = split_content(file_content_response.text, chunk_size=5000)
                    for i, chunk in enumerate(chunks, start=1):
                        writer.writerow([repo_name, repo_url, file_path, i, chunk])

            page += 1
            print(f"Moving to next page: {page}")
            # Brief pause between pages to go easy on the API.
            time.sleep(1)

    print("CSV file written successfully!")

# Run the function to fetch starred repos and generate CSV
fetch_starred_repos()

# google.colab is importable only inside a Colab runtime; anywhere else the
# export above has already written the CSV, so skip the browser download
# instead of crashing with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of starred_repos.csv.")
else:
    print("Downloading CSV file...")
    files.download('starred_repos.csv')
    print("Download initiated.")