In [152]:
import json
import os
from pathlib import Path
from urllib.parse import quote

import requests
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# --- Configuration ---------------------------------------------------------
# The Pinecone API key is read from the environment so it never ends up in the
# notebook file, its outputs, or version control. The key that was previously
# hard-coded on this line must be considered leaked and should be revoked and
# replaced in the Pinecone console.
API_KEY = os.environ.get("PINECONE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before starting the kernel."
    )

# GitHub account name interpolated into the GitHub API URLs built by
# get_repo_link() and echoed in the results returned by query().
owner = "Souradeep028"

# all-MiniLM-L6-v2 emits 384-dimensional vectors, matching the `dimension=384`
# used when the index was created (see the commented-out call below).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
pc = Pinecone(api_key=API_KEY)

# One-time index creation, kept for reference only. Running it requires
# `PodSpec` to be imported from the pinecone package first.
# pc.create_index(
#         name='github-repos', 
#         dimension=384,
#         metric='cosine',
#         spec=PodSpec(
#             environment="gcp-starter"
#           )
#     )
index = pc.Index("github-repos")

In [158]:
def generate_embeddings(texts):
    """Encode ``texts`` with the notebook-level sentence-transformer ``model``.

    Args:
        texts: Input forwarded unchanged to ``model.encode``. In this notebook
            it is either a single query string or a sequence of file contents.

    Returns:
        Whatever ``model.encode`` returns for ``texts``; a progress bar is
        always shown while encoding.
    """
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings

def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any sliceable sequence that supports ``len()``.
        size: Maximum number of items per slice (used as the ``range`` step).

    Yields:
        ``seq[start:start + size]`` for each start offset; the final slice may
        be shorter than ``size``.
    """
    starts = range(0, len(seq), size)
    yield from (seq[start:start + size] for start in starts)

# Suffixes of binary/media/archive files that are not worth embedding.
# Compared against the lower-cased suffix so that e.g. "logo.PNG" is skipped too.
_EXCLUDED_SUFFIXES = frozenset({
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico',
    '.zip', '.rar', '.tar', '.gz',
    '.mp3', '.mp4', '.avi', '.mov',
})
# Exact file names to skip. '.DS_Store' has an empty Path.suffix, so it can only
# be matched by name, never through the suffix list above.
_EXCLUDED_FILE_NAMES = frozenset({'.DS_Store'})
# Path components (relative to the scanned root) that exclude a file.
_EXCLUDED_PATH_PARTS = frozenset({'.git', '.ipynb_checkpoints'})


def _iter_repo_files(root):
    """Yield (repo_name, file_name, path_within_repo, content) per indexable file under ``root``."""
    for file_path in root.rglob('*'):
        if not file_path.is_file():
            continue

        rel_parts = file_path.relative_to(root).parts
        if len(rel_parts) < 2:
            # A file directly under the root is not inside any repository folder.
            continue

        if (
            file_path.suffix.lower() in _EXCLUDED_SUFFIXES
            or file_path.name in _EXCLUDED_FILE_NAMES
            or not _EXCLUDED_PATH_PARTS.isdisjoint(rel_parts)
        ):
            continue

        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {file_path}: {e}")
            continue

        # Join with '/' explicitly so the stored path is identical on every OS.
        yield rel_parts[0], file_path.name, '/'.join(rel_parts[1:]), content


def process_files_and_upload_embeddings(directory_path, chunk_size=10):
    """Embed every text file under ``directory_path`` and upsert it into ``index``.

    The first path component below ``directory_path`` is taken as the
    repository name; the remainder is stored as the file's path inside that
    repository.

    Each vector id is ``"<repo>#<path within repo>"``. Earlier versions used
    ``"<repo>#<file name>"``, which made same-named files in different folders
    of one repository overwrite each other on upsert. Vectors already stored
    under the old ids are not removed by this function.

    Args:
        directory_path: Folder containing one sub-folder per repository.
        chunk_size: Number of files embedded and upserted per batch.

    Returns:
        None. Progress and per-file read errors are printed.
    """
    directory_path = Path(directory_path).resolve()
    if not directory_path.is_dir():
        # Without this check a mistyped path would silently upload nothing.
        print(f"Error processing {directory_path}: not a directory")
        return

    all_files_data = list(_iter_repo_files(directory_path))

    for chunk in chunker(all_files_data, chunk_size):
        repo_names, file_names, rel_paths, texts = zip(*chunk)
        embeddings = generate_embeddings(list(texts))
        vectors = [
            {
                "id": f"{repo_name}#{rel_path}",
                # Send plain Python floats regardless of the embedding container type.
                "values": emb.tolist() if hasattr(emb, "tolist") else list(emb),
                "metadata": {"file_name": file_name, "repo": repo_name, "file_path": rel_path},
            }
            for repo_name, file_name, rel_path, emb in zip(repo_names, file_names, rel_paths, embeddings)
        ]
        index.upsert(vectors=vectors)

# Root folder expected to hold one sub-directory per downloaded repository (the
# first path component below it is used as the repository name). The path is
# relative, so it is resolved against the kernel's current working directory.
# NOTE(review): the "AI:ML" component contains a colon, which is not a legal
# file-name character on Windows; confirm this notebook only runs on macOS/Linux.
directory_path = './Development/AI:ML/downloaded_repos'
# Reads the files under the root, embeds them and upserts the vectors to Pinecone.
process_files_and_upload_embeddings(directory_path)


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.30it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.80it/s]


In [160]:
def get_repo_link(repo_name, timeout=10):
    """Fetch basic details of ``owner``/``repo_name`` from the GitHub REST API.

    Two requests are made: one for the repository itself and one for its
    language breakdown. Both use ``timeout`` so a stalled connection cannot
    hang the notebook kernel.

    Args:
        repo_name: Repository name under the module-level ``owner`` account.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        On success, a dict with the keys ``html_url``, ``ssh_url``,
        ``visibility`` ('public' or 'private'), ``created_at``,
        ``default_branch`` and ``languages`` (list of language names, or
        ``['Error fetching languages']`` when only the second request fails).
        On any failure of the first request (non-200 status or a network
        error), ``{'error': 'Failed to retrieve repository information'}``.
    """
    failure = {'error': 'Failed to retrieve repository information'}
    base_url = f"https://api.github.com/repos/{owner}/{repo_name}"

    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException:
        # Connection errors/timeouts are reported the same way as HTTP errors.
        return failure

    if response.status_code != 200:
        return failure

    data = response.json()
    repo_details = {
        'html_url': data.get('html_url'),
        'ssh_url': data.get('ssh_url'),
        # Anything other than an explicit `false` is treated as private.
        'visibility': 'public' if data.get('private') is False else 'private',
        'created_at': data.get('created_at'),
        # Lets callers build file links without guessing the branch name.
        'default_branch': data.get('default_branch'),
        # Placeholder that is kept if the languages request below fails.
        'languages': ['Error fetching languages'],
    }

    try:
        languages_response = requests.get(f"{base_url}/languages", timeout=timeout)
    except requests.RequestException:
        return repo_details

    if languages_response.status_code == 200:
        repo_details['languages'] = list(languages_response.json().keys())

    return repo_details

def query(query_text, top_k=5):
    """Search the Pinecone index for files relevant to ``query_text``.

    The query is embedded with ``generate_embeddings``, the ``top_k`` nearest
    vectors are fetched with their metadata, and each match is enriched with
    repository details from ``get_repo_link``. Repository details are looked
    up once per distinct repository within a call, not once per match.

    Args:
        query_text: Natural-language search string.
        top_k: Number of matches to request from the index.

    Returns:
        A JSON string (``indent=4``) holding a list of result objects with the
        keys ``file_name``, ``file_url``, ``repository``, ``repo_link``,
        ``ssh_url``, ``languages_used``, ``owner``, ``visibility``,
        ``created_at`` and ``relevance`` (score as a percentage with one
        decimal). ``file_url`` is ``null`` when the repository lookup failed
        or the match carries no ``file_path`` metadata.
    """
    query_embedding = generate_embeddings(query_text).tolist()
    matches = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)['matches']

    repo_cache = {}  # repo name -> details, so each repo is fetched only once
    results = []
    for match in matches:
        metadata = match['metadata']
        repo_name = metadata['repo']
        file_path = metadata.get('file_path')

        if repo_name not in repo_cache:
            repo_cache[repo_name] = get_repo_link(repo_name)
        repo_details = repo_cache[repo_name]

        html_url = repo_details.get('html_url')
        # Fall back to 'main' when the lookup does not report a default branch.
        branch = repo_details.get('default_branch') or 'main'
        if html_url and file_path:
            # Percent-encode so spaces, '#', '?' etc. in paths yield a valid URL.
            file_url = f"{html_url}/tree/{quote(branch)}/{quote(file_path)}"
        else:
            # Avoid emitting a bogus "None/tree/main/..." link.
            file_url = None

        results.append({
            'file_name': metadata['file_name'],
            'file_url': file_url,
            'repository': repo_name,
            'repo_link': html_url,
            'ssh_url': repo_details.get('ssh_url'),
            'languages_used': repo_details.get('languages'),
            'owner': owner,
            'visibility': repo_details.get('visibility'),
            'created_at': repo_details.get('created_at'),
            'relevance': f"{match['score'] * 100:.1f}%"
        })

    return json.dumps(results, indent=4)

# Natural-language question to search the indexed files with. The triple-quoted
# literal keeps its leading and trailing newline, so both are passed to query()
# as part of the text.
query_text = """
What is the easiest way to deploy Next.js app?
"""

# query() returns a JSON string built with json.dumps(indent=4); print() shows
# it with real line breaks instead of an escaped one-line repr.
result = query(query_text)
print(result)



Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.09s/it]


[
    {
        "file_name": "README.md",
        "file_url": "https://github.com/Souradeep028/tic-tac-toe/tree/main/README.md",
        "repository": "tic-tac-toe",
        "repo_link": "https://github.com/Souradeep028/tic-tac-toe",
        "ssh_url": "git@github.com:Souradeep028/tic-tac-toe.git",
        "languages_used": [
            "JavaScript",
            "CSS"
        ],
        "owner": "Souradeep028",
        "visibility": "public",
        "created_at": "2024-03-10T13:27:54Z",
        "relevance": "60.5%"
    },
    {
        "file_name": ".gitignore",
        "file_url": "https://github.com/Souradeep028/tic-tac-toe/tree/main/.gitignore",
        "repository": "tic-tac-toe",
        "repo_link": "https://github.com/Souradeep028/tic-tac-toe",
        "ssh_url": "git@github.com:Souradeep028/tic-tac-toe.git",
        "languages_used": [
            "JavaScript",
            "CSS"
        ],
        "owner": "Souradeep028",
        "visibility": "public",
        "created_at": 