<a href="https://colab.research.google.com/github/Samin-Sadaf7/Python-Dependency-Resolver/blob/main/PythonDependencyResolver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install requests beautifulsoup4 faiss-cpu sentence-transformers openai numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [3]:
#Import Libraries
import os
import subprocess
import glob
import requests
from bs4 import BeautifulSoup
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI

In [12]:
#######################################
# 1. Repository Handling Functions
#######################################

def clone_repo(github_url):
    """
    Clone the GitHub repository if not already cloned.
    Returns the repository directory name.
    """
    repo_name = github_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-4]
    if os.path.exists(repo_name):
        print(f"Repository '{repo_name}' already exists. Skipping clone.")
        return repo_name
    print(f"Cloning repository from {github_url} ...")
    cmd = ["git", "clone", github_url]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print("Error cloning repository:", result.stderr)
        return None
    return repo_name


def read_codebase(project_dir):
    """
    Read all Python files from a project directory
    """
    code = ""
    for filepath in glob.glob(os.path.join(project_dir, '**', '*.py'), recursive=True):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                code += f"\n# File: {filepath}\n" + f.read() + "\n"
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
    return code


def read_requirements(project_dir):
    """
    Read requirements.txt from a project directory
    """
    req_path = os.path.join(project_dir, 'requirements.txt')
    if os.path.exists(req_path):
        with open(req_path, 'r', encoding='utf-8') as f:
            return f.read()
    return ""

In [13]:
#######################################
# 2. Internet Data Collection Functions
#######################################

def search_web(query):
    """
    Dummy implementation of a web search. In production, use a real search API.
    """
    urls = [
        "https://packaging.python.org/en/latest/",
        "https://www.python.org/dev/peps/pep-0508/",
        "https://pypi.org/"
    ]
    return urls


def collect_data_from_url(url):
    """
    Fetch a URL and extract its text content.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator="\n", strip=True)
            return text
        else:
            print(f"Warning: Received status code {response.status_code} from {url}")
            return ""
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return ""

In [14]:
#######################################
# 3. Embedding and Vector Database
#######################################

class EmbeddingModel:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def encode(self, text):
        return self.model.encode(text)


class VectorDB:
    def __init__(self, embedding_dim):
        self.embedding_dim = embedding_dim
        self.index = faiss.IndexFlatL2(embedding_dim)
        self.texts = []  # To store original texts

    def add(self, embedding, text):
        embedding = np.array(embedding).astype("float32").reshape(1, -1)
        self.index.add(embedding)
        self.texts.append(text)

    def search(self, query_embedding, k=5):
        query_embedding = np.array(query_embedding).astype("float32").reshape(1, -1)
        distances, indices = self.index.search(query_embedding, k)
        results = []
        for i in indices[0]:
            if i < len(self.texts):
                results.append(self.texts[i])
        return results


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-10)


def rerank_results(query_embedding, texts, embedding_model):
    """
    Re-rank texts based on cosine similarity between their embeddings and the query.
    """
    scored_texts = []
    for text in texts:
        text_embedding = embedding_model.encode(text)
        score = cosine_similarity(query_embedding, text_embedding)
        scored_texts.append((text, score))
    scored_texts.sort(key=lambda x: x[1], reverse=True)
    return [text for text, score in scored_texts]

In [15]:
#######################################
# 4. LLM Client Functions
#######################################

def get_dependency_status(context, codebase, package_versions, openai_api_key):
    """
    Ask the LLM whether package version changes are required.
    Expects an answer formatted as:
      Package Version Changes Required: <Yes/No>
      Explanation: <brief explanation>
    """
    from openai import OpenAI

    client = OpenAI(api_key=openai_api_key)

    system_message = """You are an expert Python dependency resolver. Analyze the given codebase and
    requirements to determine if package version changes are needed to resolve dependency issues."""

    user_message = f"""
    Relevant context from documentation and discussions:
    {context}

    Codebase:
    {codebase}

    Current package requirements:
    {package_versions}

    Answer in format:
    Package Version Changes Required: <Yes/No>
    Explanation: <brief explanation>"""

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        temperature=0.2,
        max_tokens=150
    )
    return response.choices[0].message.content.strip()


def get_new_requirements(context, codebase, package_versions, openai_api_key):
    """
    Ask the LLM to generate an updated requirements.txt file.
    If no changes are needed, the LLM should output the original file.
    """
    from openai import OpenAI

    client = OpenAI(api_key=openai_api_key)

    system_message = """You are an expert Python dependency resolver. Generate an updated requirements.txt
    that resolves dependency conflicts. If no changes needed, return the original content."""

    user_message = f"""
    Relevant context from documentation and discussions:
    {context}

    Codebase:
    {codebase}

    Current package requirements:
    {package_versions}

    Output only valid requirements.txt contents:"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        temperature=0.2,
        max_tokens=500
    )
    return response.choices[0].message.content.strip()

In [16]:
#######################################
# 5. Main Pipeline
#######################################

def run_pipeline(github_url, openai_api_key):
    # Clone the main repository
    repo_dir = clone_repo(github_url)
    if not repo_dir:
        print("Failed to clone repository. Exiting.")
        return

    # Path to Dataset directory
    dataset_dir = os.path.join(repo_dir, "Dataset")
    if not os.path.exists(dataset_dir):
        print("Dataset directory not found. Exiting.")
        return

    # Process each project in the Dataset directory
    for project_name in os.listdir(dataset_dir):
        project_dir = os.path.join(dataset_dir, project_name)

        # Skip non-directories and files without "project" in name
        if not os.path.isdir(project_dir) or "project" not in project_name.lower():
            continue

        print(f"\n{'='*40}")
        print(f"Processing project: {project_name}")
        print(f"{'='*40}")

        # Read project contents
        codebase = read_codebase(project_dir)
        package_versions = read_requirements(project_dir)

        if not codebase:
            print(f"No Python code found in {project_name}. Skipping.")
            continue

        # Collect contextual information (existing implementation)
        query = "Python dependency resolution best practices and package compatibility"
        urls = search_web(query)
        collected_texts = [collect_data_from_url(url) for url in urls]
        collected_texts = [text for text in collected_texts if text]

        # Build context (existing implementation)
        context = ""
        if collected_texts:
            embedding_model = EmbeddingModel()
            embedding_dim = 384
            vector_db = VectorDB(embedding_dim=embedding_dim)
            for text in collected_texts:
                emb = embedding_model.encode(text)
                vector_db.add(emb, text)

            query_embedding = embedding_model.encode(query)
            search_results = vector_db.search(query_embedding, k=5)
            reranked_results = rerank_results(query_embedding, search_results, embedding_model)
            context = "\n\n".join(reranked_results)

        # Get dependency status
        status_response = get_dependency_status(context, codebase, package_versions, openai_api_key)
        print("\n=== Dependency Status ===")
        print(status_response)

        # Generate new requirements
        new_requirements = get_new_requirements(context, codebase, package_versions, openai_api_key)

        # Save results per project
        output_dir = os.path.join(repo_dir, "generated_requirements")
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{project_name}_requirements.txt")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(new_requirements)

        print(f"\n=== Generated requirements saved to: {output_path} ===")

    print("\nProcessing completed for all projects in Dataset folder.")

In [17]:
from google.colab import userdata
OpenAI_API_KEY = userdata.get('OpenAIKey')

In [18]:
run_pipeline(
    github_url="https://github.com/Samin-Sadaf7/Python-Dependency-Resolver.git",
    openai_api_key= OpenAI_API_KEY
)

Cloning repository from https://github.com/Samin-Sadaf7/Python-Dependency-Resolver.git ...

Processing project: tensorflow_project


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


=== Dependency Status ===
Package Version Changes Required: Yes

Explanation: The current package requirements specify `tensorflow==2.5.0` and `keras==3.0.0`. However, in the provided codebase, the import statement `from tensorflow.keras import layers` suggests the usage of TensorFlow's integrated Keras module. In TensorFlow 2.5.0, Keras is included as part of TensorFlow itself, and the standalone `keras` package is not needed. Therefore, to resolve this dependency issue and ensure compatibility, the `keras` package should be removed from the requirements, and only `tensorflow==2.5.0` should be specified.

=== Generated requirements saved to: Python-Dependency-Resolver/generated_requirements/tensorflow_project_requirements.txt ===

Processing project: pandas_project

=== Dependency Status ===
Package Version Changes Required: Yes
Explanation: The current package requirements specify numpy version 2.0.0, which is not a valid version. The correct version should be numpy==1.21.0 to match