In [None]:
!pip install requests nbformat openai langchain

In [None]:
!pip install openai

In [None]:
!pip install GitPython

In [None]:
!pip install langchain

# Overall Work
This code analyzes code repositories on GitHub to identify the most challenging repository based on technical complexity. It preprocesses the code, calculates complexity scores using code size and duplication impact, and generates a GPT analysis justifying the selection. The script also includes functions for tokenizing code, normalizing variable and function names, and finding duplicated code snippets. The OpenAI and GitHub API keys are utilized for authentication.

In [12]:
import difflib
import os
import re
from io import BytesIO
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from urllib.parse import urlparse

import nbformat
import requests
from nbconvert import PythonExporter
from langchain.llms import openai
# Keep this import after the langchain one: it rebinds the name `openai`
# to the OpenAI SDK package, which is what the rest of the file uses.
import openai

# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. Any key that was ever committed in this
# file must be treated as compromised and revoked with the provider.
openai.api_key = os.environ.get('OPENAI_API_KEY')
github_api_key = os.environ.get('GITHUB_TOKEN')
if not openai.api_key or not github_api_key:
    # Fail early with a clear message instead of a confusing API error later.
    raise RuntimeError(
        'Set the OPENAI_API_KEY and GITHUB_TOKEN environment variables '
        'before running this notebook.'
    )

# Size budget used when splitting code into chunks for GPT.
# NOTE: preprocess_code measures this against the length of the code in
# characters, not against a model token count.
MAX_TOKENS = 1000


def preprocess_code(repository):
    """Clean and tokenize the source text stored under ``repository['code']``.

    Steps, in order:

    1. Notebook JSON is converted to a Python script.
    2. ``//`` and ``/* ... */`` comments are removed.
    3. Whitespace is squeezed inside each line and blank lines are dropped;
       line breaks are kept so that later line-based analysis still works.
    4. ``var`` / ``func`` are normalized by the helper functions.
    5. The text is reduced to its significant Python tokens. Text that cannot
       be tokenized as Python is kept in its cleaned form instead.

    Args:
        repository: Mapping describing a repository. It is modified in place.

    Returns:
        The same ``repository`` object. When it has a string under ``'code'``,
        that entry is replaced by the preprocessed text and a new entry
        ``'code_chunks'`` holds the same text split into line-aligned pieces
        shorter than ``MAX_TOKENS`` characters. Repositories without a string
        ``'code'`` entry are returned untouched.
    """
    if 'code' not in repository:
        return repository

    code = repository['code']
    if not isinstance(code, str):
        # Nothing to preprocess (e.g. None); leave the repository as it is.
        return repository

    # Notebook detection has to see the raw text, before any regex edits.
    code = _notebook_to_python(code)

    # NOTE: these patterns target C-style comments. They also cut everything
    # after a `//` inside strings (URLs) and after Python floor division.
    code = re.sub(r'\/\/.*', '', code)  # Remove single-line comments
    code = re.sub(r'\/\*(\*(?!\/)|[^*])*\*\/', '', code)  # Remove multi-line comments
    code = _squeeze_whitespace(code)

    code = normalize_variable_names(code)
    code = normalize_function_names(code)

    # Imported here because the file-level import only brings in the
    # tokenize() function, not the exception type.
    from tokenize import TokenError
    try:
        code = tokenize_code(code)
    except (TokenError, SyntaxError):
        # Not tokenizable as Python: keep the cleaned text rather than
        # aborting the whole analysis.
        pass
    code = code.strip()

    # 'code' stays a single string because analyze_repository splits it on
    # newlines; the size-limited pieces are exposed separately.
    repository['code'] = code
    repository['code_chunks'] = _split_into_chunks(code, MAX_TOKENS)
    return repository


def _notebook_to_python(text):
    """Return the Python script for notebook JSON, or *text* unchanged otherwise."""
    if not text.lstrip().startswith('{'):
        return text

    import json
    try:
        parsed = json.loads(text)
    except ValueError:
        return text
    # Only hand documents to nbformat that declare a notebook format version.
    if not isinstance(parsed, dict) or 'nbformat' not in parsed:
        return text

    # as_version=4 asks nbformat to convert older notebook formats if needed.
    notebook = nbformat.reads(text, as_version=4)
    python_code, _ = PythonExporter().from_notebook_node(notebook)
    return python_code


def _squeeze_whitespace(text):
    """Collapse whitespace runs inside each line and drop blank lines."""
    squeezed = (' '.join(line.split()) for line in text.splitlines())
    return '\n'.join(line for line in squeezed if line)


def _split_into_chunks(text, limit):
    """Group whole lines of *text* into chunks shorter than *limit* characters."""
    chunks = []
    current = ''
    for line in text.split('\n'):
        # Start a new chunk once adding this line would reach the limit. A
        # single line longer than the limit still becomes its own chunk.
        if current and len(current) + len(line) >= limit:
            chunks.append(current.strip())
            current = ''
        current += line + '\n'
    chunks.append(current.strip())
    return [chunk for chunk in chunks if chunk]


def normalize_variable_names(code):
    """Replace every standalone word ``var`` in *code* with ``normalized_var``.

    Only whole-word matches are rewritten; identifiers that merely contain
    ``var`` (such as ``var_1`` or ``variable``) are left alone.
    """
    standalone_var = re.compile(r'\bvar\b')
    return standalone_var.sub('normalized_var', code)


def normalize_function_names(code):
    """Replace every standalone word ``func`` in *code* with ``normalized_func``.

    Only whole-word matches are rewritten; identifiers that merely contain
    ``func`` (such as ``function_x``) are left alone.
    """
    standalone_func = re.compile(r'\bfunc\b')
    return standalone_func.sub('normalized_func', code)


def tokenize_code(code):
    """Reduce Python source to its significant tokens, one line per source line.

    Only NUMBER, STRING, NAME and OP tokens are kept, so comments, blank
    lines and indentation disappear. The kept tokens of each source line are
    joined with single spaces and the lines are joined with newlines.

    Args:
        code: Python source text.

    Returns:
        The reduced source as a ``str``; empty when no token is kept.

    Raises:
        tokenize.TokenError: Propagated from the tokenizer, e.g. when a
            multi-line string or bracketed expression is never closed.
    """
    kept_types = (NUMBER, STRING, NAME, OP)

    # The text is rebuilt by hand instead of with untokenize(): once the
    # ENCODING token is filtered out, untokenize() returns str rather than
    # bytes, so decoding its result fails for every input.
    tokens_by_row = {}
    for token in tokenize(BytesIO(code.encode('utf-8')).readline):
        if token.type in kept_types:
            # Tokens arrive in source order, so the dict keeps rows ordered.
            tokens_by_row.setdefault(token.start[0], []).append(token.string)

    return '\n'.join(' '.join(parts) for parts in tokens_by_row.values())


def assess_repository(repository):
    """Score a repository by the length of a GPT completion about it.

    The prompt is the repository name and description separated by a newline
    (missing values are treated as empty strings). The returned score is the
    number of space-separated pieces in the stripped completion text, which
    is a placeholder metric rather than a real complexity measure.
    """
    title, blurb = (repository[key] or "" for key in ('name', 'description'))

    # Ask GPT for up to 100 tokens of text continuing the name/description.
    completion = openai.Completion.create(
        engine='text-davinci-003',
        prompt=title + "\n" + blurb,
        max_tokens=100
    )
    generated_text = completion.choices[0].text.strip()

    # Placeholder metric: count of pieces after splitting on single spaces.
    return len(generated_text.split(' '))


def analyze_repository(repository):
    """Return a code-based complexity score for *repository*.

    The score is the number of newline-separated lines in
    ``repository['code']`` plus the duplication impact of that code.
    Repositories without a ``'code'`` entry score 0.
    """
    if 'code' not in repository:
        return 0

    source = repository['code']
    line_total = len(source.split('\n'))
    return line_total + calculate_duplication_impact(source)


def calculate_duplication_impact(code):
    """Return half the number of duplicate hits reported for *code*.

    The 0.5 weight per hit is a placeholder value.
    """
    duplicate_hits = find_duplicated_code(code)
    return len(duplicate_hits) * 0.5


def find_duplicated_code(code):
    """List lines of *code* that closely resemble a later line.

    Every line is compared with each line after it using
    ``difflib.SequenceMatcher``; whenever the similarity ratio exceeds 0.8
    the earlier line is added to the result. A line is therefore listed once
    per matching later line, and empty lines match each other.
    """
    rows = code.split('\n')
    return [
        earlier
        for position, earlier in enumerate(rows)
        for later in rows[position + 1:]
        if difflib.SequenceMatcher(None, earlier, later).ratio() > 0.8
    ]


def get_most_challenging_repository(profile_url):
    """Return the highest-scoring repository of a GitHub user.

    Args:
        profile_url: Profile URL such as ``https://github.com/<username>``.

    Returns:
        The repository dict (as returned by the GitHub API and passed through
        ``preprocess_code``) with the highest combined score, or ``None`` when
        the URL is not a GitHub profile URL, the repository list cannot be
        fetched, or no repository scores above 0.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout).
    """
    # Extract the username: the first non-empty path segment on github.com.
    parsed_url = urlparse(profile_url)
    username = ''
    if parsed_url.netloc.lower() in ('github.com', 'www.github.com'):
        username = next(
            (part for part in parsed_url.path.split('/') if part), ''
        )
    if not username:
        print("Invalid GitHub profile URL.")
        return None

    # Fetch user repositories from GitHub API
    headers = {
        'Authorization': f'token {github_api_key}'
    }
    response = requests.get(
        f'https://api.github.com/users/{username}/repos',
        headers=headers,
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    repositories = response.json() if response.ok else None
    if not isinstance(repositories, list):
        # Error responses are JSON objects, not lists; iterating one would
        # walk its keys and fail later with an unrelated TypeError.
        print(f"Could not fetch repositories for {username} "
              f"(HTTP {response.status_code}).")
        return None

    most_challenging_repository = None
    highest_complexity_score = 0

    for repository in repositories:
        repository = preprocess_code(repository)
        # Combine the code-based score with the GPT-based one; the code-based
        # score is 0 for repositories that carry no 'code' entry.
        total_score = (
            analyze_repository(repository) + assess_repository(repository)
        )

        if total_score > highest_complexity_score:
            highest_complexity_score = total_score
            most_challenging_repository = repository

    return most_challenging_repository


# Example usage
github_profile_url = 'https://github.com/Pritesh-Lathiya'
most_challenging_repo = get_most_challenging_repository(github_profile_url)

if most_challenging_repo:
    # Ask GPT to complete a sentence justifying the selection.
    # NOTE: the prompt contains only the repository name and URL, so the
    # generated justification is not based on the repository's actual code.
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=f"I selected [{most_challenging_repo['name']}]({most_challenging_repo['html_url']}) as the most complex repository because...",
        max_tokens=100
    )
    gpt_analysis = response.choices[0].text.strip()
    print("Most complex repository:", most_challenging_repo['name'])
    # This line prints the URL, so it gets its own label instead of
    # repeating the one used for the name.
    print("Repository URL:", most_challenging_repo['html_url'])
    print("GPT Analysis:", gpt_analysis)
else:
    print("No repositories found.")


Most complex  repository: Simple-Linear-Regression
Most complex  repository: https://github.com/Pritesh-Lathiya/Simple-Linear-Regression
GPT Analysis: Simple-Linear-Regression is a robust machine learning library that enables users to implement the core concepts of linear regression. It has a comprehensive set of features, such as a wide range of linear regression algorithms, support for parallel computation, and the ability to use multiple sources of data. The complexity of the repository comes from its use of advanced algorithms and the vast amount of parameters and settings users can adjust in order to produce the most accurate results.


# **Work Of Code**

* Importing Required Libraries: The script begins by importing the libraries needed for the code analysis: difflib, requests, re, openai, urlparse from urllib.parse, nbformat, PythonExporter from nbconvert, tokenize and untokenize (together with the NUMBER, STRING, NAME and OP token types) from tokenize, and BytesIO from io.

1. Setting API Keys: The script sets the OpenAI API key and the GitHub API key in variables named openai.api_key and github_api_key, respectively. These keys are used to authenticate and access the respective APIs.

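2. Setting Maximum Tokens: The script defines a constant MAX_TOKENS, set to 1000, as the size budget for code that is split into chunks for the GPT model. Note that the preprocessing step compares this value against the length of the code in characters, not against a model token count.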

3. Preprocessing Code: The preprocess_code function takes a repository object as input and performs preprocessing steps on the code within the repository. It removes comments and whitespace, normalizes variable and function names, extracts code snippets from Jupyter notebooks, and handles large file sizes by splitting the code into smaller chunks if needed. The code is then tokenized using Python's tokenize module and converted to a suitable format for analysis. The preprocessed code is returned or updated in the repository object.

4. Normalizing Variable and Function Names: The normalize_variable_names and normalize_function_names functions are helper functions used in the preprocessing step to replace variable and function names with normalized names. In the provided code, they replace occurrences of the keywords "var" and "func" with "normalized_var" and "normalized_func", respectively.

5. Tokenizing Code: The tokenize_code function tokenizes the code using Python's tokenize module. It filters out tokens that are not relevant for analysis (such as indentation) and untokenizes the filtered tokens to obtain the transformed code.

6. Assessing Repository Complexity: The assess_repository function takes a repository object as input and generates a textual description/summary of the repository using the OpenAI GPT (text-davinci-003) model. It calculates the complexity score based on the generated summary, which is currently a placeholder value representing the number of words in the summary.

7. Analyzing Repository: The analyze_repository function takes a repository object as input and calculates the complexity score based on the code size (number of lines of code) and the duplication impact. The duplication impact is calculated using the calculate_duplication_impact function.

8. Calculating Duplication Impact: The calculate_duplication_impact function identifies duplicated code snippets in the code and measures their impact. It uses the find_duplicated_code function, which compares each line of code with all subsequent lines to find similar lines based on a similarity threshold. The impact of code duplication is currently a placeholder value calculated as half the number of duplicated lines.

9. Finding Duplicated Code: The find_duplicated_code function uses the difflib library to find similar lines of code. It iterates over each line of code and compares it with all subsequent lines to find similarities based on a similarity threshold. The similar lines are stored in the duplicated_code list.

10. Getting Most Challenging Repository: The get_most_challenging_repository function takes a GitHub profile URL as input, extracts the username from the URL, and fetches the user's repositories using the GitHub API. It iterates over the repositories, preprocesses the code, calculates the complexity score