# Before running: put your OpenAI API key in openaiAPI.txt and your GitHub API token in githubapi.txt

In [None]:
import os
import requests
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def read_file(filename):
    """Return the UTF-8 text stored in *filename*, stripped of surrounding whitespace."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()


def save_file(filename, content):
    """Write *content* to *filename* as UTF-8, dropping leading/trailing whitespace.

    Any existing file at that path is overwritten.
    """
    trimmed = content.strip()
    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.write(trimmed)


def extract_resume_content(pdf_path):
    """Load the resume PDF, persist its text to 'resume.txt', and return that text.

    The text of every page returned by the loader is joined with newlines.
    """
    pages = PyPDFLoader(pdf_path).load()
    page_texts = (page.page_content for page in pages)
    resume_text = "\n".join(page_texts)

    save_file('resume.txt', resume_text)
    print("Resume content has been saved to 'resume.txt'.")
    return resume_text


def extract_project_names_with_langchain(text):
    """Ask the LLM for the project names in *text* and store them in 'proj_names.txt'.

    Returns the model output split on newlines (one list entry per line).
    """
    # The OpenAI client picks the key up from the environment.
    os.environ['OPENAI_API_KEY'] = read_file('openaiAPI.txt')

    prompt_text = """Extract only the project names from the following text.
        Return only the names, one per line, without any descriptions or bullet points.
        Remove any numbering or special characters from the start of the names.

        Text:
        {text}

        Project Names:"""

    extraction_chain = LLMChain(
        llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
        prompt=PromptTemplate(input_variables=['text'], template=prompt_text),
    )

    raw_names = extraction_chain.run(text)
    save_file('proj_names.txt', raw_names)
    print("Project names have been extracted and saved to 'proj_names.txt'.")

    name_lines = raw_names.strip().split('\n')
    return name_lines


def extract_project_details_with_langchain(text):
    """Ask the LLM for project names plus descriptions and store them in 'projects.txt'.

    Returns the raw model output (not stripped).
    """
    # The OpenAI client picks the key up from the environment.
    os.environ['OPENAI_API_KEY'] = read_file('openaiAPI.txt')

    prompt_text = """Extract the full project details including the names and their descriptions from the following text.
        Keep the structure intact with project names followed by their descriptions.
        Remove any numbering or bullet points from the project names.

        Text:
        {text}

        Projects:"""

    extraction_chain = LLMChain(
        llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
        prompt=PromptTemplate(input_variables=['text'], template=prompt_text),
    )

    details = extraction_chain.run(text)
    save_file('projects.txt', details)
    print("Project details have been extracted and saved to 'projects.txt'.")
    return details


def extract_skills_with_langchain(text):
    """Ask the LLM for the skills mentioned in *text* and store them in 'skills.txt'.

    Returns the model output with surrounding whitespace removed.
    """
    # The OpenAI client picks the key up from the environment.
    os.environ['OPENAI_API_KEY'] = read_file('openaiAPI.txt')

    prompt_text = """Extract all skills mentioned in the following text.
        Return only the skills as a comma-separated list.

        Text:
        {text}

        Skills:"""

    extraction_chain = LLMChain(
        llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
        prompt=PromptTemplate(input_variables=['text'], template=prompt_text),
    )

    raw_skills = extraction_chain.run(text)
    save_file('skills.txt', raw_skills)
    print("Skills have been extracted and saved to 'skills.txt'.")
    return raw_skills.strip()

def get_languages(owner, repo, headers, timeout=30):
    """Get the languages used in a specific GitHub repository.

    Args:
        owner: Login of the repository owner.
        repo: Repository name.
        headers: HTTP headers sent with the request (e.g. the auth token).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the ``/languages`` endpoint.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    api_url = f'https://api.github.com/repos/{owner}/{repo}/languages'
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_all_repos(user, headers, timeout=30):
    """Fetch all repositories of a GitHub user, following pagination.

    Args:
        user: GitHub username.
        headers: HTTP headers sent with every request (e.g. the auth token).
        timeout: Seconds to wait for each page before giving up. Without a
            timeout, requests would block indefinitely on a stalled connection.

    Returns:
        A list with the decoded JSON entries of every page.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout: If a page is not answered within *timeout*.
    """
    api_url = f'https://api.github.com/users/{user}/repos?per_page=100'
    repos = []

    while api_url:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        repos.extend(response.json())

        # The parsed Link header carries the next page URL; absent on the last page.
        api_url = response.links.get('next', {}).get('url')

    return repos

def fetch_and_save_github_info(user, headers, output_filename):
    """Write each repository's name, language byte counts and URL to *output_filename*.

    Repositories are fetched for *user*; one block is written per repository,
    terminated by a line of 80 '=' characters.
    """
    repos = get_all_repos(user, headers)
    print(repos)

    with open(output_filename, 'w', encoding='utf-8') as report:
        for entry in repos:
            name = entry['name']
            url = entry['html_url']
            print(f"Processing repository: {name}")

            # One extra API call per repository for its language breakdown.
            byte_counts = get_languages(entry['owner']['login'], name, headers)

            report.write(f"Repository: {name}\n")
            report.writelines(
                f"{language}: {size} bytes\n"
                for language, size in byte_counts.items()
            )
            report.write(f"Repository URL: {url}\n")
            report.write(f"{'=' * 80}\n")

    print(f"Repository details saved to {output_filename}")


# Driver for the first notebook cell. Everything below runs at module level,
# so the names it defines (e.g. `github_user`, `headers`) stay available to
# the cells that follow.

# Step 1: Extract resume content from PDF and save to 'resume.txt'
pdf_path = 'Saurabh_resume.pdf'  # Replace with your PDF file name
resume_text = extract_resume_content(pdf_path)

# Step 2: Extract project names from resume and save to 'proj_names.txt'
project_names = extract_project_names_with_langchain(resume_text)

# Step 3: Extract project details (names + descriptions) and save to 'projects.txt'
project_details = extract_project_details_with_langchain(resume_text)

# Step 4: Extract skills from resume and save to 'skills.txt'
skills = extract_skills_with_langchain(resume_text)

# Step 5: Fetch GitHub repository information and save to 'github_lang.txt'
# The token is read from 'githubapi.txt' and sent in the Authorization header.
github_token = read_file('githubapi.txt')
headers = {
    'Authorization': f'token {github_token}',
    'Accept': 'application/vnd.github.v3+json'
}
# Prompted interactively; this value is reused when matching repositories later.
github_user = input("Enter GitHub Username: ")
fetch_and_save_github_info(github_user, headers, 'github_lang.txt')

print("All tasks completed successfully!")


In [None]:
import os
import requests
import base64
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
import os
import requests
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def read_file(filename):
    """Return the UTF-8 text stored in *filename*, stripped of surrounding whitespace."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()

def extract_sections_with_langchain(text):
    """
    Extract project names and project descriptions from *text* using LangChain.

    The names are written to 'proj_names.txt' and the descriptions to
    'projects.txt'. Returns the names as a list, one entry per output line.
    """
    # The OpenAI client picks the key up from the environment.
    os.environ['OPENAI_API_KEY'] = read_file('openaiAPI.txt')

    model = ChatOpenAI(temperature=0, model='gpt-3.5-turbo')

    names_template = """
        Extract only the project names from the following text.
        Return only the names, one per line, without any descriptions or bullet points.
        Remove any numbering or special characters from the start of the names.

        Text:
        {text}

        Project Names:"""

    descriptions_template = """
        Extract the projects section with full descriptions from the following text.
        Include the project names and their complete descriptions.

        Text:
        {text}

        Projects:"""

    def build_chain(template):
        # Both extractions share the model and take the resume text as `text`.
        return LLMChain(
            llm=model,
            prompt=PromptTemplate(input_variables=['text'], template=template),
        )

    names_chain = build_chain(names_template)
    descriptions_chain = build_chain(descriptions_template)

    # Two separate model calls: names first, then descriptions.
    names_output = names_chain.run(text)
    descriptions_output = descriptions_chain.run(text)

    outputs = (
        ('proj_names.txt', names_output),
        ('projects.txt', descriptions_output),
    )
    for target, payload in outputs:
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(payload.strip())

    return names_output.strip().split('\n')

def get_proj_names(filename):
    """Read project names from *filename*, one per line.

    The file is decoded as UTF-8, matching how 'proj_names.txt' is written
    elsewhere in this notebook. Surrounding whitespace is stripped and blank
    lines are skipped so they do not end up as empty project names.

    Returns:
        list of str: The non-empty names, in file order. The list is also
        printed.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        names = [name for name in stripped_lines if name]

    print(names)
    return names

def get_repositories(user, headers, project_names, similarity_threshold=0.5, timeout=30):
    """
    Fetch all repositories of a GitHub user and return those with names
    similar to the provided project names based on cosine similarity.

    Names are compared via TF-IDF vectors fitted on the project names and
    repository names together.

    Args:
        user (str): GitHub username.
        headers (dict): Headers for the API request (e.g., authentication token).
        project_names (list of str): List of project names to match against.
        similarity_threshold (float): Minimum similarity score for a repository to be included.
        timeout (float): Seconds to wait for each GitHub response before giving up.

    Returns:
        list of dict: One entry per (project, repository) pair whose similarity
        reaches the threshold, with keys 'project_name', 'repo_name',
        'owner_login', 'similarity' and 'repo_data'. Empty when there are no
        project names or the user has no repositories.

    Raises:
        requests.HTTPError: If a GitHub request returns an error status.
        requests.Timeout: If GitHub does not answer within *timeout*.
    """
    api_url = f'https://api.github.com/users/{user}/repos?per_page=100'
    repos = []

    # Fetch all repositories, following the Link header for pagination.
    while api_url:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        repos.extend(response.json())
        api_url = response.links.get('next', {}).get('url')

    project_names = list(project_names)

    # Nothing to compare: scikit-learn raises on empty inputs, so stop early.
    if not project_names or not repos:
        return []

    repo_names = [repo['name'] for repo in repos]

    # Fit one vocabulary over both groups so their vectors are comparable.
    tfidf_matrix = TfidfVectorizer().fit_transform(project_names + repo_names)

    # The first len(project_names) rows are projects; the rest are repositories.
    split_at = len(project_names)
    similarity_matrix = cosine_similarity(tfidf_matrix[:split_at], tfidf_matrix[split_at:])

    # Collect every (project, repository) pair at or above the threshold.
    similar_repos = []
    for project_name, scores in zip(project_names, similarity_matrix):
        for repo, similarity in zip(repos, scores):
            if similarity >= similarity_threshold:
                similar_repos.append({
                    'project_name': project_name,
                    'repo_name': repo['name'],
                    'owner_login': repo['owner']['login'],
                    'similarity': similarity,
                    'repo_data': repo
                })

    return similar_repos

def get_repo_contents(owner, repo, path='', headers=None, timeout=30):
    """
    Recursively fetch code files from a repository using GitHub API.

    Only files whose extension is in the allow-list are downloaded; empty
    files and files larger than 1MB are skipped. Errors are printed and the
    files collected so far are returned, so one bad entry does not abort the
    whole crawl.

    Args:
        owner (str): Login of the repository owner.
        repo (str): Repository name.
        path (str): Path inside the repository to list; '' is the root.
        headers (dict): Headers sent with every request (e.g. the auth token).
        timeout (float): Seconds to wait for each response before giving up,
            so a stalled connection cannot hang the crawl.

    Returns:
        list of dict: One ``{'path': ..., 'content': ...}`` entry per
        downloaded file.
    """
    contents = []
    allowed_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.html', '.css', '.ts', '.go', '.rb', '.php'}  # Add other extensions as needed
    max_file_size = 1000000  # Skip files larger than 1MB
    api_url = f'https://api.github.com/repos/{owner}/{repo}/contents/{path}'

    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        items = response.json()

        # Handle single file response
        if not isinstance(items, list):
            items = [items]

        for item in items:
            if item['type'] == 'dir':
                contents.extend(
                    get_repo_contents(owner, repo, item['path'], headers, timeout=timeout)
                )
                continue

            if item['type'] != 'file':
                continue

            file_extension = item['name'].split('.')[-1] if '.' in item['name'] else ''
            if f".{file_extension}" not in allowed_extensions:
                print(f"Skipping non-code file: {item['path']}")
                continue

            if item['size'] == 0:
                continue

            if item['size'] > max_file_size:
                print(f"Skipping large file: {item['path']}")
                continue

            try:
                raw_response = requests.get(item['download_url'], headers=headers, timeout=timeout)
                raw_response.raise_for_status()
                contents.append({
                    'path': item['path'],
                    'content': raw_response.text
                })
            except Exception as e:
                # Best effort: report the failed file and keep crawling.
                print(f"Error fetching content for {item['path']}: {e}")

    except Exception as e:
        # Best effort: report the failed listing and return what was gathered.
        print(f"Error accessing {api_url}: {e}")

    return contents

def parse_github_lang_file(filename):
    """Map each repository name in a github_lang.txt-style file to its URL.

    A 'Repository: <name>' line selects the current repository; a following
    'Repository URL: <url>' line records the URL for it. URL lines seen
    before any repository line are ignored.
    """
    name_prefix = 'Repository: '
    url_prefix = 'Repository URL: '

    urls_by_repo = {}
    active_repo = None

    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry.startswith(name_prefix):
                active_repo = entry.replace(name_prefix, '')
                continue
            if active_repo and entry.startswith(url_prefix):
                urls_by_repo[active_repo] = entry.replace(url_prefix, '')

    return urls_by_repo

def extract_owner_repo(repo_url):
    """Return ``(owner, repo)`` taken from the last two path segments of a GitHub URL.

    Trailing slashes are ignored.
    """
    segments = repo_url.rstrip('/').rsplit('/', 2)
    owner = segments[-2]
    repo_name = segments[-1]
    return owner, repo_name

# Driver for the second notebook cell: match resume projects to GitHub
# repositories and dump the matching repositories' code into one text file.

# Load and read resume from PDF
pdf_loader = PyPDFLoader('Saurabh_resume.pdf')
resume_text = "\n".join(page.page_content for page in pdf_loader.load())

# Extract and save project names and descriptions using LangChain
# (writes 'proj_names.txt' and 'projects.txt')
project_names = extract_sections_with_langchain(resume_text)
print("Extracted project names:", project_names)

# Load GitHub API token
github_token = read_file('githubapi.txt')
headers = {
    'Authorization': f'token {github_token}',
    'Accept': 'application/vnd.github.v3+json'
}

# Re-read the names from the file that was just written
proj_names = get_proj_names('proj_names.txt')

# `github_user` is normally set by the first cell; ask for it when this cell
# is run on its own so the lookup below does not fail with a NameError.
if 'github_user' not in globals():
    github_user = input("Enter GitHub Username: ")

# Find the user's repositories whose names resemble the resume's project names
repo_info = get_repositories(github_user, headers, proj_names)
print("Available repositories:", list(repo_info))

# Create/open resume_proj_code.txt for writing
with open('resume_proj_code.txt', 'w', encoding='utf-8') as output_file:
    for repo in repo_info:
        # Start from a fresh list for every repository; accumulating across
        # repositories would write earlier repositories' files again.
        contents = get_repo_contents(repo['owner_login'], repo['repo_name'], headers=headers)

        # Write contents to file
        for file_info in contents:
            output_file.write(f"\n--- File: {file_info['path']} ---\n")
            output_file.write(file_info['content'])
            output_file.write("\n\n")

        output_file.write(f"\n{'='*80}\n")

print("Repository contents have been saved to resume_proj_code.txt")


In [None]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read API key from file
# (the file is expected to hold just the key; surrounding whitespace is stripped)
with open('openaiAPI.txt', 'r') as api_file:
    openai_api_key = api_file.read().strip()

# Initialize OpenAI model with lower-cost model
# `llm` is a module-level object used by generate_code_descriptions below.
llm = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-3.5-turbo",  # Lower-cost, token-friendly model
    temperature=0.3
)

def chunk_large_file(input_file, max_chunk_size=10000):
    """
    Read a text file and cut it into consecutive fixed-size pieces.

    Args:
        input_file (str): Path to the (UTF-8) file to split
        max_chunk_size (int): Maximum characters per chunk

    Returns:
        list: The pieces in file order; only the last one may be shorter.
        An empty file yields an empty list.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Cut purely by character count; line boundaries are not considered.
    starts = range(0, len(text), max_chunk_size)
    return [text[start:start + max_chunk_size] for start in starts]

def generate_code_descriptions(input_file, output_file):
    """
    Describe a large code dump chunk by chunk and write the results to a file.

    Each chunk of *input_file* is sent to the module-level `llm`; the answer,
    or the error message if the call fails, is recorded under a numbered
    heading in *output_file*.
    """
    analyst_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert code analyst. Provide a 5-sentence description of the code of project. "
                   "Mention the inferred tech stack."),
        ("human", "Analyze this code project:\n\n{code}")
    ])

    # prompt -> model -> plain string
    pipeline = analyst_prompt | llm | StrOutputParser()

    sections = []
    for number, piece in enumerate(chunk_large_file(input_file), start=1):
        try:
            summary = pipeline.invoke({"code": piece})
            sections.append(f"### Code Chunk {number} Description:\n{summary}\n")
        except Exception as err:
            # Keep going: a failed chunk is recorded instead of aborting the run.
            sections.append(f"### Code Chunk {number} Error: {str(err)}\n")

    with open(output_file, 'w', encoding='utf-8') as report:
        report.writelines(sections)

# Describe the code collected in 'resume_proj_code.txt' and write the result to 'code_desc.txt'.
generate_code_descriptions('resume_proj_code.txt', 'code_desc.txt')

In [None]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read API key from file
# (the file is expected to hold just the key; surrounding whitespace is stripped)
with open('openaiAPI.txt', 'r') as api_file:
    openai_api_key = api_file.read().strip()

# Initialize OpenAI model with lower-cost model
# `llm` is a module-level object used by the comparison functions below.
llm = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-3.5-turbo",  # Lower-cost, token-friendly model
    temperature=0.3
)

# Read a UTF-8 text file and return its lines (line endings kept) as a list
def read_file(file_path):
    """Return the lines of *file_path* as a list of strings, newlines included."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return list(handle)

# Prompt 1: Compare projects with code description
def compare_projects_with_code_desc(code_desc_text, projects_file):
    """
    Use LangChain to compare the resume's projects with the generated code description.

    The contents of *projects_file* and *code_desc_text* are both placed in
    the prompt, so the model sees the projects it is asked to verify.

    Args:
        code_desc_text (str): Full text of code_desc.txt.
        projects_file (str): Path to projects.txt.

    Returns:
        str or None: The model's comparison text, or None if the model call
        failed (the error is printed).
    """
    # Read content from projects.txt; joined so it can be embedded in the prompt.
    projects_text = ''.join(read_file(projects_file))

    # Define the prompt template for comparing with project descriptions
    projects_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert code analyst. Compare the content in projects.txt with the content in code_desc.txt. The code_desc.txt was generated by processing large code file in chunks, hence the chunk number is mentioned, so ignore all lines which have chunks mentioned in it.
                        Go through the whole file and print everything that is present in the projects.txt file as it is, then verify True/False whether the entire project matches the description of what's given in code_desc.txt. Save the final part in comparison_report.txt."""),
        # Both documents are passed as template variables, so braces inside
        # them are not interpreted as placeholders.
        ("human", "Contents of projects.txt:\n\n{projects}\n\n"
                  "Contents of code_desc.txt:\n\n{description}\n\n"
                  "Analyze this code description and verify the projects.")
    ])

    # Create output parser
    output_parser = StrOutputParser()

    # Combine prompt, model, and parser into a chain
    projects_chain = projects_prompt | llm | output_parser

    # Execute the LangChain chain for project comparison
    try:
        return projects_chain.invoke({
            "projects": projects_text,
            "description": code_desc_text,
        })
    except Exception as e:
        print(f"Error during project comparison: {str(e)}")
        return None

# Prompt 2: Compare skills and tech stack with code description
def compare_skills_with_code_desc_and_github_lang(code_desc_text, skills_file, github_lang_file):
    """
    Use LangChain to compare the resume's skills with the code description and GitHub languages.

    The contents of *skills_file*, *github_lang_file* and *code_desc_text*
    are all placed in the prompt, so the model sees every document it is
    asked to compare.

    Args:
        code_desc_text (str): Full text of code_desc.txt.
        skills_file (str): Path to skills.txt.
        github_lang_file (str): Path to github_lang.txt.

    Returns:
        str or None: The model's comparison text, or None if the model call
        failed (the error is printed).
    """
    # Read content from skills.txt and github_lang.txt; joined for embedding in the prompt.
    skills_text = ''.join(read_file(skills_file))
    github_langs_text = ''.join(read_file(github_lang_file))

    # Define the prompt template for comparing with skills and tech stack
    skills_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert code analyst. Compare the content in skills.txt with the tech stack in code_desc.txt and the languages in github_lang.txt. The code_desc.txt was generated by processing large code file in chunks, hence the chunk number is mentioned, so ignore all lines which have chunks mentioned in it.
                        Go through the whole file and print everything that is present in the skills.txt file as it is, then verify True/False whether the entire skills match the description of what's given in tech stack in code_desc.txt and languages in github_lang.txt. Append the final part in comparison_report.txt."""),
        # All documents are passed as template variables, so braces inside
        # them are not interpreted as placeholders.
        ("human", "Contents of skills.txt:\n\n{skills}\n\n"
                  "Contents of github_lang.txt:\n\n{github_langs}\n\n"
                  "Contents of code_desc.txt:\n\n{description}\n\n"
                  "Analyze this code description and verify the skills and tech stack.")
    ])

    # Create output parser
    output_parser = StrOutputParser()

    # Combine prompt, model, and parser into a chain
    skills_chain = skills_prompt | llm | output_parser

    # Execute the LangChain chain for skills and tech stack comparison
    try:
        return skills_chain.invoke({
            "skills": skills_text,
            "github_langs": github_langs_text,
            "description": code_desc_text,
        })
    except Exception as e:
        print(f"Error during skills and tech stack comparison: {str(e)}")
        return None

def generate_report(code_desc_file, projects_file, skills_file, github_lang_file, output_report_file):
    """
    Generate a report by processing the full description and comparing it with projects and skills.

    Args:
        code_desc_file (str): Path to the generated code description.
        projects_file (str): Path to the resume's project details.
        skills_file (str): Path to the resume's skills.
        github_lang_file (str): Path to the GitHub language listing.
        output_report_file (str): Path of the report to write (overwritten).

    The comparison helpers return None when the model call fails; in that case
    a short placeholder is written for that section instead of crashing after
    the report file has already been truncated.
    """
    failure_note = "Comparison failed; see the error message printed above."

    # Read the full code description from code_desc.txt
    code_desc_text = ''.join(read_file(code_desc_file))

    # Compare code description with projects
    comparison_report_projects = compare_projects_with_code_desc(code_desc_text, projects_file)
    if comparison_report_projects is None:
        comparison_report_projects = failure_note

    # Compare code description with skills and github languages
    comparison_report_skills = compare_skills_with_code_desc_and_github_lang(code_desc_text, skills_file, github_lang_file)
    if comparison_report_skills is None:
        comparison_report_skills = failure_note

    # Write the results to the output file
    with open(output_report_file, 'w', encoding='utf-8') as file:
        file.write("Projects Comparison:\n")
        file.write(comparison_report_projects)
        file.write("\n\nSkills and Tech Stack Comparison:\n")
        file.write(comparison_report_skills)

# Specify the input and output files directly
# Arguments, in order: code description, projects, skills, GitHub languages, output report.
generate_report('code_desc.txt', 'projects.txt', 'skills.txt', 'github_lang.txt', 'comparison_report.txt')
