In [1]:
# Always put this line on top before importing transformers/transformers-related packages.
# The cache location is read from the environment, so it has to be set before those imports run.
import os

os.environ.update(TRANSFORMERS_CACHE='/home/shared/ujjawal/hf_cache')

In [7]:
import re
import pandas as pd
from langchain import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
# Hugging Face checkpoint loaded below.
MODEL_NAME = "Qwen/CodeQwen1.5-7B-Chat"
# Device the tokenized inputs are moved to before calling generate();
# the model weights themselves are placed by device_map="auto".
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="../model")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir="../model",
    device_map="auto",
    torch_dtype="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Prompt used to make the model plan and then emit coding questions.
# Placeholders filled at format time: {topic}, {subtopic}, {concept}, {NO_QUES}.
# The template asks the model to introduce its question list with the marker
# COMPLEX_QUESTIONS, and ends by restating step 1 of the program
# (presumably to prime the model to start answering at that step).
# NOTE(review): the prompt text uses "it's" where "its" is meant; it is left
# untouched here because the string is sent verbatim to the model.
generation_instruct_template="""
topic:
{topic}

subtopic:
{subtopic}

concept:
{concept}

program:
- given a topic , it's subtopic and it's corresponding concept, generate a plan to generate challenging and tricky coding questions that can truly test the capability of experts from this field.
- execute each step of the plan and show your work.
- generate {NO_QUES} challenging and tricky BUT VALID coding questions on the given concept that can truly test the capability of experts from this field, and begin your questions with COMPLEX_QUESTIONS.

State each step of the program and show your work for performing that step. 

1: given a topic , it's subtopic and it's corresponding concept, generate a plan to generate challenging and tricky coding questions that can truly test the capability of experts from this field.
"""
generation_instruct_prompt = PromptTemplate.from_template(generation_instruct_template)


# Prompt used to request a Python solution for a single question.
# Placeholder filled at format time: {question}.
solution_prompt_template="""
Write code for the following question in Python.

question:
{question}

code:
"""
solution_prompt=PromptTemplate.from_template(solution_prompt_template)

### Testing on a single topic, subtopic, and concept

In [7]:
# Smoke test: request questions for one hand-picked (topic, subtopic, concept) triple.
per_topic_ques = 10
topic = 'Code generation / synthesis'
subtopic = 'Code generation (e.g., Text to Code)'
concept = 'Code synthesis'

prompt = generation_instruct_prompt.format_prompt(
    topic=topic,
    subtopic=subtopic,
    concept=concept,
    NO_QUES=per_topic_ques,
).to_string()
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content=prompt),
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=4096)
# Drop the leading prompt tokens of every sequence so only the completion is decoded.
generated_ids = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Step 1: Understand the topic, subtopic, and concept.

In this case, the topic is "Code generation / synthesis", the subtopic is "Code generation (e.g., Text to Code)", and the corresponding concept is "Code synthesis".

Step 2: Determine the challenges and tricky parts of code generation and synthesis.

Challenges include creating complex algorithms that can handle a wide range of input data and generate appropriate output. Tricky parts may include understanding language specifications, managing memory efficiently, and dealing with edge cases.

Step 3: Generate a plan to generate challenging and tricky coding questions.

1. Complex questions related to code generation and synthesis should be challenging and difficult to solve by experts in their field.
2. These questions should cover a variety of topics, such as natural language processing, machine learning, and robotics.
3. The questions should require advanced programming concepts and a deep understanding of algorithm design and opti

In [12]:


# Sample of a numbered question list, used to try out the parsing regex.
text = """
1. Develop a code editor with features such as syntax highlighting, auto-completion, and code folding.
2. Create a natural language translation system that can accurately translate a user's text from one language to another.
3. Implement a robot control system that can move a robot around a 2D environment using only text commands.
4. Develop a machine learning model that can classify a user's image as one of several categories.
5. Design a program that can generate a random sentence in a specified language using a given set of words.
6. Implement a function that can find the longest common subsequence between two strings.
7. Create a deep learning model that can classify a given set of images as cats or dogs.
8. Design a program that can generate a random password using a given set of characters and constraints.
9. Implement a function that can find the shortest path between two points in a grid.
10. Develop a code generator that can create a program that can generate the Fibonacci sequence up to a given number.
"""

# re.split with a capturing group returns the captured question text of every
# "<n>. <question>\n" line interleaved with whatever lies between the matches;
# filtering on str.strip discards the empty / whitespace-only in-between pieces.
questions = list(filter(str.strip, re.split(r"\d+\. (.*?)\n", text, flags=re.DOTALL)))

print(questions)

['Develop a code editor with features such as syntax highlighting, auto-completion, and code folding.', "Create a natural language translation system that can accurately translate a user's text from one language to another.", 'Implement a robot control system that can move a robot around a 2D environment using only text commands.', "Develop a machine learning model that can classify a user's image as one of several categories.", 'Design a program that can generate a random sentence in a specified language using a given set of words.', 'Implement a function that can find the longest common subsequence between two strings.', 'Create a deep learning model that can classify a given set of images as cats or dogs.', 'Design a program that can generate a random password using a given set of characters and constraints.', 'Implement a function that can find the shortest path between two points in a grid.', 'Develop a code generator that can create a program that can generate the Fibonacci seque

In [8]:
# Hand-curated questions fed one by one to the solution prompt.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single list element.
COMPLEX_QUESTIONS = [
    "Develop a code editor with features such as syntax highlighting, auto-completion, and code folding.",
    "Create a natural language translation system that can accurately translate a user's text from one language to another.",
    "Implement a robot control system that can move a robot around a 2D environment using only text commands.",
    "Develop a machine learning model that can classify a user's image as one of several categories.",
    "Design a program that can generate a random sentence in a specified language using a given set of words.",
    "Implement a function that can find the longest common subsequence between two strings.",
    "Create a deep learning model that can classify a given set of images as cats or dogs.",
    "Design a program that can generate a random password using a given set of characters and constraints.",
    "Implement a function that can find the shortest path between two points in a grid.",
    "Develop a code generator that can create a program that can generate the Fibonacci sequence up to a given number.",
]



In [9]:
# Ask the model for a Python solution to each curated question and print the reply.
for question in COMPLEX_QUESTIONS:
    print('question:\n',question)
    prompt = solution_prompt.format_prompt(question=question).to_string()
    messages = [
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content=prompt),
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=4096)
    # Drop the leading prompt tokens of every sequence so only the completion is decoded.
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print('code:',response)
    print('-'*100)

question:
 Develop a code editor with features such as syntax highlighting, auto-completion, and code folding.
code: As an AI model, I cannot directly create a code editor. However, I can provide guidance on how you can develop a code editor. 

1. Choose a text editor framework: You can choose a Python text editor framework like PyQt, wxPython or Tkinter.

2. Implement features:
   - Syntax highlighting: This involves creating a dictionary where you can map programming language keywords to their corresponding colors. You can use regular expressions to match words based on this dictionary and apply the color accordingly.
   - Auto-completion: You can use libraries like Pygments or Jedi for this. These libraries provide APIs to retrieve the valid words at a given position in a Python file.
   - Code folding: Implementing code folding involves keeping track of the indentation level of each line and determining whether they should be collapsed or not. You can use a tree structure to store 

In [10]:
def longest_common_subsequence(str1, str2):
    """Return one longest common subsequence of ``str1`` and ``str2`` as a string.

    A table of LCS lengths for every pair of prefixes is filled first, then the
    table is walked back from the bottom-right corner to recover the characters.
    When skipping a character from either input is equally good, the walk drops
    the character of ``str2``.
    """
    rows, cols = len(str1), len(str2)
    # lengths[r][c] = LCS length of str1[:r] and str2[:c]; row/column 0 is the empty prefix.
    lengths = [[0] * (cols + 1) for _ in range(rows + 1)]

    for r, left in enumerate(str1, start=1):
        for c, right in enumerate(str2, start=1):
            if left == right:
                lengths[r][c] = lengths[r - 1][c - 1] + 1
            else:
                lengths[r][c] = max(lengths[r - 1][c], lengths[r][c - 1])

    # Backtrack: characters are collected last-to-first.
    collected = []
    r, c = rows, cols
    while r and c:
        if str1[r - 1] == str2[c - 1]:
            collected.append(str1[r - 1])
            r, c = r - 1, c - 1
        elif lengths[r - 1][c] > lengths[r][c - 1]:
            r -= 1
        else:
            c -= 1

    collected.reverse()
    return ''.join(collected)

# Test the function
str1, str2 = "AGGTAB", "GXTXAYB"
print(longest_common_subsequence(str1, str2))  # Output: GTAB

GTAB


In [11]:
# Second check: here the shorter string is itself a subsequence of the longer one.
str1, str2 = "abcde", "ace"
print(longest_common_subsequence(str1, str2))

ace


### Testing on all the L1, L2 & L3 categories listed in the sheet

In [10]:
# Prompt used to make the model plan question "premises" across languages and
# software domains and then emit the questions.
# Placeholders filled at format time: {topic}, {subtopic}, {concept}, {NO_QUES}.
# The template asks the model to introduce its question list with the marker
# ###COMPLEX_QUESTIONS.
# NOTE(review): the prompt text contains typos ("Healcare", "it's" for "its");
# they are left untouched here because the string is sent verbatim to the model.
scenerio_template="""
topic:
{topic}

subtopic:
{subtopic}

concept:
{concept}

program:
- given a topic , it's subtopic and it's corresponding concept, generate a plan to create challenging and tricky technical coding questions premises across various programming languages (E.g Python,Java,SQL) relevant to various domains softwares (E.g Sports,Entertainment,Healcare,Finance,Agriculture) that can truly test the capability of experts of software development.
- execute each step of the plan and show your work.
- generate such {NO_QUES} challenging and tricky BUT VALID coding questions based on the premises you planned, and begin your questions with ###COMPLEX_QUESTIONS.

State each step of the program and show your work for performing that step. 

1: given a topic , it's subtopic and it's corresponding concept, generate a plan to create challenging and tricky technical coding questions premises across various programming languages (E.g Python,Java,SQL) relevant to various domains softwares (E.g Sports,Entertainment,Healcare,Finance,Agriculture) that can truly test the capability of experts of software development.

"""
scenerio_prompt = PromptTemplate.from_template(scenerio_template)

def generate_code_scenarios(model,tokenizer,per_topic_ques,input_file,output_file,scenerio_prompt):
    """Generate one model response per category row of a spreadsheet and save them.

    Args:
        model: causal LM exposing ``device`` and ``generate(...)``.
        tokenizer: tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        per_topic_ques: value substituted for ``NO_QUES`` in the prompt.
        input_file: path passed to ``pd.read_excel``; each row must provide the
            columns ``L1_Categories``, ``L2_Categories`` and ``L3_Categories``.
        output_file: path the result frame is written to with ``to_excel``.
        scenerio_prompt: prompt object exposing
            ``format_prompt(topic=, subtopic=, concept=, NO_QUES=).to_string()``.

    Returns:
        None. One row per input row is written to ``output_file`` with the three
        category columns plus a ``Scenario`` column holding the raw response.
    """
    records = pd.read_excel(input_file).to_dict('records')
    out_dict_list = []
    for record in records:
        topic = record['L1_Categories']
        subtopic = record['L2_Categories']
        concept = record['L3_Categories']
        prompt = scenerio_prompt.format_prompt(topic=topic,subtopic=subtopic,concept=concept,NO_QUES=per_topic_ques).to_string()
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Send the inputs to the device of the model that was passed in, rather
        # than to whatever a notebook-level `device` variable happens to hold.
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Forward everything the tokenizer produced (input_ids and attention_mask)
        # so generate() does not have to guess the attention mask.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=4096
        )
        # Drop the leading prompt tokens so only the completion is decoded.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        print('Generated Complex Question :\n',response)
        print('-'*100)
        out_dict_list.append({'L1_Categories':topic,'L2_Categories':subtopic,'L3_Categories':concept,'Scenario':response})

    out_df = pd.DataFrame.from_dict(out_dict_list)
    out_df.to_excel(output_file)
    

# Run scenario generation over the category sheet, asking for 20 questions per row.
per_topic_ques = 20
input_file = '/home/shared/ujjawal/Meta_Prompts_Generation/input/categories_L1L2L3_Sheet3.xlsx'
output_file = '/home/shared/ujjawal/Meta_Prompts_Generation/input/categories_L1L2L3_Sheet3_scenario_result.xlsx'
generate_code_scenarios(
    model=model,
    tokenizer=tokenizer,
    per_topic_ques=per_topic_ques,
    input_file=input_file,
    output_file=output_file,
    scenerio_prompt=scenerio_prompt,
)

Generated Complex Question :
 Step 1: Understand the problem and requirements

To create challenging and tricky technical coding questions, we need to know the topic, subtopic, and concept. In this case, the topic is "Code Generation", subtopic is "Code Synthesis", and the corresponding concept is "Code generation". The goal is to generate questions related to various programming languages and domains of software development, testing experts' coding skills.

Step 2: Gather and analyze data

The next step is to gather and analyze relevant data for creating coding questions. We need to know the available programming languages, software domains, and the types of coding problems that can be challenging and tricky.

Step 3: Create a list of programming languages and domains

After analyzing the data, we can create a list of popular programming languages and software domains related to the topic. For example, we can have Python, Java, and SQL as programming languages and Sports, Entertainmen

In [8]:
def _generate_chat_reply(model, tokenizer, user_prompt, max_new_tokens=4096):
    """Send ``user_prompt`` as a single-turn chat and return the decoded completion."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Use the device of the model that was passed in, not a notebook-level global.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Forward input_ids together with the attention mask produced by the tokenizer.
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    # Drop the leading prompt tokens so only the completion is decoded.
    completion_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]


def _extract_numbered_questions(response):
    """Return the text of every ``<n>. <question>`` line after the COMPLEX_QUESTIONS marker."""
    # The colon is optional: a response may introduce the list with the bare
    # marker. If the marker is missing, the whole response is scanned.
    questions_text = re.split(r'COMPLEX_QUESTIONS:?', response)[-1]
    # Line-anchored match: only numbered lines are kept (no stray prose), and the
    # final line is picked up even when the response has no trailing newline.
    return re.findall(r'^[ \t]*\d+\.[ \t]+(.+?)[ \t]*$', questions_text, flags=re.MULTILINE)


def generate_code_questions_solution_pair(model,tokenizer,per_topic_ques,input_file,output_file,generation_instruct_prompt,solution_prompt):
    """Generate questions for every category row, then a solution for every question.

    Args:
        model: causal LM exposing ``device`` and ``generate(...)``.
        tokenizer: tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        per_topic_ques: value substituted for ``NO_QUES`` in the question prompt.
        input_file: path passed to ``pd.read_excel``; each row must provide the
            columns ``L1_Categories``, ``L2_Categories`` and ``L3_Categories``.
        output_file: path the result frame is written to with ``to_excel``.
        generation_instruct_prompt: prompt object exposing
            ``format_prompt(topic=, subtopic=, concept=, NO_QUES=).to_string()``.
        solution_prompt: prompt object exposing
            ``format_prompt(question=).to_string()``.

    Returns:
        None. One row per (input row, extracted question) is written to
        ``output_file`` with the category columns plus ``Question`` and
        ``Solution``.
    """
    records = pd.read_excel(input_file).to_dict('records')
    out_dict_list = []
    for record in records:
        topic = record['L1_Categories']
        subtopic = record['L2_Categories']
        concept = record['L3_Categories']
        prompt = generation_instruct_prompt.format_prompt(topic=topic,subtopic=subtopic,concept=concept,NO_QUES=per_topic_ques).to_string()

        response = _generate_chat_reply(model, tokenizer, prompt)
        print('Generated Complex Question :\n',response)

        questions = _extract_numbered_questions(response)
        print('questions list :\n',questions)

        for question in questions:
            print('question:\n',question)
            prompt1 = solution_prompt.format_prompt(question=question).to_string()
            solution = _generate_chat_reply(model, tokenizer, prompt1)
            print('code:',solution)
            print('-'*100)
            out_dict_list.append({'L1_Categories':topic,'L2_Categories':subtopic,'L3_Categories':concept,'Question':question,'Solution':solution})

    out_df = pd.DataFrame.from_dict(out_dict_list)
    out_df.to_excel(output_file)
    

# Run question + solution generation over the category sheet, asking for 10 questions per row.
per_topic_ques = 10
input_file = '/home/shared/ujjawal/Meta_Prompts_Generation/input/categories_L1L2L3_Sheet3.xlsx'
output_file = '/home/shared/ujjawal/Meta_Prompts_Generation/input/categories_L1L2L3_Sheet3_result.xlsx'
generate_code_questions_solution_pair(
    model=model,
    tokenizer=tokenizer,
    per_topic_ques=per_topic_ques,
    input_file=input_file,
    output_file=output_file,
    generation_instruct_prompt=generation_instruct_prompt,
    solution_prompt=solution_prompt,
)

Generated Complex Question :
 Here's a plan to generate challenging and tricky coding questions related to the given topic and concept:

1. Identify a few related concepts to the given concept
2. Identify a few subtopics related to the given subtopic
3. Identify a few topics related to the given topic that cover the related concepts and subtopics
4. Collect examples of successful coding questions related to each related concept, subtopic, and topic
5. Identify areas where experts in the field can improve and where code generation can be more challenging
6. Create a list of challenging and tricky questions related to each concept, subtopic, and topic that can truly test the capability of experts from this field.
7. Begin generating coding questions based on the list of challenges and opportunities, and focus on creating questions that are both challenging and tricky.

Step 1: Identify related concepts, subtopics, and topics
The related concepts and subtopics related to the given concept

### Testing scenarios to generate technical coding questions

In [None]:
# Prompt used to request a Python solution for a single question.
# Placeholder filled at format time: {question}.
# NOTE(review): this rebinds solution_prompt_template / solution_prompt with the
# same template text that an earlier cell of this notebook already assigns to
# those names; running this cell does not change the prompt.
solution_prompt_template="""
Write code for the following question in Python.

question:
{question}

code:
"""
solution_prompt=PromptTemplate.from_template(solution_prompt_template)

In [16]:
# L3 task names iterated as `concepts` by the question/response generation sweep.
# The strings are inserted verbatim into the prompt, so they are kept exactly as written.
L3_TASKS = [
    "Code synthesis",
    "Code retreival",
    "Text to SQL",
    "Math programming",
    "Code snippets & examples",
    "Plot generation",
    "Generating bash commands",
    "Database query generation",
    "UI code generation",
    "Configuration file generation",
    "Completing a function",
    "Completing a class",
    "Code infilling",
    "Predicting next line of code",
    "Autocomplete code blocks",
    "Variable name suggestion",
    "Method signature completion",
    "Auto-generate test cases",
    "Completing HTML tags",
    "Syntax correction",
    "Summarizing a file / script / repository in a paragraph (or 5 bullets)",
    "Automatic commenting",
    "Minify code",
    "Extracting main features of code",
    "Generating code abstract",
    "Visual code summary",
    "Code compression techniques",
    "Summarizing changes in version control",
    "Documentation summarization",
    "Inline code summarization",
    "Code modification (refactoring)",
    "Code optimization",
    "Code simplification",
    "Code search - given a bit of code, search within it",
    "API Mining - e.g. help generating calls for APIs",
    "Redundancy Removal",
    "Converting loops to recursion",
    "Refactoring for readability",
    "Refactoring for performance",
    "Standardization of code format",
    "Linux/Mac/Windows common CLI tasks",
    "CLI package management",
    "Software development environment config",
    "Automated script generation",
    "Environment setup automation",
    "CLI shortcuts",
    "CLI for cloud management",
    "CLI tools for network troubleshooting",
    "Command line data processing",
    "Shell script optimization",
    "Package management (for all languages)",
    "Code repository management",
    "Integration with IDEs",
    "Build automation",
    "Dependency resolution",
    "Cross-platform compatibility checks",
    "Ecosystem migration tools",
    "Code sharing platforms",
    "Collaborative coding tools",
    "Real-time code synchronization",
    "Code translation (one from language to another)",
    "Cross-language API usage",
    "Legacy code modernization",
    "Interoperability solutions",
    "Scripting to compiled code conversion",
    "Automatic code localization",
    "Platform-specific adaptations",
    "Framework-specific code generation",
    "Code porting for different OS",
    "Multi-language code integration",
    "Writing a javadoc for this function",
    "Generating comments based on code logic",
    "Automatic inline comments",
    "Updating outdated comments",
    "Generating comments for algorithms",
    "Comment based on code complexity",
    "Summarizing logical blocks with comments",
    "Code annotation for review",
    "Extracting and commenting critical sections",
    "Tool-generated comment consistency check",
    "Creating a descriptive and useful commit text for a code commit",
    "Automatic commit classification",
    "Semantic commit messaging",
    "Code Commit message templates",
    "Version control integration for commit messages",
    "Multi-language commit support",
    "Code Commit summarization for changelogs",
    "Context-aware commit suggestions",
    "Feature-specific commit messages",
    "Code Commit message consistency checker",
    "Writing a docstring",
    "Extended documentation with examples",
    "API endpoint documentation",
    "Function parameter details",
    "Error handling documentation",
    "Performance notes",
    "Usage scenarios",
    "Deprecation notices",
    "Security implications",
    "Compatibility notes",
    "Basic usage scenario",
    "Advanced usage scenario",
    "Performance critical use case",
    "Error handling example",
    "Integration with other functions",
    "Cross-platform usage example",
    "Thread safety demonstration",
    "Usage with optional parameters",
    "Deprecation alternatives",
    "Common pitfalls and workarounds",
    "Endpoint description",
    "Parameter details",
    "Return values",
    "Authentication requirements",
    "Error codes explanation",
    "Sample request/response",
    "Versioning and compatibility",
    "Deprecation policy",
    "Rate limits",
    "Security guidelines",
    "Code repair",
    "Bug identification",
    "Code fix suggestions",
    "Defect detection",
    "Clone detection",
    "bugs in this code",
    "fixing the bugs in a code",
    " an error message/traceback",
    "fixing an error message <message> in a code",
    "Debugging Strategies/Tooling",
    "Unit Test Generation",
    "Testing Strategy (e.g. frameworks/guides)",
    "Automated regression testing",
    "Integration testing tools",
    "Continuous testing practices",
    "Load and performance testing",
    "Security penetration testing",
    "User acceptance testing",
    "Code coverage analysis",
    "Test data generation",
    "Deobfuscation",
    "Code classification",
    "Peer review automation",
    "Static code analysis",
    "Code style enforcement",
    "Security review integration",
    "Review metrics and dashboards",
    "Automated refactor suggestions",
    "Code smell detection",
    "Best practices checklist",
    "Identifying mistakes that allow for XSS injection",
    "SQL injection prevention",
    "Code audit for security vulnerabilities",
    "Encryption standards review",
    "Authentication mechanism review",
    "Access control checks",
    "Data privacy compliance",
    "Security best practices",
    "Third-party library security",
    "Secure coding training",
    "Code fuzzing",
    "Assertion Generation",
    "Automated code quality reports",
    "Performance profiling",
    "Memory leak detection",
    "Usability testing",
    "Cross-browser testing",
    "Mobile responsiveness testing",
    "Accessibility compliance",
    "Internationalization checks",
    "Parsing logs into structured templates",
    "Finding anomalies from raw logs",
    "Log event correlation",
    "Predictive log analysis",
    "Log-based alerting",
    "Real-time log monitoring",
    "Log archiving strategies",
    "Log data visualization",
    "User behavior analysis from logs",
    "Security incident detection through logs",
]

In [None]:
import json
import time

# Difficulty labels substituted for {LEVEL} in the prompt.
levels = ["high school student", "college student", "extreme difficult"]
# Target languages substituted for {LANGUAGE}; swap in ["Java"] for a single-language run.
languages = ["Python", "Java", "SQL"]
# Number of question/response pairs requested per prompt ({NoQ}).
per_concept_ques = 5

# Placeholders filled at format time: {NoQ}, {LANGUAGE}, {CONCEPT}, {LEVEL}.
prompt_template = """
Generate {NoQ} questions and their responses with example code in {LANGUAGE} language to identify,explain {CONCEPT}. The level of complexity has to be {LEVEL}.
"""
prompt = PromptTemplate.from_template(prompt_template)

def generate_questions_response_pairs(model,tokenizer,levels,languages,concepts,per_concept_ques,question_prompt=None,output_path="./output/results.json"):
    """Generate one response per (concept, level, language) and dump them as JSON.

    Args:
        model: causal LM exposing ``device`` and ``generate(...)``.
        tokenizer: tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        levels: difficulty labels substituted for ``LEVEL``.
        languages: language names substituted for ``LANGUAGE``; each name also
            becomes a key of the result records.
        concepts: concept names substituted for ``CONCEPT``.
        per_concept_ques: value substituted for ``NoQ``.
        question_prompt: prompt object exposing
            ``format_prompt(CONCEPT=, NoQ=, LANGUAGE=, LEVEL=).to_string()``.
            Defaults to the notebook-level ``prompt``.
        output_path: JSON file the records are written to; missing parent
            directories are created.

    Returns:
        None. ``output_path`` receives a JSON list with one record per
        (concept, level): ``{'L3': ..., 'Level': ..., <language>: <response>, ...}``.
    """
    import os  # local import so the function does not rely on another cell's imports

    if question_prompt is None:
        # Backward-compatible default: the template defined at notebook level.
        question_prompt = prompt

    dict_list=[]
    for concept in concepts:
        for level in levels:
            responses=[]
            for language in languages:
                t1=time.time()
                input_prompt=question_prompt.format_prompt(CONCEPT=concept,NoQ=per_concept_ques,LANGUAGE=language,LEVEL=level).to_string()
                print('input_prompt:',input_prompt)
                messages = [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content":  input_prompt}
                ]
                text = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )
                # Use the device of the model that was passed in, not a notebook-level global.
                model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

                # Forward input_ids together with the attention mask produced by the tokenizer.
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=4096
                )
                # Drop the leading prompt tokens so only the completion is decoded.
                generated_ids = [
                    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
                ]

                response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
                print('Generated Complex Question :\n',response)
                responses.append(response)
                t2=time.time()
                print('#'*100)
                print('Time taken for a L3 task for a single language for a single difficulty level:',t2-t1)
            # Key every response by the language that produced it, instead of
            # assuming exactly three responses in Python/Java/SQL order.
            dict1={'L3':concept,'Level':level,**dict(zip(languages,responses))}
            print('dict1:',dict1)
            print('-'*100)
            dict_list.append(dict1)

    # Make sure the target directory exists so the results are not lost at the
    # very end of a long run.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(dict_list, f)

# Sweep every L3 task across all configured difficulty levels and languages.
generate_questions_response_pairs(
    model=model,
    tokenizer=tokenizer,
    levels=levels,
    languages=languages,
    concepts=L3_TASKS,
    per_concept_ques=per_concept_ques,
)

input_prompt: 
Generate 5 questions and their responses with example code in Python language to identify,explain Code synthesis. The level of complexity has to be high school student.

Generated Complex Question :
 Here are 5 questions and their responses in Python with example code to demonstrate code synthesis:

1. What is code synthesis?
Code synthesis is the process of generating new programs from a set of given specifications.
Example code: 

```python
def synthesize_code(specifications):
    # Process the specifications to generate the code
    synthesized_code = ""
    for spec in specifications:
        synthesized_code += "def " + spec + ":\n\tpass\n"
    return synthesized_code

specifications = ["add(a, b)", "multiply(a, b)"]
print(synthesize_code(specifications))
```

2. What are the advantages of code synthesis?
Code synthesis has several advantages, including efficiency, flexibility, and adaptability.
Example code:

```python
def synthesize_advantages():
    advantages = 

In [None]:
# Columns: L3, Level-of-difficulty, Python, Java, SQL