In [1]:
import sys
sys.path.append('../')

In [2]:
from pathlib import Path
import yaml
import os

import pandas as pd

from legacy_code_assistant.knowledge_base.knowledge_builder import KnowledgeBaseBuilder
from legacy_code_assistant.knowledge_base.knowledge_builder import CodeAnalyzer

In [3]:
with open('credentials.yaml', "r") as f:
    credentials = yaml.load(f, Loader=yaml.FullLoader)

os.environ["AZURE_OPENAI_ENDPOINT"] = credentials['AZURE_OPENAI_ENDPOINT']
os.environ["AZURE_OPENAI_API_KEY"] = credentials['AZURE_OPENAI_API_KEY']

<h1>Extracting data from code</h1>

In [4]:
path = Path() / '..' / '..' / 'Django-School-Management-System'
# path = Path() / '..' / 'test_code'


In [5]:
paths = list(path.rglob('**/*.py'))
l = len(paths)
paths = [p for p in paths if 'migration' not in str(p)]
paths = [p for p in paths if 'test' not in str(p)]
print(f'Filtered {l - len(paths)} migrations')

ca = CodeAnalyzer(paths)
results = ca.analyze()
print(len(results))
results

Filtered 22 migrations
139


[{'name': 'main',
  'docstring': None,
  'code': 'def main():\n    os.environ.setdefault(\'DJANGO_SETTINGS_MODULE\', \'school_app.settings\')\n    try:\n        from django.core.management import execute_from_command_line\n    except ImportError as exc:\n        raise ImportError(\n            "Couldn\'t import Django. Are you sure it\'s installed and available on your PYTHONPATH environment variable? Did you forget to activate a virtual environment?"\n            ) from exc\n    execute_from_command_line(sys.argv)\n',
  'file': '../../Django-School-Management-System/manage.py',
  'module': 'manage',
  'parent': 'manage',
  'type': 'function'},
 {'docstring': "Django's command-line utility for administrative tasks.",
  'code': '"""Django\'s command-line utility for administrative tasks."""\nimport os\nimport sys\n\n\ndef main():\n    os.environ.setdefault(\'DJANGO_SETTINGS_MODULE\', \'school_app.settings\')\n    try:\n        from django.core.management import execute_from_command_line

In [6]:
df = pd.DataFrame(results)
df['len'] = df['code'].apply(lambda x: len(x))
df = df.sort_values('len', ascending=False)
df.head(10)

Unnamed: 0,name,docstring,code,file,module,parent,type,code_start_line,code_end_line,len
115,views,,from django.contrib import messages\nfrom djan...,../../Django-School-Management-System/apps/cor...,views,,module,0.0,,9495
70,views,,from django.contrib.auth.decorators import log...,../../Django-School-Management-System/apps/fin...,views,,module,0.0,,4093
22,views,,from django.contrib import messages\nfrom djan...,../../Django-School-Management-System/apps/res...,views,,module,0.0,,3621
135,settings,Django settings for school_app project.\n\nGen...,"""""""\nDjango settings for school_app project.\n...",../../Django-School-Management-System/school_a...,settings,,module,0.0,,3281
43,views,,import csv\nfrom django.contrib.auth.mixins im...,../../Django-School-Management-System/apps/stu...,views,,module,0.0,,3081
52,signals,,import csv\nimport os\nfrom io import StringIO...,../../Django-School-Management-System/apps/stu...,signals,,module,0.0,,2715
87,models,,from django.db import models\nfrom django.urls...,../../Django-School-Management-System/apps/fin...,models,,module,0.0,,2081
48,create_bulk_student,,"@receiver(post_save, sender=StudentBulkUpload)...",../../Django-School-Management-System/apps/stu...,signals,signals,function,,,2003
58,models,,from django.core.validators import RegexValida...,../../Django-School-Management-System/apps/stu...,models,,module,0.0,,1901
116,urls,,from django.urls import path\nfrom .views impo...,../../Django-School-Management-System/apps/cor...,urls,,module,0.0,,1890


In [7]:
df.isna().sum()

name                 0
docstring          125
code                 0
file                 0
module               0
parent              45
type                 0
code_start_line     94
code_end_line      139
len                  0
dtype: int64

In [8]:
len(df.loc[df['type'].isin(['function', 'method', 'class'])])

94

In [9]:
df.to_csv('knowledge_base.csv', index=False)

In [10]:
raise

RuntimeError: No active exception to reraise

<h1>Generating docstrings / descriptions</h2>

In [None]:
from legacy_code_assistant.knowledge_base.description_generator import CodeConditionedGenerator

In [None]:
generator = CodeConditionedGenerator(credentials_path='credentials.yaml', data_path='knowledge_base.csv')
generator.generate_docstrings()
generator.save_df('generated_docstrings.csv')

<h1>Make vectorstore based on code</h1>

In [None]:
from legacy_code_assistant.knowledge_base.knowledge_builder import KnowledgeBaseBuilder

In [None]:
kbb_code = KnowledgeBaseBuilder(index_name='code_based_index')
kbb_code.initialize_faiss_based_on_df(df, text_column='code')

kbb_code.search('change students results from their exams', k=10)

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name /home/laszer/.cache/torch/sentence_transformers/microsoft_codebert-base. Creating a new one with MEAN pooling.


[(Document(page_content="class Meta:\n    ordering = ['subject']", metadata={'docstring': None, 'code_start_line': 24, 'code_end_line': 25.0, 'file': '../../Django-School-Management-System/apps/result/models.py', 'module': 'models', 'name': 'Meta', 'parent': nan, 'type': 'class', 'len': 39}),
  30.11378),
 (Document(page_content="class Meta:\n    ordering = ['student', 'term']", metadata={'docstring': None, 'code_start_line': 21, 'code_end_line': 22.0, 'file': '../../Django-School-Management-System/apps/finance/models.py', 'module': 'models', 'name': 'Meta', 'parent': nan, 'type': 'class', 'len': 47}),
  32.57309),
 (Document(page_content="class Meta:\n    ordering = ['surname', 'firstname', 'other_name']", metadata={'docstring': None, 'code_start_line': 39, 'code_end_line': 40.0, 'file': '../../Django-School-Management-System/apps/students/models.py', 'module': 'models', 'name': 'Meta', 'parent': nan, 'type': 'class', 'len': 66}),
  38.27762),
 (Document(page_content="class Meta:\n   

In [None]:
kbb_code.save_index()

<h1>Make vectorstore based on generated docstrings</h1>

In [None]:
df = pd.read_csv('functions_with_generated_docstrings.csv')

In [None]:
from langchain.embeddings import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=credentials['Deployment_embeddings'],
    openai_api_version="2023-05-15",
)

In [None]:
embeddings.model

'text-embedding-ada-002'

In [None]:
kbb_docs = KnowledgeBaseBuilder(index_name='docstring_based_index', model=embeddings)
kbb_docs.initialize_faiss_based_on_df(df, text_column='generated_docstring')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
kbb_docs.search('definition of student object with results of their exams', k=10)[0][0]

Document(page_content="This view generates a summary of students' test and exam scores for the current session and term.\n\nUpon a GET request, it filters the Result model for objects that match the current\nsession and term from the request context. It then aggregates the scores for each student,\ngrouping their subjects and calculating the total test and exam scores.\n\nThe aggregated data includes:\n    - The list of subjects per student\n    - Total test score across all subjects for each student\n    - Total exam score across all subjects for each student\n    - The combined total of test and exam scores for each student\n\nThe view then constructs a context dictionary with this aggregated data under the key\n'results', which it passes to the 'result/all_results.html' template for rendering.\n\nThis view requires the user to be logged in, as it inherits from LoginRequiredMixin.", metadata={'docstring': nan, 'code': "class ResultListView(LoginRequiredMixin, View):\n\n    def get(se

In [None]:
kbb_docs.save_index()

<h1>RAG</h1>

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.vectorstores import FAISS


In [None]:
from langchain.globals import set_debug

set_debug(True)

In [None]:
def format_docs(docs):
    return [doc.page_content for doc in docs]

In [None]:
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=credentials['Deployment_embeddings'],
    openai_api_version="2023-05-15",
)

retriever = kbb_docs.get_retriever()

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_deployment=credentials['Deployment_completion'],
)

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

result = chain.invoke("Add a functionality that changes the studnet. Student might need up to 4 names, not just first and last.")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Add a functionality that changes the studnet. Student might need up to 4 names, not just first and last."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "input": "Add a functionality that changes the studnet. Student might need up to 4 names, not just first and last."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Add a functionality that changes the studnet. Student might need up to 4 names, not just first and last."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Add a functionality that changes the studnet. Student might need up to 4 names, not ju

In [None]:
kbb_docs.search('Add a functionality that changes the studnet. Student might need up to 4 names, not just first and last.', k=1)

[(Document(page_content="This migration class defines the initial database schema for a student management system.\n\nThe migration includes the creation of two models: `StudentBulkUpload` and `Student`.\n\n`StudentBulkUpload` model:\n- Contains fields for an auto-incrementing primary key `id`, the `date_uploaded` with automatic timestamping, and a `csv_file` field to store uploaded CSV files in a specified directory for bulk uploads.\n\n`Student` model:\n- Defines the student's attributes including a unique `registration_number`, personal information such as `surname`, `firstname`, `other_name`, `gender`, `date_of_birth`, and contact details like `parent_mobile_number` and `address`.\n- Includes `others` text field for additional information and `passport` image field for storing student photographs.\n- References a `current_class` which is a foreign key to the `StudentClass` model, allowing for a relation to the student's current class with the possibility of being null if not set.\n