In [None]:
import sys
# Add the parent directory to the import path; presumably this is what lets the
# `legacy_code_assistant` imports in the next cell resolve (TODO confirm the layout).
sys.path.append('../')

In [None]:
from pathlib import Path

import pandas as pd

from legacy_code_assistant.knowledge_base.knowledge_builder import KnowledgeBaseBuilder
from legacy_code_assistant.knowledge_base.knowledge_builder import CodeAnalyzer

In [None]:
# Root directory of the code base to analyse, relative to the current working directory.
path = Path() / '..' / '..' / 'Django-School-Management-System'
# Alternative input directory, currently disabled:
# path = Path() / '..' / 'test_code'


In [None]:
# Echo the configured directory so it can be checked before the recursive scan below.
path

In [None]:
# Collect every Python file below `path`. `rglob` is already recursive, so the
# pattern needs no leading '**/'. Sorting fixes the file order, which otherwise
# depends on the filesystem and would make positional look-ups on the derived
# data differ between machines and runs.
paths = sorted(path.rglob('*.py'))
ca = CodeAnalyzer(paths)
results = ca.analyze()
print(len(results))
results

In [None]:
# Tabulate the analysis results, record the length of each code snippet in a
# 'len' column and order the rows from longest to shortest snippet.
df = (
    pd.DataFrame(results)
    .assign(len=lambda frame: frame['code'].map(len))
    .sort_values('len', ascending=False)
)
df.head(10)

<h1>Generating docstrings / descriptions</h1>

In [None]:
import yaml
import os

# Read the credentials from a local YAML file.
# (The credentials file was shared on Discord.)
# NOTE(review): `yaml.load` with `FullLoader` is kept unchanged; `yaml.safe_load`
# would be the stricter choice if this file could ever come from an untrusted source.
with open("credentials.yaml", "r") as credentials_file:
    credentials = yaml.load(credentials_file, Loader=yaml.FullLoader)

# Export endpoint and key as environment variables. The Azure clients created
# later are built without explicit endpoint/key arguments, so presumably they
# read these variables.
for env_key in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY"):
    os.environ[env_key] = credentials[env_key]

In [None]:
from tqdm import tqdm
import re

from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage, AIMessage
from langchain.prompts import ChatMessagePromptTemplate, ChatPromptTemplate, AIMessagePromptTemplate, HumanMessagePromptTemplate


In [None]:
# Chat model used for docstring generation. No endpoint or API key is passed here;
# presumably they are taken from the AZURE_OPENAI_* environment variables set above.
model = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    # Deployment name is read from credentials.yaml.
    azure_deployment=credentials['Deployment_completion'],
)

In [None]:
# Keep every entry except whole modules; these are the rows that get a generated docstring.
# `.copy()` yields an independent frame, so adding columns to it later neither triggers
# pandas' SettingWithCopyWarning nor risks writing into a view of `df`.
df_docstringable = df.loc[df['type'] != 'module'].copy()
print(len(df_docstringable), '/', len(df))
df_docstringable.head()

In [None]:
# Count missing values per column before the 'type' and 'code' columns are
# formatted into prompts.
df_docstringable.isna().sum()

In [None]:
# Prompt asking the model for a docstring only, wrapped in a ```python fence and
# triple quotes so the reply can be unwrapped with regular expressions afterwards.
# This is a normal (non-raw) string, so every `\n` below is a real newline in the
# prompt text. `{type}` and `{code}` are template variables filled in per entry.
prompt = '''
Given the code of the {type} below, write a docstring for it.
Do not write anything other than the docstring, docstring should be the only output.
Start your answer with ```python\n"""\n and end it with """\n```
\n\n{code}\n\n
'''

# Disabled alternative: a pre-written start of the assistant's reply.
# prompt_ai = '''
# Docstring:\n\n
# ```python\n"""\n
# '''

# Single-message chat template built from the prompt string above.
chat_prompt_template = ChatPromptTemplate.from_template(prompt)
# ai_message = AIMessage(content=prompt_ai)

In [None]:
# Preview: format the prompt for the first docstringable row to inspect what the model will receive.
example = df_docstringable.iloc[0]
chat_prompt_template.format_prompt(type=example['type'], code=example['code'])

In [None]:
# One formatted prompt per docstringable entry, in the frame's row order.
prompts = [
    chat_prompt_template.format_prompt(type=entry_type, code=entry_code)
    for entry_type, entry_code in zip(df_docstringable['type'], df_docstringable['code'])
]

In [None]:
def _strip_docstring_wrapper(reply):
    """Remove the Markdown code fence and triple quotes wrapped around a model reply.

    The prompt asks the model to answer as a fenced block holding a
    triple-quoted docstring. This strips that wrapper from both ends and
    returns the text in between. The ``python`` language tag and the triple
    quotes are each optional, so a bare fence is unwrapped as well.

    Args:
        reply: Raw text returned by the chat model.

    Returns:
        The reply without the leading/trailing fence, quotes and blank lines.
        Text that does not start or end with a fence is left unchanged on
        that side.
    """
    reply = re.sub('^\n*```\n*(python)?\n*(""")?\n*', '', reply)
    return re.sub('\n*(""")?\n*```\n*$', '', reply)


docstrings = []

# The loop variable is deliberately not named `prompt`: that name holds the
# template string used to build `chat_prompt_template`, and overwriting it would
# break a re-run of the template cell.
for prompt_value in tqdm(prompts):
    result = _strip_docstring_wrapper(model(prompt_value.to_messages()).content)
    docstrings.append(result)

In [None]:
# Sanity check: this must equal len(df_docstringable), because the docstrings are
# attached to that frame positionally in the next cell.
len(docstrings)

In [None]:
# Attach the generated docstrings positionally (one per row, same order as `prompts`).
# `assign` returns a new frame instead of writing into the existing one, so this never
# modifies a frame that was sliced out of `df` (no SettingWithCopyWarning) and the cell
# gives the same result when re-run.
df_docstringable = df_docstringable.assign(generated_docstring=docstrings)
df_docstringable.head()

In [None]:
# Persist the full analysis table (all entry types, including modules) without the index column.
df.to_csv('knowledge_base.csv', index=False)

In [None]:
# Persist the non-module entries together with their generated docstrings;
# this file is read back further down in the notebook.
df_docstringable.to_csv('functions_with_generated_docstrings.csv', index=False)

<h1>Make vectorstore based on code</h1>

In [None]:
from legacy_code_assistant.knowledge_base.knowledge_builder import KnowledgeBaseBuilder
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter #-> to splits data to chunks




In [None]:
# Build an index over the 'code' column of `df` (a FAISS index, going by the method name).
# No `model` argument is given here, so whatever embedding model the builder defaults to
# is used. NOTE(review): confirm which one that is.
kbb = KnowledgeBaseBuilder(index_name='code_based_index')
kbb.initialize_faiss_based_on_df(df, text_column='code')

# Smoke test: run one query against the fresh index.
kbb.search('endpoint')

In [None]:
# Persist the code-based index built in the previous cell.
kbb.save_index()

<h1>Make vectorstore based on generated docstrings</h1>

In [None]:
from langchain.embeddings import AzureOpenAIEmbeddings

# Embedding model for the docstring-based index; the deployment name comes from credentials.yaml.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=credentials['Deployment_embeddings'],
    openai_api_version="2023-05-15",
)

In [None]:
# Build an index over the generated docstrings, embedded with the Azure OpenAI model.
# The 'generated_docstring' column exists only on `df_docstringable`; `df` never
# receives it, so the docstring index has to be built from `df_docstringable`.
kbb = KnowledgeBaseBuilder(index_name='docstring_based_index', model=embeddings)
kbb.initialize_faiss_based_on_df(df_docstringable, text_column='generated_docstring')

# Smoke test: run one natural-language query against the docstring index.
kbb.search('giving response to the user')

# PROMPT FOR TESTING

In [None]:
# Example Django test module, embedded verbatim in the test-generation prompt
# further down as a sample of the expected test style.
text_django_test = """from django.test import TestCase

from apps.corecode.models import (
    AcademicSession,
    AcademicTerm,
    SiteConfig,
    Subject,
)


class SiteConfigTest(TestCase):
    def test_siteconfig(self):
        site_config = SiteConfig.objects.create(key="akey", value="aname")
        self.assertEqual(str(site_config), "akey")


class AcademicSessionTest(TestCase):
    def test_academicsession(self):
        session = AcademicSession.objects.create(
            name="test session", current=True)
        self.assertEqual(str(session), "test session")


class AcademicTermTest(TestCase):
    def test_academicterm(self):
        term = AcademicTerm.objects.create(name="test Term", current=True)
        self.assertEqual(str(term), "test Term")


class SubjectTest(TestCase):
    def test_subject(self):
        subject = Subject.objects.create(name="a_subject")
        self.assertEqual(str(subject), "a_subject")"""

In [None]:
# Code snippet to generate a test for: the row at position 1 of `df`. In a top-to-bottom
# run `df` is still sorted by code length (descending) here, so this is the second-longest entry.
to_be_tested = df['code'].iloc[1]

In [None]:
# Reload the entries with generated docstrings from the CSV written earlier in this notebook.
# NOTE(review): this rebinds `df`, which until here held the full analysis table; earlier
# cells that read `df` will see this frame instead if they are re-run afterwards.
df = pd.read_csv('functions_with_generated_docstrings.csv')

In [None]:
import yaml
import os

# Same credential loading and model setup as in the docstring-generation section,
# repeated here (presumably so this section can be started without re-running it).
# (The credentials file was shared on Discord.)
# NOTE(review): `yaml.load` with `FullLoader` is kept unchanged; `yaml.safe_load`
# would be the stricter choice if this file could ever come from an untrusted source.
with open("credentials.yaml", "r") as credentials_file:
    credentials = yaml.load(credentials_file, Loader=yaml.FullLoader)

# Export endpoint and key as environment variables; the chat model below is built
# without explicit endpoint/key arguments, so presumably it reads them from there.
for env_key in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY"):
    os.environ[env_key] = credentials[env_key]

# Chat model used for test generation; the deployment name comes from credentials.yaml.
model = AzureChatOpenAI(
    azure_deployment=credentials['Deployment_completion'],
    openai_api_version="2023-05-15",
)

In [None]:
# Build the request text: the example test module serves as a style sample and the
# selected code snippet is the target the model should write a test for.
test_request = f"Having {text_django_test} as example test for django. Write me another for this one {to_be_tested}. Remebber to take it slow and focus on step by step approach to the problem. Quality over quantity. In the end merge all  little steps."
message = HumanMessage(content=test_request)

# Send the request as a single-message conversation and keep the model's reply.
output = model([message])

In [None]:
# Raw text of the model's reply (rendered as Markdown in the next cell).
output.content

In [None]:
from IPython.display import display, Markdown, Latex

# Render the model's reply as Markdown so code fences and headings are formatted.
display(Markdown(output.content))
# If you particularly want to display maths, this is more direct:
# (raw string: '\p' is not a valid escape sequence, and recent Python versions
# warn about it; the string value itself is unchanged)
display(Latex(r'\phi'))