In [None]:
## Installations

!pip install -q openai
!pip install -q langchain
!pip install -q unstructured
!pip install -q tiktoken
!pip install -q faiss-cpu
!pip install python-docx



In [None]:
## Get the openai API key
import os  # required for os.getenv below; it was previously missing (NameError on a fresh kernel)

import openai
from dotenv import load_dotenv, find_dotenv

# Read the local .env file (if one is found) into the process environment.
_ = load_dotenv(find_dotenv())

# os.getenv returns None when OPENAI_API_KEY is not defined.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
## Helpers to render the AI message as Markdown in the notebook and save its text to a Word (.docx) document

from IPython.display import display, Markdown
from docx import Document

def save_as_word_document(text, filename):
    """Create a new Word document holding `text` as one paragraph and save it to `filename`."""
    word_doc = Document()
    word_doc.add_paragraph(text)
    word_doc.save(filename)

def disp_markdown(text, filename="project_proposal_german.docx"):
    """Render `text` as Markdown in the notebook and save it to a Word document.

    Args:
        text: Markdown-formatted string to display and export.
        filename: Path of the .docx file to write. Defaults to
            "project_proposal_german.docx", the name that used to be hard-coded,
            so existing single-argument calls behave as before.
    """
    display(Markdown(text))
    # The raw text is exported unchanged; save_as_word_document adds it as one paragraph.
    save_as_word_document(text, filename)

In [None]:
import langchain
langchain.debug = False

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# --- Tunable settings -------------------------------------------------------
SOURCE_FILE = "raw_text.docx"
CHUNK_SIZE = 15000       # measured with len(), i.e. in characters
CHUNK_OVERLAP = 8500     # characters shared between neighbouring chunks
MODEL_NAME = "gpt-3.5-turbo-16k"
TEMPERATURE = 0.0
# NOTE(review): the saved output of the last cell ends mid-word, which suggests
# this cap truncates answers -- confirm whether that is intended.
MAX_TOKENS = 240

# Load the source document
loader = UnstructuredFileLoader(SOURCE_FILE)
raw_documents = loader.load()

# Split the text into overlapping chunks
splitter_options = dict(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(raw_documents)

# Embed the chunks and index them in a FAISS vector store
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)
docsearch = FAISS.from_documents(documents, embeddings)

# Chat model used to write the answers
llm = ChatOpenAI(
    model_name=MODEL_NAME,
    openai_api_key=openai.api_key,
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
)

# Retrieval QA chain of type "refine", backed by the FAISS retriever
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="refine",
    retriever=docsearch.as_retriever(),
)

In [None]:
# List of "questions": each entry is the German header of one proposal section
# followed by English instructions (in parentheses) on how to answer it.
# Entries are addressed by list index (shown in the comments below).
# NOTE(review): the Roman numbering jumps from VIII to X; there is no IX entry here.

# [0] I. Title of the R&D project
sections = [""" I. Titel des FuE-Vorhabens:
                  (Provide a succinct technical and numerical description and/or specifications of the project that captures its MAIN innovative core.

                    Bear in mind the following:

                    - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
                    - Aim for a detailed, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
                    - DO NOT INCLUDE THE SECTION HEADER.
                    - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
                    - Avoid mentioning information not directly related to the technical specifics and goals of the project.)""", # 220 -- NOTE(review): undocumented number; possibly a per-section token budget -- confirm

            # [1] Start and end dates of the R&D work
            """ Start der FuE-Arbeiten (TT.MM.JJJJ):
                 (Mention the start date of the research and development work.)

               Ende der FuE-Arbeiten (TT.MM.JJJJ):
                 (Mention the projected end date of the research and development work.)""" ,

            # [2] II. Research branch (main category / sub-category)
            """ II. Forschungszweig Hauptkategorie/Unterkategorie:
                  (Specify the main and sub-category of the research discipline your project falls under.)""",

            # [3] III. Content / technical details
            """
      III. Inhaltlich/Fachliche Angaben:
        ( Based on the provided context, formulate a cohesive paragraph detailing the current state of art/technology and its limitations relevant to the project's technical description.
         Highlight how the project extends or builds upon this state, showcasing its innovation and potential impact.
         Specify the project's goal, emphasizing its innovative core and associated technical terms.
         Detail the expected outcome or result, illustrating its contribution to advancing the field.

        Bear in mind the following:

          - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
          - Aim for a detailed, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
          - DO NOT INCLUDE THE SECTION HEADER.
          - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
          - Avoid mentioning information not directly related to the technical specifics and goals of the project.)""",

      # [4] IV. Description of the work
      """
      IV. Beschreibung der Arbeiten (alle Tätigkeitsformen: Eigenbetriebliche
          Forschung/Auftragsforschung/Kooperationsforschung)
          (Based on the provided context, formulate a cohesive paragraph detailing all the steps, tasks and methods that will be employed to accomplish the project's objective. Make sure you detail the tools and methodologies that will be used to implement each step.

            Bear in mind the following:

            - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
            - Aim for a succinct, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
            - DO NOT INCLUDE THE SECTION HEADER AND INTRODUCTORY SENTENCES.
            - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
            - Avoid mentioning information not directly related to the technical specifics and goals of the project.)""",


# [5] V. Novelty with respect to the relevant industry sector
"""   V. Zielt das Vorhaben auf ein Produkt, Produktionsverfahren, eine Produktionslinie, eine Dienstleistung oder eine wissenschaftliche Methodik ab, das/die eine deutliche Weiterentwicklung/Neuheit in Bezug auf den betreffenden Wirtschaftszweig darstellt?
          (In this section, your task is to:
          1. Specify the target sector/product category and explain how your project differentiates from the existing state of the art within that industry.
          2. Describe any new scientific/technical methods, approaches, or procedures that will be utilized in your project.
          3. Provide examples of experiments, field studies, observations, cohort studies, or any other relevant activities.
          4. Highlight approximately two parameters of differentiation to showcase how your project aims to surpass the state of the art in terms of innovation and advancement.

          Note: Remember to use specific details to describe your project and its differences from existing solutions.
          This will demonstrate the innovative nature of your project and show how it aligns with the FRASCATI criteria.
          Your responses should be detailed, , well-explained but straight to the point. Don't include the section header.
          Ensure your response MUST NOT EXCEED 800 CHARACTERS, and it must include the target parameters in the form of absolute value)""",

            # [6] VI. Relation to what already exists in the company
            """
      VI. Besteht ein konkreter Bezug des Vorhabens zu bestehenden Produkten, Produktionslinien, Produktionsverfahren, Dienstleistungen oder bereits etablierter wissenschaftlicher Methodik in Ihrem Unternehmen?
        (Based on the provided context:
          1. Describe the technologies, methods or solutions currently in use at the company that relate to the proposed project.
          2. Outline the unique functionality that distinguishes the proposed project from the existing solutions.
          3. Detail how the to-be-developed product, process, or service varies from those your company already offers.

        Bear in mind the following:

        - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
        - Aim for a succinct, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
        - DO NOT INCLUDE THE SECTION HEADER AND INTRODUCTORY SENTENCES.
        - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
        - Avoid mentioning information not directly related to the technical specifics and goals of the project.) """,

            # [7] VII. Scientific and/or technical risks
            """
      VII. Erläuterung der wissenschaftlichen und/oder technischen Risiken bei der Umsetzung des Vorhabens
          (Based on the provided context, formulate a cohesive paragraph explaining all potential risks - engineering, natural, social, or humanities - that could hinder the project.
           Describe scientific or technical uncertainties or challenges that might lead to failure.

        Bear in mind the following:

        - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
        - Aim for a succinct, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
        - DO NOT INCLUDE THE SECTION HEADER AND INTRODUCTORY SENTENCES.
        - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
        - Avoid mentioning information not directly related to the technical specifics and goals of the project.)""",

            # [8] VIII. Form of activity (in-house vs. contract research)
            """
      VIII. Tätigkeitsform:
          (In this section, your task is to articulate a comprehensive analysis of whether the activities and responsibilities of your project fall under in-house research and development, contract research, or both.

            For reference:
            - In-house research and development: refers to activities conducted by the staff of the organization itself, often implying that the organization has control over the research agenda, process, and results.
            - Contract research, on the other hand, refers to activities where an organization contracts another party (usually another research institution or a specialized research company) to carry out specific R&D tasks.
              This often happens when the commissioning organization lacks certain skills, resources, or capacity to perform those tasks internally.

            Note: Utilize specific technical terminology to emphasize the novel technical approaches.
            Your responses should be thorough, detailed, and well-explained responses that reveal the depth and breadth of our research activities.
            DO NOT INCLUDE THE SECTION HEADER. Ensure your response DOES NOT EXCEED 800 CHARACTERS and includes target parameters in absolute values.)""",

      # [9] X. Commissioned R&D work and its objectives
      """
      X. Beschreibung der in Auftrag gegebenen FuE-Arbeiten und der damit verbundenen Ziele
        (In this section, your task is to Describe the research and development work commissioned for the project and its objectives.

          Note: Utilize specific technical terminology to emphasize the novel technical approaches.
          Your responses should be thorough, detailed, and well-explained responses that reveal the depth and breadth of our research activities.
          DO NOT INCLUDE THE SECTION HEADER. Ensure your response DOES NOT EXCEED 800 CHARACTERS and includes target parameters in absolute values.)""",

            # [10] XI. Keywords
            """
      XI. Verschlagwortung
        Extract all the relevant technical and scientific keywords mentioned in the provided context. """,

            # [11] XII. Utilisation / exploitation of the results
            """
            XII. Verwertung
         (Based on the provided context, formulate a cohesive and succinct explanation of how the project's outcomes will be utilized and disseminated (to end users and the scientific community).

        Bear in mind the following:

        - Use specific technical language and must include any relevant parameters, numerical data, or statistics to accentuate new methodologies and intended results.
        - Aim for a succinct, comprehensive, and well-articulated response reflecting the breadth and depth of our research.
        - DO NOT INCLUDE THE SECTION HEADER AND INTRODUCTORY SENTENCES.
        - Keep your response within 800 CHARACTERS and incorporate target parameters in absolute values.
        - Avoid mentioning information not directly related to the technical specifics and goals of the project.)"""
]

In [None]:
# Build the query (general instructions + one section-specific "question") and run the chain
from langchain.prompts import PromptTemplate

# Index into `sections` of the section to generate.
# 3 selects "III. Inhaltlich/Fachliche Angaben", the section that used to be
# copy-pasted into the template text.
SECTION_INDEX = 3

# General instructions shared by every section; `{section}` is filled with one
# entry of `sections`.
# The former `{context}` placeholder was removed: this string is passed to the
# chain as the plain query and was never formatted, so the literal text
# "{context}" was sent along with the question. The retrieved documents are
# supplied through the retriever configured on `qa_chain`.
query_prompt = PromptTemplate.from_template("""
As a project manager in an R&D department, your task is to construct a comprehensive project description in German based on the provided German context.
The proposal should be innovative, align with the FRASCATI criteria, and be suitable for submission to the German tax R&D subsidy grant scheme.
The proposal should reflect the provided German context, encapsulating specific objectives, prospective impact, and significant advancements in technology, methodology, or application in the relevant field.

Respond in German while considering all the following points:

-  Preserve all critical information, including technical terms, dates, statistics, and numerical data from the context.
-  Emphasize the uniqueness and novelty of the project, highlighting its potential economic, societal, or environmental benefits.
-  Provide comprehensive and accurate answers, refraining from inventing information. Always refer back to the given context.
-  Utilize technical and scientific language whenever suitable, specifying the methodologies used.
-  Your responses should be to the point and in the form of paragraphs and MUST NOT EXCEED 800 CHARACTERS.

Question:
{section}
""")

# Fully formatted query string; kept under the original name `template` in case
# later cells refer to it.
template = query_prompt.format(section=sections[SECTION_INDEX])

# Run chain
result = qa_chain({"query": template})
result['result']

'Derzeit werden im Bereich der genetischen Gesundheitsbewertung von Haustieren vor allem Einzeltests und rassenspezifische Produkte eingesetzt. Diese Tests sind jedoch begrenzt und können nicht das gesamte genetische Markup eines Tieres untersuchen. Unser Projekt zielt darauf ab, diese Lücke zu schließen und eine innovative Methode einzuführen, die auf dem Next Generation Sequencing (NGS) basiert. Durch die Anwendung von NGS können wir das gesamte Genom eines Tieres analysieren und genetische Varianten wie Insertionen, Deletionen und Inversionen identifizieren. Unser Ziel ist es, für die Spezies Hund, Katze und Pferd ein umfassendes Testpaket zu entwickeln, das eine Vielzahl von Erbkrankheiten abdeckt. Wir streben an, mehr als 60 Prozent der bekannten Erbkrankheiten in jeder Spezies zu erfassen. Das Ergebnis wird ein Testkit sein, das es Tierbesitzern, Züchtern und Tierär'