### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [11]:
!pip install langchain-core

Collecting langchain-core
  Downloading langchain_core-0.2.41-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.112 (from langchain-core)
  Downloading langsmith-0.1.136-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3,>=1 (from langchain-core)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.112->langchain-core)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.112->langchain-core)
  Downloading orjson-3.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
Collecting requests<3,>=2 (from langsmith<0.2.0,>=0.1.112->langchain-core)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting requests-toolbelt<2.0.0,>=1.0.0 (from langsmith<0.2.0,>=0.1.112-

In [3]:
import pdfplumber
from docx import Document

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Extract text from PPTX (using python-pptx)
from pptx import Presentation

def extract_text_from_pptx(pptx_path):
    prs = Presentation(pptx_path)
    text = ''
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

# Extract relevant sections using keywords
def extract_sections(text, keywords):
    sections = {}
    for keyword in keywords:
        sections[keyword] = []
        for line in text.split("\n"):
            if keyword.lower() in line.lower():
                sections[keyword].append(line)
    return sections

In [6]:
# Load RFP and RFP Response
rfp_text = extract_text_from_pdf('rfp.pdf')
rfp_response_text = extract_text_from_pptx('rfp_response.pptx')

# Define sections to extract
keywords_rfp = ['Project Overview', 'Requirements', 'Data Integration', 'Ownership', 'Reporting']
keywords_response = ['Business Value', 'Data Mart', 'LLM', 'Reporting', 'Deliverables', 'Scope', 'Assumptions']

# Extract sections
rfp_sections = extract_sections(rfp_text, keywords_rfp)
response_sections = extract_sections(rfp_response_text, keywords_response)

# Check extracted sections
print("RFP Sections:", rfp_sections)
print("Response Sections:", response_sections)

RFP Sections: {'Project Overview': ['1 ADA FCCS Project Overview ............................................................................................................................................... 5', 'ADA FCCS – REQUIREMENTS AND DESIGN DOCUMENT 41 ADA FCCS PROJECT OVERVIEW'], 'Requirements': ['Requirements and Design Document', '2. Requirements & Design Elements .................................................................................................................................... 7', '3. Annexure Requirements & Design Phase .................................................................................................................... 21', 'ADA FCCS – REQUIREMENTS AND DESIGN DOCUMENT 2ADA FCCS – REQUIREMENTS AND DESIGN DOCUMENT 3Document Revision History & Approval', 'ADA FCCS – REQUIREMENTS AND DESIGN DOCUMENT 41 ADA FCCS PROJECT OVERVIEW', 'consolidation for statutory reporting and statutory and management reporting requirements of ADQ', 'ADQ Aviation re

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1]

# Map RFP and Response sections
for rfp_key, rfp_value in rfp_sections.items():
    for response_key, response_value in response_sections.items():
        similarity = calculate_similarity(' '.join(rfp_value), ' '.join(response_value))
        print(f'Similarity between {rfp_key} and {response_key}: {similarity:.2f}')

Similarity between Project Overview and Business Value: 0.00
Similarity between Project Overview and Data Mart: 0.07
Similarity between Project Overview and LLM: 0.03
Similarity between Project Overview and Reporting: 0.00
Similarity between Project Overview and Deliverables: 0.00
Similarity between Requirements and Business Value: 0.01
Similarity between Requirements and Data Mart: 0.18
Similarity between Requirements and LLM: 0.14
Similarity between Requirements and Reporting: 0.00
Similarity between Requirements and Deliverables: 0.00
Similarity between Data Integration and Business Value: 0.00
Similarity between Data Integration and Data Mart: 0.28
Similarity between Data Integration and LLM: 0.16
Similarity between Data Integration and Reporting: 0.00
Similarity between Data Integration and Deliverables: 0.00
Similarity between Ownership and Business Value: 0.00
Similarity between Ownership and Data Mart: 0.07
Similarity between Ownership and LLM: 0.08
Similarity between Ownership

In [22]:
import oci
import pdfplumber
from pptx import Presentation

# Setup basic variables
compartment_id = "ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q"
CONFIG_PROFILE = "DEFAULT"
config = oci.config.from_file('config', CONFIG_PROFILE)

# Service endpoint
endpoint = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"

generative_ai_inference_client = oci.generative_ai_inference.GenerativeAiInferenceClient(config=config, service_endpoint=endpoint, retry_strategy=oci.retry.NoneRetryStrategy(), timeout=(10,240))

# Function to extract text from PDF (RFP)
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to extract text from PPTX (RFP response)
def extract_text_from_pptx(pptx_path):
    prs = Presentation(pptx_path)
    text = ''
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

# Load and extract text from RFP and RFP Response
rfp_text = extract_text_from_pdf('rfp.pdf')
rfp_response_text = extract_text_from_pptx('rfp_response.pptx')

# Concatenate the two documents for context
combined_input = f"RFP Document:\n{rfp_text}\n\nRFP Response Document:\n{rfp_response_text}"

# Function to get response from Generative AI API
def ask_question(question, input_text):
    chat_detail = oci.generative_ai_inference.models.ChatDetails()

    # Create chat request
    chat_request = oci.generative_ai_inference.models.CohereChatRequest()
    chat_request.message = f"{question}\n\nContext:\n{input_text}"
    chat_request.max_tokens = 600
    chat_request.temperature = 0.25
    chat_request.frequency_penalty = 0
    chat_request.top_p = 0.75
    chat_request.top_k = 0

    # Set chat details and compartment info
    chat_detail.serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id="ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceya7ozidbukxwtun4ocm4ngco2jukoaht5mygpgr6gq2lgq")
    chat_detail.chat_request = chat_request
    chat_detail.compartment_id = compartment_id

    # Send request and get response
    chat_response = generative_ai_inference_client.chat(chat_detail)

    # Extract the message from the chat history
    chat_history = chat_response.data.chat_response.chat_history
    if chat_history:
        return chat_history[-1].message  # Return the last message from the chatbot

    return "No response received."

# Example question to ask based on RFP and RFP Response
question = "Come up with a process flow diagram suitable for the application that has been asked by the client?"
response = ask_question(question, combined_input)

print("**************************Chat Result**************************")
print(response)

**************************Chat Result**************************
Here is a suggested process flow diagram for the implementation of the Oracle Data Mart and Analytics platform with an LLM-powered Oracle Chatbot:

## High-Level Process Flow:

### 1. Project Initiation:
- Kick-off meeting with ADA and Oracle teams.
- Define project scope, objectives, and timelines.
- Assign roles and responsibilities.

### 2. Data Mart Development:
- Source data extraction from FCCS, Procurement, and HR systems.
- Data cleansing and validation.
- Data modeling and transformation.
- Create data marts for FCCS, ADA, EYE, AMMROC, and GAL.

### 3. LLM Model Training:
- Select and fine-tune LLM models.
- Train LLM models on ADA's data and use cases.
- Test and validate LLM performance.

### 4. Oracle Digital Assistant (ODA) Setup:
- Integrate ODA with LLM models.
- Configure ODA chat interface and avatar system.
- Train ADA team on ODA usage.

### 5. Testing and User Acceptance:
- Conduct unit testing, integra

In [23]:
import oci
import pdfplumber
from pptx import Presentation

# Setup basic variables
compartment_id = "ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q"
CONFIG_PROFILE = "DEFAULT"
config = oci.config.from_file('config', CONFIG_PROFILE)

# Service endpoint
endpoint = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"

generative_ai_inference_client = oci.generative_ai_inference.GenerativeAiInferenceClient(config=config, service_endpoint=endpoint, retry_strategy=oci.retry.NoneRetryStrategy(), timeout=(10,240))

# Function to extract text from PDF (RFP)
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to extract text from PPTX (RFP response)
def extract_text_from_pptx(pptx_path):
    prs = Presentation(pptx_path)
    text = ''
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

# Load and extract text from RFP and RFP Response
rfp_text = extract_text_from_pdf('rfp.pdf')
rfp_response_text = extract_text_from_pptx('rfp_response.pptx')

# Concatenate the two documents for context
combined_input = f"RFP Document:\n{rfp_text}\n\nRFP Response Document:\n{rfp_response_text}"

# List of questions to ask (tailored for requirements gathering document)
questions = [
    "What is the purpose and scope of this project as outlined in the RFP?",
    "What are the main business objectives that this project aims to achieve for Abu Dhabi Aviation Holding?",
    "What are the key functional requirements that the solution must fulfill, including features and modules needed for reporting and consolidation?",
    "What are the key non-functional requirements, such as performance, scalability, availability, and maintainability of the system?",
    "Can you describe the overall solution architecture, including the key components, integrations, and data flows involved in this project?",
    "What are the specific integration requirements between Oracle FCCS and external systems like Great Plains, Rusada, and others?",
    "What is the data management strategy, including data storage, retention, and backup policies for this project?",
    "What are the data security protocols, including user access control, encryption methods, and audit logging mechanisms?",
    "What are the automation and workflow requirements for processes like data consolidation, variance analysis, and reporting?",
    "What are the specific reporting requirements, including the use of SmartView, dashboards, and other tools for ad-hoc reporting?",
    "What are the assumptions and constraints identified for this project, including dependencies on other systems and limitations of the current environment?",
    "What are the key deliverables and milestones for this project, including UAT, go-live support, and post-implementation support?"
]

# Function to ask multiple questions and get responses
def ask_multiple_questions(questions, input_text):
    chat_detail = oci.generative_ai_inference.models.ChatDetails()
    chat_responses = []
    
    for question in questions:
        # Create chat request for each question
        chat_request = oci.generative_ai_inference.models.CohereChatRequest()
        chat_request.message = f"{question}\n\nContext:\n{input_text}"
        chat_request.max_tokens = 600
        chat_request.temperature = 0.15  # Lower temperature for more technical responses
        chat_request.frequency_penalty = 0
        chat_request.top_p = 0.85  # Bias towards more likely words to maintain technical tone
        chat_request.top_k = 0

        # Set chat details and compartment info
        chat_detail.serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id="ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceya7ozidbukxwtun4ocm4ngco2jukoaht5mygpgr6gq2lgq")
        chat_detail.chat_request = chat_request
        chat_detail.compartment_id = compartment_id

        # Send request and get response
        chat_response = generative_ai_inference_client.chat(chat_detail)

        # Extract the message from the chat history
        chat_history = chat_response.data.chat_response.chat_history
        if chat_history:
            response_message = chat_history[-1].message
            chat_responses.append(f"Question: {question}\nResponse: {response_message}\n")

    return chat_responses

# Ask the list of questions
responses = ask_multiple_questions(questions, combined_input)

# Print each response
for response in responses:
    print("**************************Chat Result**************************")
    print(response)

**************************Chat Result**************************
Question: What is the purpose and scope of this project as outlined in the RFP?
Response: The purpose of this project, as outlined in the RFP, is to implement a consolidation and reporting solution for Abu Dhabi Aviation Holding, leveraging Oracle EPM Cloud FCCS. The scope of the project includes:

- Enabling consolidation for statutory and management reporting, including actual, budget, forecast, and commentary scenarios.
- Implementing the Abu Dhabi Aviation Holding entity and ownership structure.
- Automating data integrations and loads for all entities.
- Enabling SmartView-based reporting packs.
- Configuring standard out-of-box ratios and KPIs, as well as additional ratios for operational reporting.
- Setting up ownership management and consolidations for statutory and management reporting.
- Developing supplemental data forms and managing intercompany data.
- Providing training and support for the use of dashboards 

In [50]:
import re
from docx import Document
from docx.shared import Inches
import oci
import io


compartment_id = "ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q"
config = oci.config.from_file('config', 'DEFAULT')
object_storage_client = oci.object_storage.ObjectStorageClient(config)

def extract_question_and_response(combined_text):
    question_match = re.search(r'Question:\s*(.+?)\n', combined_text)
    response_match = re.search(r'Response:\s*(.+)', combined_text, re.S)
    
    if question_match and response_match:
        question = question_match.group(1)
        response = response_match.group(1).strip()
        return question, response
    return None, None


def extract_keywords(question):
    keywords = re.findall(r'\b(purpose and scope|business objectives|functional requirements|non-functional requirements|solution architecture|data management|security|reporting requirements|automation|key deliverables)\b', question, re.I)
    return ' '.join([kw.capitalize() for kw in keywords]) if keywords else 'General Section'


project_title = "Speak Mate"
for paragraph in doc.paragraphs:
    if '<Project title>' in paragraph.text:
        paragraph.text = paragraph.text.replace('<Project title>', project_title)


def insert_after_page_3(doc, responses):
    paragraphs = list(doc.paragraphs)
    insert_after = None
    for i, paragraph in enumerate(paragraphs):
        if "Customer Experience" in paragraph.text:
            insert_after = i + 1
            break

    if insert_after:
        for response in responses:
            question, answer = response.split('\nResponse: ', 1)
            heading = extract_keywords(question)
            doc.paragraphs.insert(insert_after, doc.add_heading(heading, level=2))
            doc.paragraphs.insert(insert_after + 1, doc.add_paragraph(answer))
            insert_after += 2
            
# Ensure no overlap with footer by adjusting bottom margin
def ensure_no_footer_overlap(doc):
    for section in doc.sections:
        section.bottom_margin = Inches(1)
        section.top_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)
        
        
# Load the requirements template
template_path = 'requirements_template.docx'
doc = Document(template_path)

# Insert responses between the designated pages
insert_after_page_3(doc, responses)
ensure_no_footer_overlap(doc)

# Create a BytesIO stream to hold the document in memory
doc_stream = io.BytesIO()
doc.save(doc_stream)
doc_stream.seek(0)

# Upload the in-memory document to Object Storage
def upload_to_object_storage_from_memory(bucket_name, object_name, doc_stream):
    object_storage_client.put_object(
        namespace_name=namespace_name,
        bucket_name=bucket_name,
        object_name=object_name,
        put_object_body=doc_stream
    )
    print(f"File {object_name} uploaded to Object Storage in bucket {bucket_name}.")

# Define your Object Storage bucket and namespace here
bucket_name = "ECHO"
namespace_name = "gc35013"
folder_path = "PROJECT_SPEAK_MATE/DELIVERY_DOCS"

object_name = f"{folder_path}/updated_requirements_document.docx"

upload_to_object_storage_from_memory(bucket_name,object_name,doc_stream)

File PROJECT_SPEAK_MATE/DELIVERY_DOCS/updated_requirements_document.docx uploaded to Object Storage in bucket ECHO.
