In [1]:
from argostranslate import package, translate
from deep_translator import GoogleTranslator
import fitz 
from langdetect import detect
from translate import Translator
import nltk
import re
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate

from PyPDF2 import PdfReader
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from openai import AzureOpenAI

import chromadb
from chromadb.utils import embedding_functions

In [None]:
# Azure OpenAI connection settings; a later cell reads them to build the client.
# The API key and endpoint are deployment-specific: export them in the shell
# before starting the kernel. `setdefault` keeps a value that is already present
# and only falls back to the placeholder, so running this cell can no longer
# overwrite real credentials with the dummy strings.
os.environ.setdefault('OPENAI_API_KEY', 'OPENAI_API_KEY')
os.environ.setdefault('OPENAI_API_BASE', 'OPENAI_API_BASE')

# Non-secret settings that this notebook pins explicitly.
os.environ['OPENAI_API_TYPE'] = 'azure'
os.environ['OPENAI_API_VERSION'] = '2023-03-15-preview'

In [3]:
# Configuration for the persistent Chroma vector store used by this notebook.
## CHROMA_DATA_PATH is the directory handed to the client, EMBED_MODEL is the model name later passed to the
## sentence-transformers embedding function, and COLLECTION_NAME is the name of the document collection.
## Finally, a chromadb.PersistentClient is created for the data stored at CHROMA_DATA_PATH.

CHROMA_DATA_PATH = "turkish_document_chroma/"
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION_NAME = "turkish_language_doc"

client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [6]:
# Performs two tasks:
## 1. Builds the embedding function (SentenceTransformerEmbeddingFunction) for the model named in EMBED_MODEL.
## 2. Opens the collection named COLLECTION_NAME with that embedding function and cosine distance for the HNSW index.
##    get_or_create_collection is used instead of create_collection so the cell can be re-run: the client is
##    persistent, so on every run after the first the collection already exists and create_collection would fail.

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)



2024-02-10 13:14:56.608 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [4]:
# Re-open the persisted collection (e.g. after a kernel restart).
# The name comes from COLLECTION_NAME rather than a repeated literal, and the same
# sentence-transformers embedding function used at ingestion is supplied explicitly,
# so query text is embedded with EMBED_MODEL instead of whatever default the client applies.
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBED_MODEL
    ),
)

In [7]:
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated in page order."""
    with fitz.open(file_path) as pdf:
        # Collect each page's text while the document is still open.
        page_texts = [page.get_text() for page in pdf]
    return ''.join(page_texts)

def detect_language(text):
    """Return the result of ``detect`` for ``text`` (the notebook compares it against 'en')."""
    language = detect(text)
    return language

def chunk_text_nltk(text, max_chunk_size=5000):
    """
    Split ``text`` into sentence-aligned chunks of at most ``max_chunk_size`` characters.

    Sentences come from NLTK's ``sent_tokenize`` and are joined with a single
    space for as long as the running chunk stays below ``max_chunk_size``.

    Args:
        text: The text to split.
        max_chunk_size: Upper bound on the length of each returned chunk.

    Returns:
        A list of chunk strings. Empty when the tokenizer finds no sentences
        (the previous version raised ``IndexError`` on ``sentences[0]``).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    sentences = nltk.tokenize.sent_tokenize(text)
    chunks = []
    current_chunk = ''

    for sentence in sentences:
        if len(sentence) > max_chunk_size:
            # A single sentence that cannot fit in any chunk: flush what we
            # have and hard-split it, so no chunk breaks the size limit
            # (oversized chunks are skipped by the translation step).
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ''
            chunks.extend(
                sentence[start:start + max_chunk_size]
                for start in range(0, len(sentence), max_chunk_size)
            )
        elif not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + len(sentence) + 1 < max_chunk_size:
            # +1 accounts for the joining space.
            current_chunk += ' ' + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)  # Add the last chunk
    return chunks

def translate_text(text, dest_language='en'):
    """
    Translate ``text`` into ``dest_language`` with ``GoogleTranslator``.

    The text is split with ``chunk_text_nltk`` and each chunk is translated on
    its own; the source language is left to the translator (``source='auto'``).

    Args:
        text: The text to translate.
        dest_language: Target language code passed to the translator.

    Returns:
        The translated chunks concatenated, each followed by a single space.
        Blank or empty input returns ``''`` without calling the translator.
    """
    # Nothing to translate: skip tokenising and the remote call entirely.
    if not text or not text.strip():
        return ''

    max_request_chars = 5000  # per-chunk limit enforced before each request
    translator = GoogleTranslator(source='auto', target=dest_language)

    translated_parts = []
    for chunk in chunk_text_nltk(text):
        if len(chunk) > max_request_chars:
            print("Warning: A chunk exceeded the translation limit and was skipped.")
            continue
        translated = translator.translate(chunk)
        # NOTE(review): guards against a None result, which would otherwise
        # crash the string concatenation — confirm against the translator's contract.
        if translated is not None:
            translated_parts.append(translated)

    return ''.join(part + ' ' for part in translated_parts)


In [8]:
# Ingests one PDF into the vector store:
## 1. reads the PDF text;
## 2. translates it to English when the detected language is not English;
## 3. splits the text into 1000-character chunks with a 200-character overlap (length measured with len);
## 4. stores every chunk in the collection under a positional ID ("id0", "id1", ...).


pdf_path = "Muhendislikte_yapay_zeka_ve_uygulamalari.pdf"
text = read_pdf(pdf_path)

# Fail early with a clear message when the PDF yields no text (e.g. a scanned
# document), instead of an obscure error from language detection further down.
if not text.strip():
    raise ValueError(f"No extractable text found in {pdf_path!r}")


if detect_language(text) != 'en':
    text = translate_text(text)


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
chunks = text_splitter.split_text(text=text)


# One batched call instead of one call per chunk, and upsert instead of add so
# re-running the cell overwrites "id0".."idN" rather than colliding with the
# IDs stored by a previous run.
if chunks:
    collection.upsert(
        documents=chunks,
        ids=[f"id{i}" for i in range(len(chunks))],
    )

2024-02-10 13:37:42.903 INFO    backoff: Backing off send_request(...) for 0.3s (requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'Varolan bir bağlantı uzaktaki bir ana bilgisayar tarafından zorla kapatıldı', None, 10054, None)))


In [5]:
# Queries the collection with a natural-language question about the purposes of artificial intelligence.
## n_results=5 asks for the five best-matching entries.
## The response is stored in query_results and shown as the cell's output.


query_results = collection.query(
    query_texts=["Find me Purposes of Artificial Intelligence in the relevant content."],
    n_results=5,
)

query_results

{'ids': [['id27', 'id12', 'id11', 'id26', 'id13']],
 'distances': [[0.3116670846939087,
   0.3139141798019409,
   0.35517728328704834,
   0.387454628944397,
   0.4085800051689148]],
 'metadatas': [[None, None, None, None, None]],
 'embeddings': None,
 'documents': [['It is based on answering on a specific topic rather than answering all questions.\n(Akpınar, 2015). 1.5\nResearch Areas of Artificial Intelligence\nArtificial intelligence studies are not only in computer science, but also in games, automatic application of mathematical theorems.\nin proving, natural language understanding and translation processes, image processing, general\nin information systems, machine learning, knowledge-based systems, data mining, robotics\nIt is carried out in different areas such as (Kocaba¸s, 2013). Artificial use in different areas every day\nWe see that intelligence is used. 1.5.1\nGames\nGames such as chess, checkers and backgammon have been preferred by researchers since the early days of art

In [21]:
# Prepares the inputs for the chat completion:
## - context: the system-prompt template; its '{}' placeholder is filled with the retrieved passages later.
## - question: the user question, also used as the retrieval query.
## - good_reviews: the top 5 matches for the question, limited to their documents.
## - reviews_str: those documents joined into one comma-separated string.

# Written with explicit escapes so the template's newlines and trailing spaces are visible.
context = (
    "\n"
    "You are an artificial intelligence who loves helping people. "
    "Your goal is to provide accurate and consistent responses. "
    "When answering questions, use the following context: '{}'. "
    "Even if the content may not always be directly related to the question, \n"
    "strive to generate a meaningful response based on the provided context.  \n"
    "\n"
)

question = "\nFind me research areas of artificial intelligence.\n\n"

good_reviews = collection.query(
    query_texts=[question],
    include=["documents"],
    n_results=5,
)

# Index 0 selects the result list for the single query text.
retrieved_documents = good_reviews["documents"][0]
reviews_str = ",".join(retrieved_documents)



In [22]:
good_reviews

{'ids': [['id27', 'id42', 'id11', 'id26', 'id13']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['It is based on answering on a specific topic rather than answering all questions.\n(Akpınar, 2015). 1.5\nResearch Areas of Artificial Intelligence\nArtificial intelligence studies are not only in computer science, but also in games, automatic application of mathematical theorems.\nin proving, natural language understanding and translation processes, image processing, general\nin information systems, machine learning, knowledge-based systems, data mining, robotics\nIt is carried out in different areas such as (Kocaba¸s, 2013). Artificial use in different areas every day\nWe see that intelligence is used. 1.5.1\nGames\nGames such as chess, checkers and backgammon have been preferred by researchers since the early days of artificial intelligence.\nIt has become a popular area. In the beginning, we consider many solutions in a limited time.\nSystems that were bu

In [23]:
# Build the Azure OpenAI client from the connection settings stored in the environment.
llm_model = AzureOpenAI(
    azure_endpoint=os.environ['OPENAI_API_BASE'],
    api_key=os.environ['OPENAI_API_KEY'],
    api_version=os.environ['OPENAI_API_VERSION'],
)

# System message: the instruction template filled with the retrieved passages.
# User message: the question itself.
chat_messages = [
    {"role": "system", "content": context.format(reviews_str)},
    {"role": "user", "content": question},
]

good_review_summaries = llm_model.chat.completions.create(
    model="model_name",  # placeholder: set this to the Azure deployment name
    messages=chat_messages,
    temperature=0,
    n=1,
)

# Print the text of the first (and only, since n=1) completion.
first_choice = good_review_summaries.choices[0]
print(first_choice.message.content)

Research areas of artificial intelligence include:

1. Games: Artificial intelligence has been applied to games such as chess, checkers, and backgammon. Researchers have developed systems that use strategies based on experience and information to play these games.

2. Natural Language Understanding and Translation: Artificial intelligence is used to understand and translate natural language. This includes speech synthesis, speech understanding, and pattern recognition.

3. Image Processing: Artificial intelligence is used in image processing to analyze and interpret visual information. This includes tasks such as object recognition, image classification, and image generation.

4. Information Systems: Artificial intelligence is used in information systems to collect and organize information within a specific field of expertise. This includes the development of expert systems that provide answers and solutions in a specific domain.

5. Machine Learning: Machine learning is a subfield of 