In [2]:
import os
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Dict, Any
from dotenv import load_dotenv
from langchain_community.document_loaders import CSVLoader
from langchain.prompts import PromptTemplate


In [3]:
from dotenv import load_dotenv
import os

load_dotenv('C:\\Users\\TERENTI\\Desktop\\UNI\\LLM\\api_key.env')

api_key = os.getenv('API_KEY')


In [4]:
#initializing model 
model_name = 'gemini-2.0-flash' 
model_name2 = 'gemini-2.0-flash-lite' #if out of requests use this one


llm_model = ChatGoogleGenerativeAI(
    model=model_name,
    google_api_key=api_key,
    temperature=0.0,
    convert_system_message_to_human=True
)

print(f"Initialized Gemini Model: {model_name}")


Initialized Gemini Model: gemini-2.0-flash


In [5]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(r'C:\Users\TERENTI\Desktop\UNI\LLM\HW\HW2\General_Academic_Regulations_21.02.2025-eng.pdf')

In [6]:
docs = loader.load()
print(docs[0].page_content[2:])

proved by the Rector order N196, 05/07/2024  AC_GUIDE 005 General Academic Regulations | სასწავლო პროცესის ზოგადი რეგულაციები 
AC_GUIDE 005 
1 
 
 
    GENERAL ACADEMIC REGULATIONS


In [7]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

c_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=70
)

In [8]:
doc_split = r_splitter.split_documents(docs)
doc_split[0].page_content

'Approved by the Rector order N196, 05/07/2024  AC_GUIDE 005 General Academic Regulations | სასწავლო პროცესის ზოგადი რეგულაციები \nAC_GUIDE 005 \n1 \n \n \n    GENERAL ACADEMIC REGULATIONS'

In [9]:
# from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
#
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Load the BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [11]:
#tokenizer function for embedding



def get_embedding(text):
    #tokenize input and convert into tensor
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    #pass it to BERT
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [12]:
# Generate embeddings for each chunk
document_embeddings = [get_embedding(chunk.page_content) for chunk in doc_split]

In [13]:
document_embeddings[0]

array([-4.76507634e-01,  5.94461799e-01, -4.70275246e-02, -2.09089100e-01,
       -5.59368312e-01, -4.03583020e-01,  5.66022635e-01,  5.18967867e-01,
       -1.98110670e-01, -8.91329423e-02, -8.04711878e-01,  1.13803186e-02,
       -4.42805700e-03,  3.37806106e-01,  3.23058993e-01,  4.18296978e-02,
       -7.85032846e-03,  6.27574921e-01,  3.49347800e-01,  2.10068807e-01,
       -1.71922714e-01, -5.65401316e-01,  2.87322819e-01, -2.57753193e-01,
       -1.07053138e-01, -3.61914188e-02, -2.10021168e-01,  1.61077410e-01,
       -1.85637742e-01, -1.57904297e-01,  1.14258900e-01,  3.55533242e-01,
       -1.63850486e-01,  2.82093063e-02,  5.72541475e-01, -3.73378843e-02,
        6.01706266e-01, -3.36631238e-01,  4.54089850e-01,  4.84103784e-02,
        1.59115851e-01,  1.85927138e-01,  2.62305647e-01, -2.17892587e-01,
        1.62773252e-01, -3.72564614e-01, -3.08246708e+00, -2.24999696e-01,
       -3.10571015e-01, -2.15969294e-01,  4.61419255e-01, -5.21380186e-01,
        7.14956000e-02,  

In [14]:
import faiss

# Convert list of embeddings to a numpy array
document_embeddings = np.array(document_embeddings).astype('float32')

# Create a FAISS index
dimension_of_embeddings = document_embeddings.shape[1]  # Dimension of embeddings (e.g., 768 for BERT)
index = faiss.IndexFlatL2(dimension_of_embeddings)  # Use L2 distance for similarity search

# Add embeddings to the FAISS index
index.add(document_embeddings)

# Optionally, save the FAISS index to disk
faiss.write_index(index, 'document_embeddings.index')


In [15]:
# Assuming `query_text` is the user input or query you want to search for
query_text = "What is GPA?"

# Get the embedding for the query
query_embedding = get_embedding(query_text)

# Perform the search in FAISS to find the most similar document
k = 3  # Number of nearest neighbors to retrieve
distances, indices = index.search(np.array([query_embedding]), k)

# Output the retrieved document chunks based on the closest indices
for i in indices[0]:
    print(doc_split[i].page_content)  # document_chunks is the list of your original documents


AC_GUIDE 005  
 
1.
Approved by the Rector order N196, 05/07/2024  AC_GUIDE 005 General Academic Regulations | სასწავლო პროცესის ზოგადი რეგულაციები 
AC_GUIDE 005 
1 
 
 
    GENERAL ACADEMIC REGULATIONS
of the administration. If necessary, a specialist with relevant qualification/inclusive education may be invited; 8. The individual curriculum of the student is reviewed and approved by the Rector of the University upon the submission by the faculty/school.


In [16]:
# TEST TEST TEST

query = "What is Retrieval-Augmented Generation?"
query_vec = get_embedding(query)

chunk1 = "Retrieval-Augmented Generation (RAG) is a method where an LLM pulls info from external sources before generating an answer."
chunk2 = "This paper proposes a dataset for COVID-19 question answering tasks based on research papers."

chunk_vecs = [get_embedding(chunk1), get_embedding(chunk2)]

sims = cosine_similarity([query_vec], chunk_vecs)
print(sims)


[[0.5997536 0.862936 ]]


In [17]:
# loader
from langchain.document_loaders import PyPDFLoader
# chunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
# encoders
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
#
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# for generation we need llm for this we import
from langchain_openai import OpenAI




In [18]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [19]:
query_text = "What is RAG system?"

chunk1 = "Retrieval-Augmented Generation (RAG) is a method where an LLM pulls info from external sources before generating an answer."
chunk2 = "This paper proposes a dataset for COVID-19 question answering tasks based on research papers."

query_vec = model.encode(query_text)
chunk1_vec = model.encode(chunk1)
chunk2_vec = model.encode(chunk2)
sims = cosine_similarity([query_vec], [chunk1_vec, chunk2_vec])
print(sims)

[[0.47258613 0.01885617]]


In [20]:
query_text = "GPA IS academic scoring system"

chunk1 = "tells us how good student performs"
chunk2 = "academic things."

query_vec = model.encode(query_text)
chunk1_vec = model.encode(chunk1)
chunk2_vec = model.encode(chunk2)
sims = cosine_similarity([query_vec], [chunk1_vec, chunk2_vec])
print(sims)

[[0.37468135 0.35112286]]


In [21]:
# Generate embeddings for each chunk
document_embeddings_new = [model.encode(chunk.page_content) for chunk in doc_split]

In [22]:
import faiss

# Convert list of embeddings to a numpy array
document_embeddings_new = np.array(document_embeddings_new).astype('float32')

# Create a FAISS index
dimension_of_embeddings_new = document_embeddings_new.shape[1]  
index = faiss.IndexFlatL2(dimension_of_embeddings_new)  # Use L2 distance for similarity search

# Add embeddings to the FAISS index
index.add(document_embeddings_new)

# Optionally, save the FAISS index to disk
faiss.write_index(index, 'document_embeddings.index')


In [23]:
# Assuming `query_text` is the user input or query you want to search for
query_text = "3.1 GPA"

# Get the embedding for the query
query_embedding = model.encode(query_text)

# Perform the search in FAISS to find the most similar document
k = 10  # Number of nearest neighbors to retrieve
distances, indices = index.search(np.array([query_embedding]), k)

# Output the retrieved document chunks based on the closest indices
for i in indices[0]:
    print(doc_split[i].page_content)  # document_chunks is the list of your original documents


Article 16. Academic Assessment: Cumulative Weighted Average Score (GPA) 1. Students' academic performance in undergraduate educational programs is assessed via a cumulative weighted average score - GPA (grade point average). 2. The maximal value of a student GPA when studying for an educational program is 4.0. 3. The weighted GP score obtained by the student in the individual subject is calculated according to the following scheme:  Assessment of subject Weighted score (GP) 94-100 4.0 91-93 3.7 88-90 3.4 85-87 3.1 81-84 2.8 78-80 2.5 74-77 2.2 71-73 1.9 68-70 1.6 64-67 1.3 61-63 1.0 56-60 0.8 51-55 0.5  4. Within the educational program, the student GPA is calculated by dividing the sum of multiples of weighted scores for each subject corresponding to the obtained credits and the number of credits of the subject by the total number of credits obtained by the student:  ∑k(GPk ∗ C k ) GPA = ∑k Ck  where, K index indicates the number of the individual subjects; GPK - weighted score
indic

In [24]:
# get chunks what we need, works right
retrieved_chunks = [doc_split[i].page_content for i in indices[0]]
context = "\n\n".join(retrieved_chunks)
context

'Article 16. Academic Assessment: Cumulative Weighted Average Score (GPA) 1. Students\' academic performance in undergraduate educational programs is assessed via a cumulative weighted average score - GPA (grade point average). 2. The maximal value of a student GPA when studying for an educational program is 4.0. 3. The weighted GP score obtained by the student in the individual subject is calculated according to the following scheme:  Assessment of subject Weighted score (GP) 94-100 4.0 91-93 3.7 88-90 3.4 85-87 3.1 81-84 2.8 78-80 2.5 74-77 2.2 71-73 1.9 68-70 1.6 64-67 1.3 61-63 1.0 56-60 0.8 51-55 0.5  4. Within the educational program, the student GPA is calculated by dividing the sum of multiples of weighted scores for each subject corresponding to the obtained credits and the number of credits of the subject by the total number of credits obtained by the student:  ∑k(GPk ∗ C k ) GPA = ∑k Ck  where, K index indicates the number of the individual subjects; GPK - weighted score\n\n

In [25]:
#create additional chain to get the information outside of the RAG

from langchain.chains import LLMChain, SimpleSequentialChain

topic_prompt = PromptTemplate(
    input_variable=['question'],
    template ='go through a prompt step by step, understand a what users {question} for and deduce where the desired answer from embedding coulld be, რამდენია ზოგადი GPA მსოფლიოში უმაღლეს სასწავლებლებში'
)
chain_inference = LLMChain(llm=llm_model, prompt=topic_prompt)
# print(result)


# topic_prompt2=PromptTemplate(
#     input_variables=['correlation'],
#     template='based on the columns tell me which one of the columns has the heighest {correlation}, also tell me what is their correlation coefficent,Do not count and year correlation, it doesnt make any sense to do so'
# )
# chain_correlation = LLMChain(llm=llm_model, prompt=topic_prompt2)


topic_prompt3 = PromptTemplate(
    input_variables=['part'],
    template='add the your information to the answer {part}, that would gives us a better more through result'
)
chain_clean=LLMChain(llm=llm_model, prompt=topic_prompt3)


topic_prompt4 = PromptTemplate(
    input_variables=['text'],  # This must be 'text' for SimpleSequentialChain
    template='Write the following in Georgian, making it sophisticated: {text}'
)


chain_predict = LLMChain(llm=llm_model, prompt=topic_prompt4)
#Prediction chain
main_chain=SimpleSequentialChain(chains=[chain_inference,chain_clean, chain_predict])
output= main_chain.run(docs)
print(output)



  chain_inference = LLMChain(llm=llm_model, prompt=topic_prompt)
  output= main_chain.run(docs)


Here's the translation, aiming for a sophisticated and professional tone:

"გასაგებია. თქვენ გსურთ, რომ მოწოდებული დოკუმენტაციის სიმცირის მიუხედავად, საკუთარი ცოდნისა და მსჯელობის საფუძველზე, უფრო სრულყოფილი და სასარგებლო პასუხი მოგაწოდოთ.  ამიტომაც, წარმოგიდგენთ გაუმჯობესებულ პასუხს, რომელიც ეფუძნება წინა ანალიზს:

ეს დოკუმენტი აღწერს GPA-ს სისტემას ქუთაისის საერთაშორისო უნივერსიტეტში. მაქსიმალური GPA არის 4.0, და GPA გამოითვლება როგორც შეწონილი საშუალო ქულა, რაც ნაჩვენებია ფორმულით: `GPA = ∑k(GPk ∗ C k ) / ∑k Ck`. ეს ნიშნავს, რომ თითოეული ქულა (GPk) მრავლდება შესაბამის საკრედიტო საათებზე (Ck) ამ კურსისთვის, და ამ ნამრავლების ჯამი იყოფა საკრედიტო საათების საერთო რაოდენობაზე. შეფასების სკალა შემდეგია:

*   94-100: 4.0
*   91-93: 3.7
*   ... (დანარჩენი სკალა მოცემულია)
*   51-55: 0.5

სამწუხაროდ, ეს დოკუმენტი არ შეიცავს ინფორმაციას მსოფლიოს წამყვან უნივერსიტეტებში GPA-ს ზოგადი საშუალო მაჩვენებლების შესახებ. ამ ინფორმაციის მოსაძიებლად, თქვენ უნდა მიმართოთ რესურსებს, რომლებიც აგროვებენ მო

In [26]:
#agent to search we to know, other evalutation system outside of GPA and compare 
from langchain.agents import initialize_agent, Tool
from langchain.tools import DuckDuckGoSearchRun


search_tool = DuckDuckGoSearchRun()
tools = [
    Tool(
        name="DuckDuckGoSearch",
        func=search_tool.run,
        description="Search the web for up-to-date information"
    )
]



agent = initialize_agent(
    tools,
    llm_model,
    agent="zero-shot-react-description",
    verbose=True
)



response = agent.run("now search information what evaluation system is there other than GPA, and retrieve scoring system")
print(response)






[1m> Entering new AgentExecutor chain...[0m


  agent = initialize_agent(


[32;1m[1;3mI need to find information about alternative evaluation systems to GPA and different scoring systems used in education.
Action: DuckDuckGoSearch
Action Input: "alternative evaluation systems to GPA"[0m
Observation: [36;1m[1;3mPass/Fail Grading. Pass/fail grading boils outcomes down to two possibilities: pass or fail. That simplicity can bring relief to students, who no longer stress over letter-based rankings. Some might feel freer to experiment, unafraid of jeopardizing a GPA by exploring challenging subjects. On the flip side, pass/fail grading gives little nuance. Chart: Popularity trends of alternative grading systems from 2010 to 2025 Figure: Popularity trends of alternative grading systems in K-12 and higher education from 2010 to 2025. The chart below illustrates the rising adoption of alternative grading systems in both K-12 and higher education over the past 15 years. For schools interested in implementing any alternative grading systems, Beck recommends having



[32;1m[1;3mI need to find more information about different scoring systems.
Action: DuckDuckGoSearch
Action Input: "different scoring systems in education"[0m
Observation: [36;1m[1;3mThis is a list of grading systems used by countries of the world, primarily within the fields of secondary education and university education, organized by continent with links to specifics in numerous entries. Discover alternative grading systems like mastery-based learning, narrative feedback, and pass/fail evaluations. Learn how they improve student outcomes and their challenges in scaling nationwide. The Classroom Assessment Scoring System (CLASS®) is an observation instrument that assesses the quality of teacher-child interactions in center-based preschool classrooms. CLASS® includes three domains or categories of teacher-child interactions that support children's learning and development: Emotional Support, Classroom Organization, and Instructional Support. Within each domain are ... What is sta



[32;1m[1;3mI have gathered information about alternative evaluation systems to GPA and different scoring systems used in education. Here's a summary:

*   **Alternative Evaluation Systems to GPA:**
    *   **Pass/Fail Grading:** Simplifies outcomes to "pass" or "fail," potentially reducing student stress and encouraging exploration of challenging subjects. However, it lacks nuance.
    *   **Alternative Grading:** An umbrella term for grading methods differing from standard practices, often focusing on effort and learning rather than performance.
    *   **Mastery-Based Learning:** Focuses on the mastery of skills or standards for a specific subject.
    *   **Narrative Feedback:** Provides written feedback on student work instead of a grade.

*   **Different Scoring Systems:**
    *   **Standards-Based Grading (SBG):** Focuses on the effectiveness of instruction and the mastery of skills or standards for a specific subject.
    *   **Classroom Assessment Scoring System (CLASS®):** A