In [2]:
import numpy as np

In [3]:
from dotenv import load_dotenv
import os

In [4]:
load_dotenv()
EURI_API_KEY = os.getenv("EURI_API_KEY")

In [5]:
from euriai.langchain import EuriaiChatModel
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

chat_model = EuriaiChatModel(api_key= EURI_API_KEY, model = "gpt-4.1-nano", max_token = 10000)

In [6]:
message = [
    SystemMessage(content="You are expert in writing story on mix of language of English, Hindi, Hinglish. Generate long story of more than 15000 sentences."),
    HumanMessage(content='Generate story Artificial Intelligence.')
]

response = chat_model.invoke(message)


In [7]:
print(len(response.content))

3516


In [None]:
# Saving the generated data

file_path = "Data\Generated_data.txt"
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(response.content)

In [9]:
# Reading the data
with open(file_path, 'r', encoding='utf-8') as f:
    record = f.read()

In [10]:
len(record)

3516

In [None]:
# Reading first 500 characters
record[:500]

'Certainly! Here\'s a detailed and engaging story on Artificial Intelligence (AI), blending English, Hindi, and Hinglish, spanning a significant narrative length. Due to platform constraints, I will provide an extensive excerpt that captures the essence of a long story, which can be expanded further as needed.\n\n---\n\n**Title: "The Awakening of AI: Ek Nayi Duniya"**\n\n**Part 1: Shuruaat**\n\nIn a bustling city called TechnoVille, jahan har cheez digital thi, ek chhoti si startup ne AI ke field mein rev'

In [None]:
# Cleaning the data
import re
import unicodedata

def clean_data(raw_text:str):
    text = raw_text.lower() # lower case
    text = re.sub(r'http\S+|www\.\S+', '', text) # remove URLs
    text = re.sub(r'\S+@\S+', "",text) # Remove emails
    text = unicodedata.normalize("NFKD", text).encode('ascii', "ignore").decode('utf-8') # Normalize unicode
    text = re.sub(r"[^a-z0-9.,!?;:\s]","", text) # Remove unwanted characters (keep words, numbers, spaces, . , ? !)
    text = re.sub(r"\s+", " ", text).strip() # Remove any extra whitespace

    return text


In [22]:
clean_text = clean_data(record)
print("First 10 character of cleaned text:\n", clean_text[:100])
print('Length of cleaned text is:', len(clean_text))

First 10 character of cleaned text:
 certainly! heres a detailed and engaging story on artificial intelligence ai, blending english, hind
Length of cleaned text is: 3434


In [27]:
# Create Chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,      # Max size of each chunk
    chunk_overlap=100,   # Overlap between chunks (helps with context continuity)
    separators=["\n\n", "\n", " ", ""] # Order of splitting
)

text = text_splitter.split_text(clean_text)

In [31]:
# Print Chunks
text

['certainly! heres a detailed and engaging story on artificial intelligence ai, blending english, hindi, and hinglish, spanning a significant narrative length. due to platform constraints, i will provide an extensive excerpt that captures the essence of a long story, which can be expanded further as',
 'an extensive excerpt that captures the essence of a long story, which can be expanded further as needed. title: the awakening of ai: ek nayi duniya part 1: shuruaat in a bustling city called technoville, jahan har cheez digital thi, ek chhoti si startup ne ai ke field mein revolution laane ka sapna',
 'jahan har cheez digital thi, ek chhoti si startup ne ai ke field mein revolution laane ka sapna dekha. is startup ka naam tha neurotech, jiska founder tha rahul, ek young scientist with a passion for creating intelligent machines. rahul ne socha, agar hum ek aisi ai bana sakein jo sirf commands na',
 'creating intelligent machines. rahul ne socha, agar hum ek aisi ai bana sakein jo sirf c

In [None]:
# Length of the chunks

print("Lenghth of Chunks are:", len(text))

Lenghth of Chunks are: 17


In [41]:
# Create Embedding

import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {EURI_API_KEY}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding

In [49]:
final_embedding = []
meta = []
for i,chunk in enumerate(text):
    chunk_embedding = generate_embeddings(chunk)
    final_embedding.append(chunk_embedding)
    meta.append(str(i) +":"+ chunk)

In [50]:
final_embedding

[array([ 0.01167707,  0.00960523,  0.00235049, ..., -0.01019531,
         0.01685013,  0.02335414], shape=(1536,)),
 array([ 0.02035391, -0.00091336, -0.02541875, ...,  0.00060061,
         0.02518914,  0.0142761 ], shape=(1536,)),
 array([ 0.00736112, -0.02576708, -0.02298692, ...,  0.01986555,
         0.01359754,  0.00358578], shape=(1536,)),
 array([ 0.04813504, -0.00864908, -0.0038784 , ...,  0.01447858,
         0.02039136, -0.01109985], shape=(1536,)),
 array([ 0.01666179, -0.02293208, -0.02036581, ...,  0.01994864,
         0.04315884,  0.00223126], shape=(1536,)),
 array([ 0.0222692 , -0.02590546,  0.00906804, ...,  0.01022554,
         0.01987515,  0.00135583], shape=(1536,)),
 array([ 0.0323406 , -0.02361521, -0.01762075, ...,  0.03535483,
         0.01383598,  0.0184593 ], shape=(1536,)),
 array([ 0.03493476, -0.01525989, -0.02751658, ...,  0.02185669,
         0.02846631,  0.00123048], shape=(1536,)),
 array([ 0.05305647, -0.02608258, -0.02353765, ...,  0.01094788,
       

In [51]:
meta

['0:certainly! heres a detailed and engaging story on artificial intelligence ai, blending english, hindi, and hinglish, spanning a significant narrative length. due to platform constraints, i will provide an extensive excerpt that captures the essence of a long story, which can be expanded further as',
 '1:an extensive excerpt that captures the essence of a long story, which can be expanded further as needed. title: the awakening of ai: ek nayi duniya part 1: shuruaat in a bustling city called technoville, jahan har cheez digital thi, ek chhoti si startup ne ai ke field mein revolution laane ka sapna',
 '2:jahan har cheez digital thi, ek chhoti si startup ne ai ke field mein revolution laane ka sapna dekha. is startup ka naam tha neurotech, jiska founder tha rahul, ek young scientist with a passion for creating intelligent machines. rahul ne socha, agar hum ek aisi ai bana sakein jo sirf commands na',
 '3:creating intelligent machines. rahul ne socha, agar hum ek aisi ai bana sakein j

In [52]:
#stack embedding vertically
final_embedding = np.vstack(final_embedding)

In [53]:
final_embedding

array([[ 0.01167707,  0.00960523,  0.00235049, ..., -0.01019531,
         0.01685013,  0.02335414],
       [ 0.02035391, -0.00091336, -0.02541875, ...,  0.00060061,
         0.02518914,  0.0142761 ],
       [ 0.00736112, -0.02576708, -0.02298692, ...,  0.01986555,
         0.01359754,  0.00358578],
       ...,
       [ 0.05330226,  0.00286621,  0.03007843, ...,  0.02708594,
         0.02484796,  0.01826191],
       [-0.0161799 , -0.00786627, -0.05018006, ...,  0.00169473,
         0.0431464 , -0.00361003],
       [ 0.01118144,  0.01064405, -0.01510808, ..., -0.00103433,
         0.03014287,  0.00090609]], shape=(17, 1536))

In [54]:
print("Shape of Embedding is:", final_embedding.shape)

Shape of Embedding is: (17, 1536)


In [55]:
# storing EMbedding as float32
final_embedding = np.array(final_embedding, dtype='float32')

In [57]:
# Storing Embedding into FAISS VectorDB

import faiss
emb_dim = final_embedding.shape[1]

faiss.normalize_L2(final_embedding)
index = faiss.IndexFlatIP(emb_dim)
index.add(final_embedding)

In [59]:
index_path = "AI_story.faiss"
meta_path = "AI_story_metadata.jsonl"

faiss.write_index(index, index_path)

In [62]:
import json

with open(meta_path, 'w') as f:
    for item in meta:
        f.write(json.dumps(item) + '\n')

In [63]:
# Retrival Information

Query = 'what Vikram and Priya has developed?'

query_vec = generate_embeddings(Query).astype('float32').reshape(1,-1)
faiss.normalize_L2(query_vec)

In [64]:
index.search(query_vec,5)

(array([[0.48813105, 0.3840392 , 0.34011436, 0.2940322 , 0.29385787]],
       dtype=float32),
 array([[ 4,  3,  2, 11,  0]]))

In [65]:
# Checling the info from meta

meta[4]

'4:software engineer, vikram, ek robotics expert, aur priya, data scientist. unhone milkar ek advanced ai system develop karne ke liye dinraat mehnat ki. is ai ka naam tha eureka. part 2: the creation eureka ek neural network tha, jo human brain ki tarah learning karta. isne sirf data process karna'