In [None]:
import pandas as pd
from datasets import load_dataset
import json
import os
import sys
import boto3
from botocore.client import Config
import faiss

In [None]:
# Titan Embeddings model, called through LangChain, to generate embeddings of the query
from langchain_community.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# for Data Ingestion
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

# For vector embedding and the vector store (using FAISS for embeddings)
from langchain.vectorstores import faiss
from langchain_community.vectorstores import FAISS


In [None]:
# Fetch the "hugginglearners/netflix-shows" dataset and keep its 'train'
# split as a pandas DataFrame.
hf_splits = load_dataset("hugginglearners/netflix-shows")
dataset = pd.DataFrame(hf_splits['train'])
# Optional export of the frame to JSON (disabled):
# dataset.to_json('/content/drive/MyDrive/Netflix_Project/data.json')
dataset.head()

In [None]:
# Handle to the project's S3 bucket, created through a default boto3 session.
S3_BUCKET_NAME = 'moviedatabucket'
session = boto3.Session()
s3 = session.resource('s3')
bucket = s3.Bucket(S3_BUCKET_NAME)
# Upload of the dataset file (disabled):
# bucket.Object('dataset.json').put(Body=open('dataset.json', 'rb'))


In [None]:
# Bedrock session built from the temporary credentials of an assumed IAM role.
# The STS call itself authenticates through boto3's default credential chain
# (environment variables, shared config, ...); never paste keys into the notebook.
assumed_role = boto3.client('sts').assume_role(
    RoleArn='arn:aws:iam::975050062872:role/ColabAccess',
    RoleSessionName='ColabSession'  # You can customize the session name
)
temp_credentials = assumed_role['Credentials']
# A session token is only valid together with the access key id and secret key
# issued in the same AssumeRole response, so all three are taken from it.
session = boto3.Session(
    aws_access_key_id=temp_credentials['AccessKeyId'],
    aws_secret_access_key=temp_credentials['SecretAccessKey'],
    aws_session_token=temp_credentials['SessionToken'],
)
region = session.region_name  # None when no default region is configured
print(region)

In [None]:
# Connection/read timeouts (seconds) and retry limit for Bedrock runtime calls.
bedrock_config = Config(connect_timeout=120, read_timeout=120, retries={'max_attempts': 3})

# Pass the config explicitly: without `config=` the timeouts and retries above
# are never applied to the client.
# NOTE(review): this client is created from boto3's default credentials, not
# from `session` -- confirm that is intended.
bedrock_client = boto3.client("bedrock-runtime", region_name=region, config=bedrock_config)

# bedrock_client.create_evaluation_job()

In [None]:
# LangChain embeddings wrapper around the Bedrock client, bound to the
# Titan text-embedding model id below.
TITAN_EMBED_MODEL_ID = 'amazon.titan-embed-text-v2:0'
bedrock_embedding = BedrockEmbeddings(
    client=bedrock_client,
    model_id=TITAN_EMBED_MODEL_ID,
)

In [None]:
# Data ingestion: fetch the dataset JSON object from the S3 bucket.
obj = bucket.Object('data.json')
response = obj.get()
raw_bytes = response['Body'].read()

# Keep a local copy as well: data_ingest() loads 'data.json' from the working
# directory, and nothing else in the notebook writes that file.
with open('data.json', 'wb') as local_copy:
    local_copy.write(raw_bytes)

# Parse the same bytes that were written to disk.
data = json.loads(raw_bytes)


In [None]:
def data_ingest(path='data.json', chunk_size=10000, chunk_overlap=500):
    """Load a text file and split it into overlapping chunks.

    Args:
        path: File handed to ``TextLoader`` (read with ``encoding='utf8'``).
            Defaults to ``'data.json'``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        Whatever ``split_documents`` returns for the loaded documents.
    """
    # Local name chosen so it does not shadow the module-level `data`.
    documents = TextLoader(path, encoding='utf8').load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

In [None]:
# Vector embedding and vector store
def get_vectorestore(docs, index_path='faiss_index'):
    """Embed ``docs`` into a FAISS store, save it locally and return it.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        index_path: Location passed to ``save_local``. Defaults to
            ``'faiss_index'``.

    Returns:
        The store built by ``FAISS.from_documents``, so callers can use it
        directly instead of reloading it from disk.
    """
    # Embeddings come from the module-level `bedrock_embedding` instance.
    vectore_store_fiass = FAISS.from_documents(docs,
                                               bedrock_embedding)
    vectore_store_fiass.save_local(index_path)
    return vectore_store_fiass

In [None]:
# Build the chunks once and reuse them: calling data_ingest() a second time
# just to print the type repeated the whole load-and-split step.
doc_chunks = data_ingest()
print(type(doc_chunks))
get_vectorestore(doc_chunks)

In [None]:
# Upload the serialized vector store file to the S3 bucket.
# NOTE(review): nothing in the visible notebook writes "vectore_store.pkl";
# get_vectorestore() saves to 'faiss_index' -- confirm which file should be uploaded.
file_name = "vectore_store.pkl"

# Upload file to S3; the context manager closes the file handle once the
# upload call returns instead of leaving it open.
with open(file_name, "rb") as vector_file:
    bucket.Object(file_name).put(Body=vector_file)