# Imports

In [2]:
import os
import json
import numpy as np
from pypdf import PdfReader 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [3]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, ChatSession, Part
from vertexai.language_models import TextEmbeddingModel

# Env

In [4]:
# Point the Google client libraries at a service-account key file, but only
# when the environment has not already supplied one: setdefault keeps an
# existing value, so credentials configured outside the notebook are no longer
# overwritten by this machine-specific path.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/home/spanwar/Documents/collage/projects/chatbot_v2/vertex_key/oval-smile-417517-43cf867c95fd.json')

In [5]:
# Google Cloud project and region passed to the Vertex AI SDK below.
project_id = "oval-smile-417517"
location   = "us-central1"
# Initialise the Vertex AI SDK with the project and region above.
vertexai.init(project=project_id, location=location)

# Module-level Gemini model handle; get_chat_response() sends prompts through it.
model = GenerativeModel("gemini-pro")

# Gemini

In [29]:
def get_chat_response(prompt: str):
    '''
    Send `prompt` to the module-level Gemini `model` and return the raw
    response object produced by `generate_content`.
    '''
    return model.generate_content(prompt)

In [30]:
def response_to_text(response):
    '''
    Return the `text` attribute of a model response.
    '''
    text = response.text
    return text

In [31]:
def call_gemini(message):
    '''
    Send `message` to Gemini and return the reply as plain text.

    Combines get_chat_response (the model call) with response_to_text
    (extracting the text from the response).
    '''
    return response_to_text(get_chat_response(message))

# Tools

## QnA

In [32]:
def find_files(path):
    '''
    List the files reachable from `path`.

    A directory is walked recursively and every file beneath it is returned,
    each joined onto the directory it was found in. A path to a single file
    is returned as a one-element list.

    Raises FileNotFoundError if `path` does not exist, and Exception if it
    exists but is neither a directory nor a regular file.
    '''
    if not os.path.exists(path):
        # FileNotFoundError is a subclass of Exception, so callers that
        # catch Exception keep working.
        raise FileNotFoundError(f'Path does not exist: {path}')
    if os.path.isfile(path):
        return [path]
    if not os.path.isdir(path):
        raise Exception('Unable to read directory')
    return [
        os.path.join(root, filename)
        for root, _, filenames in os.walk(path)
        for filename in filenames
    ]

In [33]:
def find_file_ext(path):
    '''
    Return the extension of `path`, leading dot included ('' when there is none).
    '''
    return os.path.splitext(path)[1]

In [34]:
def read_pdf(path):
    '''
    Extract the text of every page of the PDF at `path`.

    Pages are concatenated in order, with no separator, into one string.
    '''
    pdf = PdfReader(path)
    return ''.join(page.extract_text() for page in pdf.pages)

In [35]:
def read_text(path, encoding='utf-8', errors='replace'):
    '''
    Read the file at `path` as text and return its whole content.

    The file is decoded with `encoding` (UTF-8 by default) instead of the
    platform's locale encoding, so the same file reads the same everywhere.
    Bytes that cannot be decoded are handled according to `errors`: the
    default 'replace' substitutes U+FFFD rather than raising, and
    errors='strict' raises UnicodeDecodeError.
    '''
    with open(path, 'r', encoding=encoding, errors=errors) as f:
        return f.read()

In [36]:
def text_words(text):
    '''
    Count the whitespace-separated words in `text`.

    Splits on any run of whitespace (spaces, tabs, newlines), so repeated
    spaces do not add empty "words" and an empty string counts as 0.
    '''
    return len(text.split())

In [37]:
def read_directory_files(path):
    '''
    Read every file found under `path` into a dict keyed by file path.

    Each value is {'len': <word count from text_words>, 'text': <content>}.
    Files whose extension is '.pdf' in any letter case are parsed with
    read_pdf; every other file, '.rtf' included, is read with read_text.
    '''
    text_data = {}

    for file in find_files(path):
        print(f'Reading the file: {file}')
        # Compare case-insensitively so 'Report.PDF' is parsed as a PDF
        # instead of being read as raw text.
        is_pdf = find_file_ext(file).lower() == '.pdf'
        # NOTE(review): '.rtf' files are read verbatim, so any RTF markup
        # in the file stays in the indexed text.
        text = read_pdf(file) if is_pdf else read_text(file)
        text_data[file] = {'len': text_words(text), 'text': text}

    return text_data

In [49]:
def save_emb_vectorizer(vectorizer, path):
    '''
    Persist `vectorizer` to the file at `path` with joblib.
    '''
    joblib.dump(value=vectorizer, filename=path)

def load_emb_vectorizer(path):
    '''
    Load and return the object stored at `path` with joblib.
    '''
    # joblib.load unpickles the file, so only use it on files you created.
    vectorizer = joblib.load(path)
    return vectorizer

In [50]:
def create_embedding(data):
    '''
    Fit a TF-IDF vectorizer on the 'text' of every entry in `data` and attach
    each entry's dense vector under the key 'emb'.

    The vectorizer uses word uni- and bi-grams, English stop words,
    lowercasing and at most 10000 features. `data` is updated in place and
    returned together with the fitted vectorizer.
    '''
    texts = [entry['text'] for entry in data.values()]

    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words = "english", lowercase = True, max_features = 10000)
    matrix = vectorizer.fit_transform(texts).toarray()
    # Rows of `matrix` follow the order of `texts`, i.e. the dict's own order.
    for entry, row in zip(data.values(), matrix):
        entry['emb'] = row

    return data, vectorizer

In [51]:
def create_embedding_from_vectorizer(text, vectorizer):
    '''
    Embed a single `text` with an already-fitted `vectorizer`.

    Returns `vectorizer.transform` applied to a one-item list, converted
    with `toarray()`.
    '''
    transformed = vectorizer.transform([text])
    return transformed.toarray()

In [52]:
def get_similar_text(doc_emb, emb, top_doc):
    '''
    Return the indices of the `top_doc` entries of `doc_emb` most similar to
    `emb` by cosine similarity, best match first.

    Only the first row of `emb` is used as the query.
    '''
    scores = cosine_similarity(emb, doc_emb)[0]
    ranked = np.argsort(scores)[::-1]  # argsort is ascending; reverse for best-first
    return ranked[:top_doc]

In [111]:
def search_doc(text, data, top_index, vectorizer):
    '''
    Find the `top_index` documents in `data` closest to `text` and return
    their texts as one string.

    Each selected document is preceded by a 'Doc <rank>' line, with rank
    starting at 0 for the best match.
    '''
    query_emb = create_embedding_from_vectorizer(text, vectorizer)
    files = list(data.keys())
    doc_embs = [data[file]['emb'] for file in files]
    best = get_similar_text(doc_embs, query_emb, top_index)

    # Indices from get_similar_text are positions in `files`.
    return ''.join(
        f'Doc {rank}\n' + data[files[position]]['text']
        for rank, position in enumerate(best)
    )

In [97]:
def read_json(file_path):
    '''
    Parse the JSON file at `file_path` and return the decoded object.

    The file is opened as UTF-8 rather than the platform's locale encoding,
    so non-ASCII content is decoded the same way on every machine.
    '''
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def write_json(file_path, data):
    '''
    Serialise `data` as JSON with 4-space indentation and write it to
    `file_path`, overwriting any existing file.
    '''
    with open(file_path, 'w') as out:
        json.dump(data, out, indent=4)

In [98]:
def process_data(path, botname):
    '''
    Index the documents under `path` and persist the result for `botname`.

    Writes two files under data/qna_save_data/ (the directory is created if
    missing): `{botname}_qna.npy` holding the document dict returned by
    create_embedding, and `{botname}_vec.pkl` holding the fitted vectorizer.
    '''
    text_data = read_directory_files(path)
    data_index, vectorizer = create_embedding(text_data)

    # The save calls below do not create parent directories, so make sure
    # the target folder exists before writing.
    os.makedirs('data/qna_save_data', exist_ok=True)
    np.save(f'data/qna_save_data/{botname}_qna.npy', data_index)
    save_emb_vectorizer(vectorizer, f'data/qna_save_data/{botname}_vec.pkl')

In [99]:
def initialize_qna(path, botname):
    '''
    Build and save the QnA index for `botname` from the documents at `path`.

    Thin wrapper around process_data.
    '''
    process_data(path=path, botname=botname)

In [110]:
def qna(question, botname, top_docs=2):
    '''
    Answer `question` using the documents indexed for `botname`.

    Loads the index and vectorizer that process_data saved under
    data/qna_save_data/, picks the `top_docs` most similar documents
    (default 2), appends the question with a 100-word instruction and
    returns Gemini's reply as text.
    '''
    # NOTE: allow_pickle=True and load_emb_vectorizer both unpickle their
    # files; only load index files this notebook produced itself.
    saved = np.load(f'data/qna_save_data/{botname}_qna.npy', allow_pickle=True)
    vectorizer = load_emb_vectorizer(f'data/qna_save_data/{botname}_vec.pkl')
    # tolist() unwraps the loaded array back into the Python object that
    # process_data saved (the document dict).
    data_index = saved.tolist()
    # Retrieve the supporting documents for the question.
    context = search_doc(question, data_index, top_docs, vectorizer)
    message = context + f'\n#####\nUsing above data Please answer this question. Please write atmax 100 words only.: {question}'
    return call_gemini(message)

In [101]:
# Directory of source documents handed to initialize_qna below.
path = 'data/leanStartupData'

In [102]:
# Index everything under `path` and save it under the bot name 'test'.
initialize_qna(path, 'test')

Reading the file: data/leanStartupData/LDBD23intro.pdf
Reading the file: data/leanStartupData/LeanStartup.pdf
Reading the file: data/leanStartupData/Platform.pdf
Reading the file: data/leanStartupData/LearningLog.rtf


In [112]:
# Query the saved 'test' index; the result is the model's answer as text.
ans = qna('what is the lean to startup?. Also explain about the business model', 'test')

In [113]:
# print() so the newlines in the answer are rendered rather than shown as a repr.
print(ans)

**Lean Startup** is a method that helps businesses to create new products and services in a fast and efficient way. It is based on the idea of "build, measure, learn" and involves testing ideas with customers early and often, then iterating based on feedback.

A **business model** is a plan for how a business will generate revenue and make a profit. It describes the business's products or services, target market, sales and marketing strategies, and financial plan. A lean startup approach to business model design involves testing and iterating on the business model as the business learns more about its customers and market.


In [23]:
# qna() requires the bot name as its second argument; without it this cell
# raises TypeError. 'test' is the index built by initialize_qna(path, 'test').
ans2 = qna('can you help me how to write learning log. I am little confused.', 'test')

In [24]:
# print() so the newlines in the answer are rendered rather than shown as a repr.
print(ans2)

**How to Write a Learning Log:**

1. **Start** with the week number (e.g., W4).
2. **Answer** the provided themes (e.g., What inspired you this week?).
3. **Reflect** on your experiences and learning in practice.
4. **Write** in an unstructured and conversational style.
5. **Focus** on:
    - Inspirations and surprises
    - Innovation and idea generation
    - Business development
    - Stakeholder collaboration
    - Technology and software development
    - Teamwork
    - Learning sources (mentors, team, stakeholders)
6. **Use** the learning log as a weekly checklist to reflect on your progress.
7. **Keep** a single document for all entries.
