## **Install Required Dependencies**

In [24]:
!pip install python-dotenv
!pip install langchain
!pip install langchain_experimental
!pip install pinecone-client
!pip install openai==0.28.1
!pip install tiktoken

Collecting tiktoken
  Using cached tiktoken-0.5.2-cp39-cp39-win_amd64.whl.metadata (6.8 kB)
Using cached tiktoken-0.5.2-cp39-cp39-win_amd64.whl (786 kB)
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.2


## **Import Required Dependencies**

In [45]:
# Basics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# TensorFlow Recommenders
import tensorflow_recommenders as tfrs
import tensorflow as tf

# Import Environment Variables
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

## **Read Environment Variables**

In [18]:
dummy_path = os.getenv('DUMMY_DATA')
try:
    with open(dummy_path) as f:
        rec_sys = f.read()
        print('File Read Correctly!')
except Exception as e:
    print(f"Error opening file: {e}")

File Read Correctly!


In [19]:
# LangChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter= RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

## **Splitting Text**

In [21]:
chunks= text_splitter.create_documents([rec_sys])
print(chunks[4])

page_content='the Organization,Short Term Goal,Long Term Goal,Open To,Call To Action,Impact Story,Top Needs,Who'


In [22]:
print(chunks[10].page_content)

and Online Services,Venture Capitalists,About the Organization,Establish a Business Mentorship


In [23]:
print(f'Now you have: {len(chunks)}')

Now you have: 8283


## **Embeddings Cost**

In [26]:
def print_embedding_cost(texts):
    import tiktoken
    enc=tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens=sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*0.0004:.6f}')

print_embedding_cost(chunks)

Total Tokens: 157119
Embedding Cost in USD: 0.062848


In [27]:
from langchain.embeddings import OpenAIEmbeddings
embedding=OpenAIEmbeddings()

## **Inserting The Embedding Into Pinecone Index**

In [29]:
import os
import pinecone
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [30]:
# Deleting All Indexes
indexes= pinecone.list_indexes()
for i in indexes:
    print('Deleting All Indexes...!', end='')
    pinecone.delete_index(i)
    print('Done!')
print('Done!')

Deleting All Indexes...!Done!
Done!


In [31]:
# Creating The Indexes
index_name='rec-sys'
if index_name not in pinecone.list_indexes():
    print(f'Creating Index {index_name}...!')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating Index rec-sys...!
Done!


In [32]:
vector_store= Pinecone.from_documents(chunks, embedding, index_name=index_name)

## **Asking For Recommendations (Similarity Search)**

In [33]:
# Define the index name (Ensure this is the correct name of the index you've created)
index_name = index_name

# Connect to the Pinecone vector index
vector_store = vector_store

In [38]:
try:
    query= 'I looking for specialists with interest in mental health, give me the top 7 with their names?'
    result= vector_store.similarity_search(query)
    print(result)
except Exception:
    print(e)

[Document(page_content='Manager,Organization,About Me,Interest,Tags Description,Diversity & Inclusion | Mental Health |'), Document(page_content='Manager,Organization,,AI & Machine Learning | Education | Mental Health,Tags Description,Consulting'), Document(page_content='Scientist Manager,Organization,About Me,Learning | Research | Academics,,Developer | Mental Health'), Document(page_content='Me,Mentorship | Consulting | Content Creation,,Developer | Mental Health | Content')]


In [39]:
for r in result:
    print(r.page_content)
    print('-' * 50)

Manager,Organization,About Me,Interest,Tags Description,Diversity & Inclusion | Mental Health |
--------------------------------------------------
Manager,Organization,,AI & Machine Learning | Education | Mental Health,Tags Description,Consulting
--------------------------------------------------
Scientist Manager,Organization,About Me,Learning | Research | Academics,,Developer | Mental Health
--------------------------------------------------
Me,Mentorship | Consulting | Content Creation,,Developer | Mental Health | Content
--------------------------------------------------


In [53]:
!pip install --upgrade langchain
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [54]:
import numpy as np
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings

In [55]:
embeddings_model = OpenAIEmbeddings()

In [59]:
import os
import pandas as pd

dummy_path_excel = os.getenv('DUMMY_DATA_EXCEL')

if dummy_path_excel:
    try:
        # Specifying the engine manually
        data_rec_sys = pd.read_excel(dummy_path_excel, engine='openpyxl')
        print(data_rec_sys.shape)
    except Exception as e:
        print(f"Error reading the Excel file: {e}")
else:
    print("The path to the Excel file is not set or invalid.")
data_rec_sys.head()

(1000, 27)


Unnamed: 0,First Name,Last Name,Logging Date,Company,Aditionals,Social Network Logging,Job Title,Organization,About Me,Interest,...,Target Connection for the Organization,About the Organization,Short Term Goal,Long Term Goal,Open To,Call To Action,Impact Story,Top Needs,Who Do You Want To Connect With?,Goals
0,Kimberly,Mccarty,2023-12-06 00:00:00,Verizon Communications Inc.,,LinkedIn,"Presenter, Broadcasting",Organization,About Me,Innovation | Education & Training | Research,...,Venture Capitalists,About the Organization,Establish a Business Mentorship Scheme,Foster Artisan and Craftsmanship,Artists,Partner in Financial Empowerment,"Successfully negotiated a major business deal,...",Family Support Services,Care Providers,Enhance Lobbying Efforts
1,First Name,Tyler,2023-12-05 00:00:00,American Express Company,Aditionals,LinkedIn,Insurance Underwriter,Commerce Chamber,About Me,Career Advancement | Real Estate | Mentorship,...,Healthcare Providers,,Offer Business Financial Planning Workshops,Strengthen Youth Engagement,Animal Shelters,Become a Part of Our Growing Community,I collaborated on a research project that resu...,Infrastructure Development,Pet Lovers,Boost Local Tourism
2,Jessica,White,2023-12-10 00:00:00,Walmart Inc.,Aditionals,LinkedIn,Engineer and Communications Specialist,Commerce Chamber,,Health & Wellness | Mental Health | AI & Machi...,...,Investment Firms,About the Organization,Create a Member Directory,Support Local Journalism,Elderly Care Services,Empower the Next Generation,I have a mentoring group for teenagers who wan...,Reporting Resources,Parents,Enhance Disaster Preparedness
3,James,Rocha,2023-12-01 00:00:00,Ford Motor Company,Aditionals,Google,Quality Manager,Commerce Chamber,,Compliance | Economics | Remote Work,...,Logistics Companies,,Enhance Public Relations Efforts,Foster Manufacturing Growth,Private Partnerships,Save Every Drop,Established key performance indicators (KPIs) ...,Agricultural Tech,Pet Lovers,Enhance Dining Scene
4,Eric,Burton,2023-12-11 00:00:00,American Express Company,,Social Network Logging,Contractor,Commerce Chamber,About Me,Design & UX | Corporate Responsability | Educa...,...,Local Business Owners,,Start a Chamber Member Survey,Foster Innovation and R&D,Youth Organizations,Become a Part of Our Growing Community,I designed and implemented a cybersecurity fra...,Sustainable Building,Environmentalists,Preserve Cultural Landmarks


In [60]:
# Define Required Columns For Embeddings
columns = [
    "Job Title", "Company", "Interest", "Needs", "Skills",
    "A Key Organizational Need", "Target Connection for the Organization",
    "Short Term Goal", "Long Term Goal", "Open To", "Call To Action",
    "Impact Story", "Top Needs", "Who Do You Want To Connect With?", "Goals"
]

In [61]:
data_rec_sys.describe().T

Unnamed: 0,count,unique,top,freq
First Name,1000,49,Mariah,38
Last Name,1000,50,Lewis,30
Logging Date,1000,72,2023-12-02 00:00:00,91
Company,1000,53,Microsoft Corporation,25
Aditionals,516,1,Aditionals,516
Social Network Logging,1000,3,Google,338
Job Title,1000,50,Administrator,33
Organization,1000,2,Commerce Chamber,526
About Me,492,1,About Me,492
Interest,1000,48,Cibersecurity | Diversity & Inclusion | Branding,30


In [62]:
data_rec_sys.isnull().sum()

First Name                                  0
Last Name                                   0
Logging Date                                0
Company                                     0
Aditionals                                484
Social Network Logging                      0
Job Title                                   0
Organization                                0
About Me                                  508
Interest                                    0
Tags Description                          498
Needs                                       0
Skills                                      0
Rating                                      0
Archive Member                              0
Human Coded (Manual Validation)             0
A Key Organizational Need                   0
Target Connection for the Organization      0
About the Organization                    495
Short Term Goal                             0
Long Term Goal                              0
Open To                           

In [63]:
# Get the Embedding For Each Column
embeddings_dict = {}
for column in columns:
    valid_entries = data_rec_sys[column].dropna()
    embeddings = [embeddings_model.embed_text(text) for text in valid_entries]
    embeddings_dict[column] = embeddings

AttributeError: 'OpenAIEmbeddings' object has no attribute 'embed_text'

In [64]:
import os
import openai

# Load your OpenAI API key from an environment variable or secure source
openai.api_key = os.getenv('OPENAI_API_KEY')

# Define a function to get embeddings from OpenAI
def get_openai_embeddings(text):
    # You might need to adjust the model and parameters according to your needs and OpenAI's API documentation
    response = openai.Embedding.create(input=[text], engine="text-similarity-davinci-001")
    return response['data'][0]['embedding']

# Get the Embedding For Each Column
embeddings_dict = {}
for column in columns:
    print(f"Processing column: {column}")
    valid_entries = data_rec_sys[column].dropna()
    embeddings = [get_openai_embeddings(text) for text in valid_entries]
    embeddings_dict[column] = embeddings

Processing column: Job Title
