In [3]:
# or to use other formats, use the following after installing the required packages:
from langchain.document_loaders import UnstructuredFileLoader

### Use LangChain CVSLoader 

In [57]:
# 1. Importing langChain document loaders and loading the data
from langchain.document_loaders.csv_loader import CSVLoader
def load_data_csv_file(file_path):
    loader = CSVLoader(file_path)
    data = loader.load()
    return data

In [58]:
# 2. Split the text into chunks
from langchain.text_splitter import CharacterTextSplitter
def split_documents(documents, chunk_size=1000, chunk_overlap =0):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    return texts


In [71]:
# 3. Test load_data_csv_file and split_documents
students_data = load_data_csv_file('dummy_students_candidacy_data1.csv')
students_text = split_documents(students_data)
print(len(students_data))
print(len(students_text))
print(students_text[5])

137
137
page_content="Last Name: Smith6\nFirst Name: Joe6\nUci Net: Smith6\nConcentration: \nArea of Expertise: My Area of expertise includes but isn't limited to, Back end, data and information systems, Fullstack development, Microservices, and System design with a focus on building large-scale distributed systems on the cloud. I am also proficient in Algorithms, Data structures, and problem-solving. I have over 3 years of experience in software development in companies like Amazon and Cisco. I have worked with Tech stacks ranging from Golang, Java, C++, and Python to AWS, Ansible, and CI/CD.   I discovered a passion for teaching when I began organizing workshops for incoming undergraduates on various core computer science concepts. These experiences helped me refine my understanding and learn essential skills associated with disseminating complex information faster for beginners to learn and I have grown to love mentoring young students. I have also published articles on platforms su

In [60]:
# 4. #importing pinecone packages and OpenAIEmbeddings
# pip install pinecone-client tiktoken
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms.openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain


In [61]:
# 5. Creating a pinecone vector store
from dotenv import load_dotenv
import os
load_dotenv()
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT")
)
# print(os.getenv("PINECONE_API_KEY"))
# print(os.getenv("PINECONE_ENVIRONMENT"))

In [62]:
# 6. Recursive document search function
def recursive_doc_search(doc, embeddings, index_name):
    if doc == None:
        return None
    else:
        docsearch = Pinecone.from_texts([t.page_content for t in doc], embeddings, index_name=index_name)
        return docsearch

In [63]:
# 7. Use recursive_doc_search to set up the vector store for student candidates
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# query_result = embeddings.embed_query("hello world")
# print(len(query_result))

In [64]:
docsearch_students = recursive_doc_search(students_text, embeddings, index_name='ta-matching-system-students')

In [65]:
# 8. Establishing llm and the QA Chain
model_name = "gpt-3.5-turbo-0613"
llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
chain = load_qa_chain(llm, chain_type="stuff")

In [69]:
# 9. Function to get similar students close to the query (docsearch -> vector store) 
def get_similar_docs(docsearch, query, k = 2, score = False):
    if score:
        similar_docs = docsearch.similarity_search_with_score(query, k)
    else:
        similar_docs = docsearch.similarity_search(query, k)
    return similar_docs

query = "Artificial Intelligence"
similar_docs_about_query = get_similar_docs(docsearch_students, query)
print(similar_docs_about_query)



[Document(page_content=': CourseTitleGrWhenICSÊ399UNIVERSITY TEACHINGS202303CSÊ274APROB LEARNINGA202303CSÊ230DIST COMPUTER SYSA202303ICSÊ399UNIVERSITY TEACHINGS202292ICSÊ398ATA TRAINING SEMINARS202292CSÊ273AMACHINE LEARNINGA202292CSÊ244INTR EMBED UBIQ SYSA+202292ICSÊ399UNIVERSITY TEACHINGS202214CSÊ295BRAIN-INSPIRED LRNGA+202214CSÊ261DATA STRUCTURESA202214', metadata={}), Document(page_content=': CourseTitleGrWhenICSÊ399UNIVERSITY TEACHINGS202303CSÊ274APROB LEARNINGA202303CSÊ230DIST COMPUTER SYSA202303ICSÊ399UNIVERSITY TEACHINGS202292ICSÊ398ATA TRAINING SEMINARS202292CSÊ273AMACHINE LEARNINGA202292CSÊ244INTR EMBED UBIQ SYSA+202292ICSÊ399UNIVERSITY TEACHINGS202214CSÊ295BRAIN-INSPIRED LRNGA+202214CSÊ261DATA STRUCTURESA202214', metadata={})]


In [67]:
# 10. Functions to get the answer to the question
def get_recommendation(docsearch, query, chain):
    docs = docsearch.similarity_search(query)
    ans = chain.run(input_documents=docs, question=query)
    return ans


query = "Which 5 students are suitable Teaching assistant candidates for this course: I&C SCI 9	Introduction to Computation for Scientists and Engineers	Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
recommendation = get_recommendation(docsearch_students, query, chain)
print(recommendation)


Based on the given information, there is no specific mention of any students who are suitable Teaching assistant candidates for the course I&C SCI 9. Therefore, I cannot provide a list of 5 suitable candidates.
