In [3]:
# or to use other formats, use the following after installing the required packages:
from langchain.document_loaders import UnstructuredFileLoader

## Use LangChain + Pinecone to Match Best TA Candidates for Each Course

### Step 1. Import the LangChain CSVLoader and Load the Student Information

In [57]:
# 1. Importing langChain document loaders and loading the data
from langchain.document_loaders.csv_loader import CSVLoader
def load_data_csv_file(file_path):
    """Load a CSV file through LangChain's ``CSVLoader``.

    Args:
        file_path: Path of the CSV file handed to ``CSVLoader``.

    Returns:
        The result of ``CSVLoader(file_path).load()``.
    """
    return CSVLoader(file_path).load()

In [58]:
# 2. Split the text into chunks
from langchain.text_splitter import CharacterTextSplitter
def split_documents(documents, chunk_size=1000, chunk_overlap=0):
    """Split documents into chunks with a ``CharacterTextSplitter``.

    Args:
        documents: Documents to split (passed to ``split_documents`` of the splitter).
        chunk_size: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        The chunks produced by the splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return CharacterTextSplitter(**splitter_options).split_documents(documents)


In [80]:
# 3. Test load_data_csv_file and split_documents
# Load the student CSV and split it; both results are reused by later cells.
students_data = load_data_csv_file('dummy_students_candidacy_data1.csv')
students_text = split_documents(students_data)
# Equal counts indicate the splitter did not break any record into several chunks.
print(len(students_data))
print(len(students_text))
# Spot-check the text of one record (index 88 is presumably an arbitrary sample).
print(students_text[88].page_content)

137
137
Last Name: Smith89
First Name: Joe89
Uci Net: Smith89
Concentration: Embedded and Cyber-Physical
Area of Expertise: My area of expertise is in Embedded Systems and IoT (Internet of Things). My relevant coursework, projects, and experience have all been focused on these areas. Additionally, I have experience working with various programming languages, frameworks, and hardware devices.  In terms of my special qualifications in Computer Science, I have experience in Embedded System Software, IoT Systems and Software, and Cyber-Physical System Design. I also have skills in various programming languages, frameworks, and hardware devices used in the field of Computer Science.
Programming: I have experience with the following programming languages: C, C++, Python, EmbeddedC, SystemC, MATLAB, HTML/CSS.  I also have experience in some frameworks and platforms, such as Simulink, LabView, ROS, Arduino IDE, NI Multisim, Keil �vision 5, Xilinx, OpenCV, Proteus, Git, Node-Red.  For operating

In [83]:
# Spot-check the text of another record (index 100).
print(students_text[100].page_content)

Last Name: Smith101
First Name: Joe101
Uci Net: Smith101
Concentration: 
Area of Expertise: I have previously TA'd and have experience proctoring classes remotely.  I have TA'd CS 121, ICS 6D, and ICS 32 previously.  My area of expertise is Machine Learning and Scientific Computing.
Programming: I am proficient with Python and I have experience using C/C++.  I have working with Linux, Windows, and Mac operating systems.  I have experience using Canvas' API suite, and can upload grades and comments automatically.
Past Experience: CourseTitleInstructorPositionWhenCS�121Information RetrievalMustafa IbrahimTA2021-winterICS�6DDiscrete Mathematics for CSPaniz EbrahimiTA2021-fallICS�6DDiscrete Mathematics for CSStanislaw JareckiTA2020-springICS�032Programming with Software LibrariesMustafa IbrahimTA2022-spring
Courses Taken: CourseTitleGrWhenICS�399UNIVERSITY TEACHINGS202303ICS�399UNIVERSITY TEACHINGS202292ICS�399UNIVERSITY TEACHINGS202214CS�261DATA STRUCTURESA-202214ICS�399UNIVERSITY TEACHIN

### Step 2. Set Up the Vector Store (Embedding Index) of the Students Information

In [60]:
# 4. Import the Pinecone packages and OpenAIEmbeddings
# pip install pinecone-client tiktoken
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain


In [84]:
# 5. Creating a pinecone vector store
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the environment so the keys
# do not have to be written into the notebook itself.
load_dotenv()
# Initialise the Pinecone client. os.getenv returns None for a variable that
# is not set, so a missing key is not reported on this line.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT")
)
# Debug aid, intentionally left disabled: printing these values would store
# the secrets in the saved cell output.
# print(os.getenv("PINECONE_API_KEY"))
# print(os.getenv("PINECONE_ENVIRONMENT"))

In [85]:
# 6. Build a Pinecone vector store from the split documents (the function is not recursive, despite its name)
def recursive_doc_search(doc, embeddings, index_name):
    """Embed document chunks and store them in a Pinecone index.

    Despite the name, nothing here is recursive: the function makes a single
    ``Pinecone.from_texts`` call. Only the ``page_content`` of each chunk is
    forwarded; no other attribute of the chunks is passed on.

    Args:
        doc: Iterable of objects exposing ``page_content``, or ``None``.
        embeddings: Embedding model handed to ``Pinecone.from_texts``.
        index_name: Name of the Pinecone index to use.

    Returns:
        The vector store returned by ``Pinecone.from_texts``, or ``None`` when
        ``doc`` is ``None``.
    """
    # Identity check: `== None` would defer to a custom __eq__ defined on `doc`.
    if doc is None:
        return None

    texts = [chunk.page_content for chunk in doc]
    return Pinecone.from_texts(texts, embeddings, index_name=index_name)

In [86]:
# 7. Create the OpenAI embedding model that is used to build the student vector store
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# Sanity check: embed a short string and print the length of the resulting vector.
# NOTE(review): the Pinecone index presumably has to be created with this dimension — confirm.
query_result = embeddings.embed_query("hello world")
print(len(query_result))

1536


In [87]:
# Embed every student chunk and store it in the named Pinecone index.
# NOTE(review): re-running this cell presumably inserts the same texts again — confirm.
docsearch_students = recursive_doc_search(students_text, embeddings, index_name='ta-matching-system-students')

### Step 3. Use the LangChain QA Chain to Query Against the Given Context (Similar Students' Data)

In [128]:
# 8. Establishing llm and the QA Chain
# The model name is pinned to a dated snapshot rather than a moving alias.
model_name = "gpt-3.5-turbo-0613"
llm=ChatOpenAI(temperature=0, model=model_name)
# NOTE(review): the "stuff" chain type presumably places all input documents into a
# single prompt, which is what can exceed the model's token limit — confirm.
chain = load_qa_chain(llm, chain_type="stuff")

In [131]:
# 9. Function to get similar students close to the query (docsearch -> vector store) 
def get_similar_docs(docsearch, query, k=7, score=False):
    """Return the ``k`` entries of a vector store that are closest to ``query``.

    Args:
        docsearch: Vector store exposing ``similarity_search`` and
            ``similarity_search_with_score``.
        query: Text to search for.
        k: Number of results requested from the store.
        score: When true, use ``similarity_search_with_score`` instead of
            ``similarity_search``.

    Returns:
        Whatever the selected search method returns.
    """
    # Only the selected method is looked up on the store.
    search = docsearch.similarity_search_with_score if score else docsearch.similarity_search
    return search(query, k)

# Quick retrieval check with a short topic query; relies on the default k=7.
query = "Artificial Intelligence"
similar_docs_about_query = get_similar_docs(docsearch_students, query)
print(similar_docs_about_query)



[Document(page_content='Last Name: Smith56\nFirst Name: Joe56\nUci Net: Smith56\nConcentration: \nArea of Expertise: I am currently pursuing Masters of Computer Science. Relevant Courses: Algorithms with Applications, Advanced Programming and Problem Solving, Introduction to Artificial Intelligence.  Relevant Courses from my Bachelors include: Advanced Data Structures and Analysis of Algorithms, Automata Theory, Operating Systems, Computer Networks, Cryptography and Network Security, Cloud Computing Services, Artificial Intelligence, Information Retrieval, Software Engineering with Project Management.  I have experience with a wide range of programming languages and technologies, including � Programming languages: Java, C, C++, Python � Web Development: HTML, CSS, PHP, AJAX, JavaScript, React.js, Django, Flask � Artificial Intelligence: NumPy, Pandas, Matplotlib, OpenCV, Scikit-learn, Keras, TensorFlow, MATLAB � App Development: Swift, SwiftUI � Databases: MySQL, Oracle, MongoDB, Fireb

In [134]:
# Course description; used on its own as the retrieval query below.
course_description = "Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
# The ". " after the course code keeps it from running straight into the
# description (previously the prompt read "...I&C SCI 9Introduces principles...").
query_get_recommendation = "In these students I give you, which 5 students are suitable Teaching assistant candidates (sort from best candidates to general candidates) for this course: I&C SCI 9. " + course_description
# Retrieve the students whose records are closest to the course description.
similar_students = get_similar_docs(docsearch_students, course_description)
print(similar_students)

[Document(page_content='Last Name: Smith9\nFirst Name: Joe9\nUci Net: Smith9\nConcentration: Computer Science\nArea of Expertise: 1 I was a Teaching Assistant for the Department of Physics at UC Irvine for two quarters, i.e. Fall 2022 & Winter 2023. I worked under Professors Lee, Guerra, and Wu for classes Physics 3A, 3B, and 7C. My responsibilities included conducting discussions for 100+ students each quarter, conducting and grading quizzes as well as grading midterms and final exams.  2) This is my third quarter of graduate studies in ICS and I have maintained a consistent 4.0 GPA even after pursuing complicated courses like CS231P, CS253P, CS261P, CS271P, and CS273P. 3) I was a Teaching Assistant for my undergraduate course \'CSL802- Distributed Computing Lab\' where I was responsible for creating a list of experiments that students would perform for the duration of the course. I also assisted students through various phases of their course projects. 4) During my role as a Data Eng

In [129]:
# 10. Functions to get the answer to the question
def get_recommendation(similar_students_info, query, chain):
    """Ask the QA chain a question about the retrieved student documents.

    The documents are first sent as they are. If that call raises, the error
    is printed, the documents are condensed with ``summarize_candidates`` and
    the question is asked once more against the condensed text.
    ``summarize_candidates`` must already be defined when the fallback runs.

    Args:
        similar_students_info: List of documents used as context.
        query: Question passed to the chain.
        chain: Object exposing ``run(input_documents=..., question=...)``.

    Returns:
        The answer returned by ``chain.run``.

    Raises:
        Whatever the fallback itself raises; only the first attempt is guarded.
    """
    try:
        return chain.run(input_documents=similar_students_info, question=query)
    except Exception as e:
        # Deliberate best-effort: report the failure and retry with less text.
        print(f"An error occurred: {e}")

    summary = summarize_candidates(similar_students_info)
    if isinstance(summary, str):
        # The summary is plain text, while the chain is given a list of
        # documents on the first attempt; wrap it so both calls pass the
        # same kind of input.
        from langchain.schema import Document

        summary = [Document(page_content=summary)]
    return chain.run(input_documents=summary, question=query)

In [135]:
# Ask the QA chain to pick candidates from the retrieved students and show its answer.
res = get_recommendation(similar_students, query_get_recommendation, chain)
print(res)

Based on the information provided, here are the five students who are suitable Teaching Assistant candidates for the course I&C SCI 9, sorted from best candidates to general candidates:

1. Joe9 Smith (Computer Science concentration, experience as a Teaching Assistant for Physics courses, expertise in Python programming, and strong academic performance)
2. Joe95 Smith (Areas of expertise in AI and theoretical computer science, experience as a Teaching Assistant for various computer science courses, and familiarity with Java and C++)
3. Joe119 Smith (Computer Graphics concentration, experience as a Teaching Assistant for computer science courses, and familiarity with C++, Unix, and Python)
4. Joe129 Smith (Experience in applied machine learning, familiarity with Java and Python, and experience mentoring online courses)
5. Joe62 Smith (Expertise in multithreaded programming, experience as an assistant to TAs and teaching basic algorithms, and proficiency in C/C++, Python, and Java)

Plea

## TODO: Work Around OpenAI's Token Limit With Chain Type

If Token Limit Error:
InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 8216 tokens. Please reduce the length of the messages.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain

# Second LLM, read as a global by summarize_candidates below; temperature=0 like the chat model.
llm2 = OpenAI(temperature=0)

def summarize_candidates(docs, chunk_size=1000, chunk_overlap=0, chain_type="refine"):
    """Condense candidate documents into a single summary.

    The documents are split with a ``CharacterTextSplitter`` and the chunks
    are run through a summarisation chain built on the module-level ``llm2``.
    The defaults reproduce the previously hard-coded settings.

    Args:
        docs: Documents to summarise.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        chain_type: ``chain_type`` given to ``load_summarize_chain``
            (for example "refine" or "map_reduce").

    Returns:
        The summary returned by the chain's ``run``; it is also printed.
    """
    char_text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs_chunks = char_text_splitter.split_documents(docs)
    summarizer = load_summarize_chain(llm2, chain_type=chain_type)
    summary = summarizer.run(docs_chunks)
    # Printing is kept so existing callers still see the summary in the cell output.
    print(summary)
    return summary

In [None]:
# Course description; used on its own as the retrieval query below.
course_description = "Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
# The ". " after the course code keeps it from running straight into the
# description (previously the prompt read "...I&C SCI 9Introduces principles...").
query_get_recommendation = "In these students I give you, which 5 students are suitable Teaching assistant candidates for this course: I&C SCI 9. " + course_description
similar_students = get_similar_docs(docsearch_students, course_description)
print(similar_students)
# Check what the retriever hands back before passing it to the summariser.
print(type(similar_students))
print(type(similar_students[0]))
print(type(similar_students[0].page_content))
summarize_candidates(similar_students)

<class 'list'>
<class 'langchain.schema.Document'>
<class 'str'>

In [113]:
# Split the retrieved student documents (chunk_size=1000, no overlap) and count the chunks.
char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_chunks = char_text_splitter.split_documents(similar_students)
print(len(docs_chunks))

19


In [119]:
# Try the "map_reduce" summarisation chain (summarize_candidates uses "refine"), on the first five chunks only.
# NOTE(review): the five-chunk cap is presumably there to stay under the token limit — confirm.
model = load_summarize_chain(llm2, chain_type="map_reduce")
summary = model.run(docs_chunks[0:5])


In [120]:
# Show the summary produced by the map_reduce chain.
print(summary)

 Joe Smith is a Computer Science student at UC Irvine with a 4.0 GPA and experience as a Teaching Assistant for Physics classes and a Data Engineer at Tata Consultancy Services. He is proficient in SQL, C++, Java, and C and is comfortable with Windows OS and Ubuntu Linux. He has expertise in machine learning/AI, mathematics, and cognitive science, and is comfortable with C++, Unix, and Python. He has taken multiple courses in theoretical computer science, including optimization, data structures and algorithms, graph theory, causality, time series analysis, operations research, economics related courses, probability and statistics, linear algebra, game theory, discrete math, and basics of machine learning. He is also proficient in Fortran, MATLAB, Writing, and Discrete Math.


In [81]:
# Once finished, we delete the Pinecone index to save resources
# The name must match the index_name used when the student vector store was built.
pinecone.delete_index('ta-matching-system-students')