In [3]:
# or to use other formats, use the following after installing the required packages:
from langchain.document_loaders import UnstructuredFileLoader

## Use LangChain + Pinecone to Match Best TA Candidates for Each Course

### Step 1: Import the LangChain CSVLoader and Load the Student Information

In [7]:
# 1. Importing langChain document loaders and loading the data
from langchain.document_loaders.csv_loader import CSVLoader
def load_data_csv_file(file_path):
    """Read a CSV file through LangChain's ``CSVLoader``.

    Args:
        file_path: Location of the CSV file, passed straight to ``CSVLoader``.

    Returns:
        The value produced by the loader's ``load()`` call.
    """
    return CSVLoader(file_path).load()

In [8]:
# 2. Split the text into chunks
from langchain.text_splitter import CharacterTextSplitter
def split_documents(documents, chunk_size=1000, chunk_overlap=0):
    """Split documents into chunks with a ``CharacterTextSplitter``.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Chunk size forwarded to the splitter (default 1000).
        chunk_overlap: Chunk overlap forwarded to the splitter (default 0).

    Returns:
        The splitter's ``split_documents`` result.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return CharacterTextSplitter(**splitter_options).split_documents(documents)


In [9]:
# 3. Smoke-test load_data_csv_file and split_documents on the candidate CSV
students_data = load_data_csv_file('dummy_students_candidacy_data1.csv')
students_text = split_documents(students_data)
# Document counts before and after splitting, one per line
print(len(students_data), len(students_text), sep="\n")
# Spot-check the text of a single split record
print(students_text[88].page_content)

137
137
Last Name: Smith89
First Name: Joe89
Uci Net: Smith89
Concentration: Embedded and Cyber-Physical
Area of Expertise: My area of expertise is in Embedded Systems and IoT (Internet of Things). My relevant coursework, projects, and experience have all been focused on these areas. Additionally, I have experience working with various programming languages, frameworks, and hardware devices.  In terms of my special qualifications in Computer Science, I have experience in Embedded System Software, IoT Systems and Software, and Cyber-Physical System Design. I also have skills in various programming languages, frameworks, and hardware devices used in the field of Computer Science.
Programming: I have experience with the following programming languages: C, C++, Python, EmbeddedC, SystemC, MATLAB, HTML/CSS.  I also have experience in some frameworks and platforms, such as Simulink, LabView, ROS, Arduino IDE, NI Multisim, Keil �vision 5, Xilinx, OpenCV, Proteus, Git, Node-Red.  For operating

In [10]:
# Spot-check another split record (index 100) by printing its text
print(students_text[100].page_content)

Last Name: Smith101
First Name: Joe101
Uci Net: Smith101
Concentration: 
Area of Expertise: I have previously TA'd and have experience proctoring classes remotely.  I have TA'd CS 121, ICS 6D, and ICS 32 previously.  My area of expertise is Machine Learning and Scientific Computing.
Programming: I am proficient with Python and I have experience using C/C++.  I have working with Linux, Windows, and Mac operating systems.  I have experience using Canvas' API suite, and can upload grades and comments automatically.
Past Experience: CourseTitleInstructorPositionWhenCS�121Information RetrievalMustafa IbrahimTA2021-winterICS�6DDiscrete Mathematics for CSPaniz EbrahimiTA2021-fallICS�6DDiscrete Mathematics for CSStanislaw JareckiTA2020-springICS�032Programming with Software LibrariesMustafa IbrahimTA2022-spring
Courses Taken: CourseTitleGrWhenICS�399UNIVERSITY TEACHINGS202303ICS�399UNIVERSITY TEACHINGS202292ICS�399UNIVERSITY TEACHINGS202214CS�261DATA STRUCTURESA-202214ICS�399UNIVERSITY TEACHIN

### Step 2. Set Up the Vector Store (Embedding Index) of the Students Information

In [11]:
# 4. #importing pinecone packages and OpenAIEmbeddings
# pip install pinecone-client tiktoken
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain


In [12]:
# 5. Initialize the Pinecone client with credentials read from the environment (.env)
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment
load_dotenv()
# Credentials are read from the environment so they never appear in the
# notebook; avoid printing them, since cell outputs are saved with the file.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT"),
)

In [13]:
# 6. Recursive document search function
def recursive_doc_search(doc, embeddings, index_name):
    """Build a Pinecone vector store from the text of the given documents.

    Only each document's ``page_content`` is forwarded; document metadata is
    not passed to ``Pinecone.from_texts``.

    Args:
        doc: Iterable of objects exposing ``page_content``, or ``None``.
        embeddings: Embedding object passed through to ``Pinecone.from_texts``.
        index_name: Name of the Pinecone index, passed as ``index_name``.

    Returns:
        The object returned by ``Pinecone.from_texts``, or ``None`` when
        ``doc`` is ``None``.
    """
    # Identity check: `== None` would defer to the argument's own __eq__,
    # which can give a wrong or ambiguous answer for container types.
    if doc is None:
        return None
    texts = [t.page_content for t in doc]
    return Pinecone.from_texts(texts, embeddings, index_name=index_name)

In [14]:
# 7. Create the OpenAI embedding model (API key read from the environment)
#    and sanity-check it by embedding a sample query
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
query_result = embeddings.embed_query("hello world")
# Length of the returned embedding vector
print(len(query_result))

1536


#### Manage the Pinecone Index

In [15]:
# Name of the Pinecone index that holds the student-candidate embeddings
index_name_ta = "ta-matching-system-students"
# Create the index only when it does not exist yet, so that re-running this
# cell does not fail on an already-existing index.
# The dimension is 1536 because that is the length of the OpenAI embeddings used here.
if index_name_ta not in pinecone.list_indexes():
    pinecone.create_index(index_name_ta, dimension=1536)
# a more complex way: pinecone.create_index("example-index", dimension=128, metric="euclidean", pods=4, pod_type="s1.x1")
pinecone.describe_index(index_name_ta)

IndexDescription(name='ta-matching-system-students', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [16]:
# Embed the split student records and build the Pinecone vector store from them
docsearch_students = recursive_doc_search(students_text, embeddings, index_name=index_name_ta)

### Step 3. Use the LangChain QA Chain to Answer Queries Based on the Given Context (Similar Students' Data)

In [17]:
# 8. Establishing llm and the QA Chain
model_name = "gpt-3.5-turbo-0613"
# temperature=0 is used for the chat model that answers the queries.
llm = ChatOpenAI(temperature=0, model=model_name)
# The former bare `llm.predict(...)` call was dropped: its result was discarded
# (it was not the cell's last expression), so it only spent an API request.
# "stuff" chain type: all input documents are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
def top_skills_course(course_description):
    """Ask the notebook-level ``llm`` for the top skills of a course.

    Args:
        course_description: Course description string appended to the prompt.

    Returns:
        The value returned by ``llm.predict`` for the assembled prompt.
    """
    prompt_prefix = "what are the top skills for this course: "
    skills_prompt = prompt_prefix + course_description
    return llm.predict(skills_prompt)

In [19]:
# 9. Function to get similar students close to the query (docsearch -> vector store) 
def get_similar_docs(docsearch, query, k=20, score=False):
    """Run a similarity search against a vector store.

    Args:
        docsearch: Object providing ``similarity_search`` and
            ``similarity_search_with_score``.
        query: Query passed to the search method.
        k: Second positional argument of the search method (default 20).
        score: When truthy, ``similarity_search_with_score`` is used instead
            of ``similarity_search``.

    Returns:
        Whatever the selected search method returns.
    """
    # Pick the bound method first, then call it once with the same arguments.
    search = docsearch.similarity_search_with_score if score else docsearch.similarity_search
    return search(query, k)

# Example: retrieve the student records closest to a topic query
# (get_similar_docs defaults apply: k=20, no scores)
query = "Artificial Intelligence"
similar_docs_about_query = get_similar_docs(docsearch_students, query)
# Prints the full list of retrieved documents
print(similar_docs_about_query)



[Document(page_content='Last Name: Smith56\nFirst Name: Joe56\nUci Net: Smith56\nConcentration: \nArea of Expertise: I am currently pursuing Masters of Computer Science. Relevant Courses: Algorithms with Applications, Advanced Programming and Problem Solving, Introduction to Artificial Intelligence.  Relevant Courses from my Bachelors include: Advanced Data Structures and Analysis of Algorithms, Automata Theory, Operating Systems, Computer Networks, Cryptography and Network Security, Cloud Computing Services, Artificial Intelligence, Information Retrieval, Software Engineering with Project Management.  I have experience with a wide range of programming languages and technologies, including � Programming languages: Java, C, C++, Python � Web Development: HTML, CSS, PHP, AJAX, JavaScript, React.js, Django, Flask � Artificial Intelligence: NumPy, Pandas, Matplotlib, OpenCV, Scikit-learn, Keras, TensorFlow, MATLAB � App Development: Swift, SwiftUI � Databases: MySQL, Oracle, MongoDB, Fireb

In [20]:
# Course description used both to derive the skills query and inside the ranking prompt
course_description = "Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
# Ranking prompt for the QA chain. The ". " after the course code keeps the
# code and the description from running together ("I&C SCI 9Introduces ...").
query_get_recommendation = (
    "In these students I give you, which 10 students are most suitable Teaching assistant candidates "
    "(sort from best candidates to general candidates) for this course: I&C SCI 9. "
    + course_description
)
# Retrieve candidates using the LLM-generated list of top skills as the search query
similar_students = get_similar_docs(docsearch_students, top_skills_course(course_description))
print(similar_students)

[Document(page_content='Last Name: Smith9\nFirst Name: Joe9\nUci Net: Smith9\nConcentration: Computer Science\nArea of Expertise: 1 I was a Teaching Assistant for the Department of Physics at UC Irvine for two quarters, i.e. Fall 2022 & Winter 2023. I worked under Professors Lee, Guerra, and Wu for classes Physics 3A, 3B, and 7C. My responsibilities included conducting discussions for 100+ students each quarter, conducting and grading quizzes as well as grading midterms and final exams.  2) This is my third quarter of graduate studies in ICS and I have maintained a consistent 4.0 GPA even after pursuing complicated courses like CS231P, CS253P, CS261P, CS271P, and CS273P. 3) I was a Teaching Assistant for my undergraduate course \'CSL802- Distributed Computing Lab\' where I was responsible for creating a list of experiments that students would perform for the duration of the course. I also assisted students through various phases of their course projects. 4) During my role as a Data Eng

In [74]:
# 10. Functions to get the answer to the question
def get_recommendation(similar_students_info, query, chain):
    """Run the QA chain over the candidate documents and return its answer.

    Args:
        similar_students_info: Documents passed to the chain as ``input_documents``.
        query: Question passed to the chain as ``question``.
        chain: Object exposing ``run(input_documents=..., question=...)``.

    Returns:
        The chain's answer, or ``None`` when the chain raised. The error is
        printed in that case; previously the function then failed with
        ``UnboundLocalError`` because ``ans`` was never assigned.
    """
    try:
        # operation that may raise an exception (e.g. a token-limit error)
        return chain.run(input_documents=similar_students_info, question=query)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Possible fallback, not enabled: summarize the documents first and
        # run the chain again on the summaries.
        return None

In [None]:
# Ask the QA chain to pick candidates using the full retrieved records as context.
# NOTE(review): the section below reports a token-limit error for oversized context — confirm whether this call triggers it.
res = get_recommendation(similar_students, query_get_recommendation, chain)
print(res)

## TODO: Work Around OpenAI's Token Limit With Chain Type

If Token Limit Error:
InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 8216 tokens. Please reduce the length of the messages.

In [25]:
from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain

# Second LLM instance (langchain OpenAI wrapper, temperature=0) used by the summarization chains below
llm2 = OpenAI(temperature=0)

def summarize_candidates(docs):
    """Summarize documents with a map_reduce summarize chain.

    The documents are first split with a ``CharacterTextSplitter``
    (chunk_size=1000, chunk_overlap=0); the chunks are then run through a
    chain loaded with the notebook-level ``llm2``.

    Args:
        docs: Documents to split and summarize.

    Returns:
        The summary returned by the chain's ``run``; it is also printed.
    """
    chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(docs)
    summarize_chain = load_summarize_chain(llm2, chain_type="map_reduce")
    summary = summarize_chain.run(chunks)
    print(summary)
    return summary

In [None]:
# Course description used as the similarity-search query and inside the prompt
course_description = "Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
# The ". " after the course code keeps the code and the description from
# running together ("I&C SCI 9Introduces ...").
query_get_recommendation = (
    "In these students I give you, which 5 students are suitable Teaching assistant candidates "
    "for this course: I&C SCI 9. " + course_description
)
# Here the raw course description (not the LLM-derived skills) drives the retrieval
similar_students = get_similar_docs(docsearch_students, course_description)
print(similar_students)
# Inspect the container type, the element type and the text type
print(type(similar_students))
print(type(similar_students[0]))
print(type(similar_students[0].page_content))
# Split and summarize the retrieved records; the summary is printed and returned
summarize_candidates(similar_students)

<class 'list'>
<class 'langchain.schema.Document'>
<class 'str'>

In [22]:
# Split the retrieved student documents with the same splitter settings as
# summarize_candidates (chunk_size=1000, chunk_overlap=0)
char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_chunks = char_text_splitter.split_documents(similar_students)
# Number of chunks produced
print(len(docs_chunks))

20


In [78]:
from langchain.schema import Document
def generate_new_doc(page_content, metadata):
    """Wrap text and metadata in a ``Document``.

    Args:
        page_content: Value stored as the document's ``page_content``.
        metadata: Value stored as the document's ``metadata``.

    Returns:
        The newly constructed ``Document``.
    """
    return Document(page_content=page_content, metadata=metadata)

In [54]:
# Remove the first chunk in place.
# NOTE(review): this mutates docs_chunks, so every re-run of this cell removes one more element.
del docs_chunks[0]

In [55]:
# Disabled experiment: insert a replacement chunk at the first position
# (new_chunk_prompt is not defined in the visible part of the notebook)
# docs_chunks.insert(0, new_chunk_prompt)
# Report the current number of chunks
print(len(docs_chunks))

20


In [86]:
import openai
class StudentsInfoSummarize:
    """Condense student records into short summaries via the OpenAI chat API.

    Each input record's ``page_content`` is sent to the chat completion
    endpoint, and the returned text is wrapped in a new document by
    ``generate_new_doc`` with empty metadata.
    """

    def __init__(self, students_info):
        """Store the records to summarize.

        Args:
            students_info: Iterable of objects exposing ``page_content``.
        """
        # Records to summarize
        self.students_info = students_info
        # Summaries produced by the most recent get_students_summary() call
        self.students_info_summary = []

    def get_students_summary(self):
        """Summarize every stored record and return the summary documents.

        The result list is rebuilt on every call. Previously the summaries
        were appended to a list created once in ``__init__``, so a second
        call returned every summary twice.

        Returns:
            A list with one document per input record, in input order. The
            same list is kept in ``self.students_info_summary``.
        """
        self.students_info_summary = [
            generate_new_doc(self.summarize_one_student(student.page_content), {})
            for student in self.students_info
        ]
        return self.students_info_summary

    # pass one student info string to summarize_one_student
    def summarize_one_student(self, student_info):
        """Summarize a single student's record with one chat completion call.

        Args:
            student_info: Text of one student record, sent as the user message.

        Returns:
            The message content of the first choice in the API response.
        """
        # The API key is read from the environment on every call.
        openai.api_key = os.getenv("OPENAI_API_KEY")

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "Please provide summary for the student background. Include their name, Uci Net and background information."
                },
                {
                    "role": "user",
                    "content": student_info
                },
            ],
            temperature=0,
            # Caps the length of each generated summary
            max_tokens=150,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )
        return response.choices[0].message.content

In [87]:
# Disabled: import the class from a module instead of the cell above
# from students_info_summarize import StudentsInfoSummarize
# Summarize every chunk; each summarize_one_student call is one chat completion request
students_info_summarize = StudentsInfoSummarize(docs_chunks)
concise_context = students_info_summarize.get_students_summary()
print(type(concise_context))
print(len(concise_context))
# concise_context: the similar-students context in a more concise form, intended to fit within the token limit


<class 'list'>
20


In [56]:
# Inspect the first chunk: its type, its text and its metadata
print(type(docs_chunks[0]))
print("page_content", docs_chunks[0].page_content)
print("metadata", docs_chunks[0].metadata)

<class 'langchain.schema.Document'>
page_content Last Name: Smith9
First Name: Joe9
Uci Net: Smith9
Concentration: Computer Science
Area of Expertise: 1 I was a Teaching Assistant for the Department of Physics at UC Irvine for two quarters, i.e. Fall 2022 & Winter 2023. I worked under Professors Lee, Guerra, and Wu for classes Physics 3A, 3B, and 7C. My responsibilities included conducting discussions for 100+ students each quarter, conducting and grading quizzes as well as grading midterms and final exams.  2) This is my third quarter of graduate studies in ICS and I have maintained a consistent 4.0 GPA even after pursuing complicated courses like CS231P, CS253P, CS261P, CS271P, and CS273P. 3) I was a Teaching Assistant for my undergraduate course 'CSL802- Distributed Computing Lab' where I was responsible for creating a list of experiments that students would perform for the duration of the course. I also assisted students through various phases of their course projects. 4) During my

In [88]:
# Retry the recommendation query, this time passing the summarized documents
# (concise_context) as context instead of the full records
res = get_recommendation(concise_context, query_get_recommendation, chain)
print(res)

Based on the provided information, the following 10 students are the most suitable Teaching Assistant candidates for the course I&C SCI 9:

1. Joe Smith9: With experience as a Teaching Assistant for the Department of Physics and a background in Computer Science, Joe Smith9 would be a strong candidate for this course.

2. Joe Smith109: With expertise in software development and machine learning, Joe Smith109 would be well-equipped to assist students in applying computational tools to problem-solving in physics and engineering.

3. Joe Smith120: With a strong background in software development and a comprehensive understanding of statistical analysis and data analysis, Joe Smith120 would be a valuable asset in teaching the quantitative approach to problem-solving in this course.

4. Joe Smith90: With experience as a Teaching Assistant for courses in Boolean Logic and Discrete Structures, Joe Smith90 would have the necessary mathematical background to assist students in this physics and e

In [51]:
# Alternative: one combined summary from a map_reduce summarize chain,
# run on the first five chunks only
model = load_summarize_chain(llm2, chain_type="map_reduce")
summary = model.run(docs_chunks[0:5])


In [52]:
# Show the combined summary produced by the map_reduce chain
print(summary)

 Joe Smith84, Joe Smith67, and Joe Smith122 are all students at UC Irvine with expertise in various areas of Computer Science, Electrical Engineering, and Computer Engineering. They have experience in programming languages such as Python, C/C++, Java, and MATLAB, and have worked in various roles such as Teaching Assistants, Data Engineers, and Intel Student Ambassadors. They have also taken courses in Network Analysis II, Information Retrieval, and other relevant topics.


In [6]:
# Once finished, we delete the Pinecone index to save resources.
# The existence check makes the cell safe to re-run: deleting an index that is
# already gone would otherwise raise an API error.
if index_name_ta in pinecone.list_indexes():
    pinecone.delete_index(index_name_ta)