In [3]:
# or to use other formats, use the following after installing the required packages:
from langchain.document_loaders import UnstructuredFileLoader

In [1]:
# Process the CSV file and keep only the columns we need
import csv

def filter_csv(input_file, output_file, columns):
    """Copy ``input_file`` to ``output_file`` keeping only the requested columns.

    Args:
        input_file: Path of the CSV file to read; its first row is the header.
        output_file: Path of the CSV file to create (overwritten if present).
        columns: Column names to keep. Names that are not in the input header
            are ignored; the output keeps the order given here.

    Returns:
        None. A status message is printed for both outcomes. When none of the
        requested columns exist (including an empty input file), nothing is
        written and ``output_file`` is left untouched.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain embedded newlines are read back intact.
    with open(input_file, 'r', newline='') as file:
        reader = csv.DictReader(file)
        # fieldnames is None for an empty file; treat that as "no header"
        # instead of failing on the membership test below.
        headers = reader.fieldnames or []

        # Validate the specified columns
        valid_columns = [col for col in columns if col in headers]
        if not valid_columns:
            print("No valid columns specified.")
            return

        with open(output_file, 'w', newline='') as output:
            writer = csv.DictWriter(output, fieldnames=valid_columns)
            writer.writeheader()
            writer.writerows(
                {col: row[col] for col in valid_columns} for row in reader
            )

    print("CSV file created successfully.")

In [None]:
# Example usage
# Source CSV to read and the reduced CSV to create.
# NOTE(review): a later cell loads 'selected_columns_only.csv', not the
# output file named here — confirm which file feeds the rest of the notebook.
input_file = 'students_sum23_all_field_uuid.csv'
output_file = 'dummy_students_candidacy_data2.csv'
columns_to_keep = ['UUID','Area of Expertise', 'Programming', 'Past Experience','Course Prefs']  # Specify the columns you want to keep

# Writes output_file containing only the columns listed above that exist in input_file.
filter_csv(input_file, output_file, columns_to_keep)

## Use LangChain + Chroma + GPT Model to Match Best TA Candidates for Each Course
## Main idea


### Process All Students' Info -> Vector Store -> Summarize Each Student's Info and Store It in a SQL DB -> Select the 20 Students Whose Background Is Most Similar to the Course Description as Context -> Get Those 20 Students' Info from the SQL DB Within the Token Limit -> Query the GPT Model + Context -> GPT Gives the Top 10 Candidates Based on the Prompt Instructions

#### To Avoid Latency and Timeout Errors: We will run the whole dataset first and store all the recommendations in our SQL database

### Step 1 : Import LangChain CSVLoader and load the Students Information

In [4]:
# 1. Function: Importing langChain document loaders and loading the data
from langchain.document_loaders.csv_loader import CSVLoader
def load_data_csv_file(file_path):
    """Load the CSV at ``file_path`` with LangChain's ``CSVLoader``.

    Returns whatever ``CSVLoader(file_path).load()`` produces.
    """
    return CSVLoader(file_path).load()

In [5]:
# 2. Function: Split the text into chunks
from langchain.text_splitter import CharacterTextSplitter
def split_documents(documents, chunk_size=1000, chunk_overlap=0):
    """Split ``documents`` into chunks with a ``CharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``CharacterTextSplitter.split_documents(documents)``.
    """
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

In [7]:
# 3. Test load_data_csv_file and split_documents
# Load the filtered student CSV, then run the loaded documents through the splitter.
students_data = load_data_csv_file('selected_columns_only.csv')
students_text = split_documents(students_data)
# Compare the number of loaded documents with the number of chunks after splitting.
print(len(students_data))
print(len(students_text))
# Spot-check one chunk.
# NOTE(review): index 88 is hard-coded and raises IndexError on a smaller file.
print(students_text[88].page_content)

136
136
UUID: e69d8240-cef5-427e-97ee-604c135293d2
Area of Expertise: I have expertise in Software Design, Software Architecture, Discrete Mathematics, Machine Learning, Data Structure, Algorithms, Compilers, Computer Architecture, Operating Systems, Robotics, Digital Design, Microprocessors, Computer Networks and Database Management. I am well-versed in programming languages like Python, C/C#/C++, SystemVerilog, Java, MySQL, Dot Net framework, Angular and MATLAB.  My qualifications in Computer Science are that I have worked as Software Engineer at Larsen and Toubro Infotech (LTI)for 1.5 years to gain industrial experience in the above-mentioned domains. I completed the above courses during my undergraduate years with good grades.   Yes, I have conducted offline (in-person) Robotics workshops during my undergraduate studies .
Programming: I successfully completed DotNet Full Stack Developer training at LTI. I have worked on and developed various SQL scripts, Stored Procedures and SSIS 

In [8]:
print(students_text[100].page_content)

UUID: 79c2e103-b928-47f0-8113-409ef5fb5811
Area of Expertise: My area of expertise is computer architecture and system design. I have worked with programming languages for several years now. During my undergraduate studies at the American University of Beirut, I TA'ed 2 online courses: Introduction to programming in Python and Computer Organization. In both of these courses, I was helping students with their assignments, answering their questions, and grading their submissions.
Programming: I have taken courses on programming using VHDL, C, C++, Python, MATLAB, and SQL. I have also used most of these languages in projects (academic and non-academic) beyond the scope of the courses. I have used C++ to run machine learning programs on a Jetson and used C to run audio detection programs on microcontrollers such as the Arduino Nano. I also have extended hands-on experience with Python, as I have used it for many projects and previously TA'ed a course for it. I am also currently using VHDL 

### Step 2. Set Up the Vector Store (Embedding Index) of the Students Information

In [9]:
# 4. #importing Chroma packages and OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [10]:
# 5. Function: Build a Chroma vector store from documents (despite the name, nothing here is recursive)
def recursive_doc_search(doc, embeddings, persist_directory):
    """Build a Chroma vector store from ``doc``.

    Despite the name, nothing here is recursive: the function only guards
    against a missing input and delegates to ``Chroma.from_documents``.

    Args:
        doc: Documents to index, or ``None``.
        embeddings: Embedding object passed to ``Chroma.from_documents``.
        persist_directory: Forwarded to ``Chroma.from_documents`` as
            ``persist_directory``.

    Returns:
        The store returned by ``Chroma.from_documents``, or ``None`` when
        ``doc`` is ``None``.
    """
    # Identity check: `== None` would dispatch to the argument's __eq__,
    # which is not what a "was anything passed?" guard should depend on.
    if doc is None:
        return None
    return Chroma.from_documents(doc, embeddings, persist_directory=persist_directory)

In [11]:
# 6. Use OpenAI's Embeddings
import os
# The API key is read from the OPENAI_API_KEY environment variable rather than
# being written into the notebook.
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# Sanity check: embed a short string and print the length of the returned vector.
query_result = embeddings.embed_query("hello world")
print(len(query_result))

1536


#### Set Up New Chroma Vector DB and Loading Existing Vector DB

In [13]:
# If you want to delete an existing Vector DB
# NOTE(review): the path is relative, so this looks in the kernel's current
# working directory; persist_directory is only defined in a later cell —
# confirm both refer to the same location.
if os.path.exists("chroma.sqlite3"):
    os.remove("chroma.sqlite3")
    print("File deleted.")
else:
    print("File does not exist.")

File deleted.


In [14]:
# 7. Creating a Chroma local client
import os

import chromadb

# Directory where the vector DB is persisted. Set the TA_MATCHING_PERSIST_DIR
# environment variable to run the notebook on another machine; the fallback is
# the directory this notebook was originally developed against.
persist_directory = os.getenv(
    "TA_MATCHING_PERSIST_DIR",
    "/Users/sunfengnan/Documents/TA_Matching/LangChain_Practice",
)
client = chromadb.PersistentClient(path=persist_directory)

# Embed every student chunk and store it in a named collection.
# NOTE(review): re-running this cell against an existing collection
# presumably adds the same documents again — verify before re-running.
vector_db = Chroma.from_documents(
    client=client,
    documents=students_text,
    collection_name="ta_matching_students_info",
    embedding=embeddings,
    persist_directory=persist_directory,
)
print(f"Documents Loaded: {vector_db._collection.count()}")

Documents Loaded: 136


In [12]:
# If you want to delete data in existing Vector DB
# vector_db = None

In [15]:
# NOTE(review): the value returned by get() is discarded here.
vector_db.get()
# Print how many entries the underlying collection reports.
print(f"Documents Loaded: {vector_db._collection.count()}")

Documents Loaded: 136


### Notice: If we already have a Vector DB, we can load it directly and use it again.

In [16]:
# Now we can load the persisted DB from disk, and use it as normal
# Reuses the client, directory, collection name and embeddings from the cells
# above, so this opens the collection that was populated earlier.
vector_db_students = Chroma(
    client=client,
    persist_directory=persist_directory, 
    collection_name="ta_matching_students_info",
    embedding_function=embeddings)
# NOTE(review): the value returned by get() is discarded here.
vector_db_students.get()
# Confirm the reloaded collection reports the expected number of entries.
print(f"Documents Loaded: {vector_db_students._collection.count()}")
print(type(vector_db_students))

Documents Loaded: 136
<class 'langchain.vectorstores.chroma.Chroma'>


### Step 3. Use the LangChain QA Chain and Do Query based on Given Context (Similar Students Data)

In [17]:
# 9. Establishing llm and the QA Chain
# Chat model shared by the helper functions below (they read the global `llm`).
model_name = "gpt-3.5-turbo-0613"
llm=ChatOpenAI(temperature=0, model=model_name)
# QA chain built with chain_type="stuff".
# NOTE(review): presumably this places every input document into one prompt,
# which would explain the token-limit error described later — confirm.
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
# 10. Function: Let gpt model summarize the Top skills/Academic background for a course
def top_skills_course(course_description):
    """Ask the global ``llm`` what a TA for the described course needs.

    The course description is appended to a fixed instruction and sent via
    ``llm.predict``. The reply is printed and also returned.
    """
    prompt = (
        "Summarize the skills, teaching background, and academic background required to be a successful Teaching assistant for this course, token limit 100:"
        + course_description
    )
    summary = llm.predict(prompt)
    print(summary)
    return summary

In [19]:
# 11. Function to get similar students close to the query (docsearch -> vector store) 
def get_similar_docs(db, query, k=20, score=False):
    """Return the ``k`` entries of ``db`` most similar to ``query``.

    Args:
        db: Object exposing ``similarity_search`` and
            ``similarity_search_with_score``.
        query: Query passed to the search method.
        k: Number of results requested.
        score: When true, ``similarity_search_with_score`` is used instead of
            ``similarity_search``.

    Returns:
        Whatever the selected search method returns.
    """
    search = db.similarity_search_with_score if score else db.similarity_search
    return search(query, k)

In [20]:
# For example, to find the top 20 students whose academic background is most similar to the course description
# and use them as our query context:
course_description = "Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
# The LLM's summary of the required skills (not the raw course description) is the similarity query.
similar_students = get_similar_docs(vector_db_students, top_skills_course(course_description))
print(similar_students)

To be a successful Teaching Assistant for this course, one must possess a strong academic background in physics and engineering. A solid understanding of principles, techniques, and computational tools used in quantitative problem solving is essential. Additionally, a strong foundation in mathematics, specifically calculus, is required, with a prerequisite of MATH 2A or completion of AP Calculus AB or BC. Familiarity with Physics 2, which may overlap with this course, is also beneficial. Teaching experience or a background in education would be advantageous in effectively assisting students in understanding and applying the course material.
[Document(page_content='UUID: 91f60ad1-a7cb-4666-8bec-56dd08a6d91b\nArea of Expertise: 1) I was a Teaching Assistant for the Department of Physics at UC Irvine for two quarters, i.e. Fall 2022 & Winter 2023. I worked under Professors Lee, Guerra, and Wu for classes Physics 3A, 3B, and 7C. My responsibilities included conducting discussions for 100+ 

In [21]:
# 10. Functions to get the answer to the question
def get_recommendation(similar_students_info, query, chain):
    """Run the QA chain over the given student documents.

    Args:
        similar_students_info: Documents passed to the chain as
            ``input_documents``.
        query: Question passed to the chain as ``question``.
        chain: Object whose ``run(input_documents=..., question=...)`` produces
            the answer.

    Returns:
        The chain's answer, or ``None`` if the chain raised. The error is
        printed, so the notebook keeps running and callers can test for None.
    """
    try:
        return chain.run(input_documents=similar_students_info, question=query)
    except Exception as e:
        # Best-effort: report the failure and return an explicit None. The
        # previous version fell through to `return ans` with `ans` unbound,
        # so an UnboundLocalError replaced the real error.
        print(f"An error occurred: {e}")
        return None

In [None]:
# Notice: if we run the query with 20 students' info as context, the GPT model will return a token-limit error
# course_num = "I&C SCI 9"
# query_get_recommendation = "In these students I give you, which 10 students are suitable Teaching assistant candidates for this course" + course_num + course_description
# res = get_recommendation(similar_students, query_get_recommendation, chain)
# print(res)

## TODO: Work Around OpenAI's Token Limit With Chain Type

If Token Limit Error:
InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 8216 tokens. Please reduce the length of the messages.

In [22]:
# Three sample course descriptions with their matching course numbers below.
course_description = "Introduction to Computation for Scientists and Engineers: Introduces principles, techniques, and computational tools for quantitative approach to basic problem solving in physics and engineering. Pre-requisites: MATH 2A or AP Calculus AB or AP Calculus BC. Overlaps: Physics 2 "
course_description2 = "Human Factors for the Web: Principles of human-computer interaction in evaluating, designing, and developing information presented on the World Wide Web. Topics include user characteristics, usability analysis, navigation and organization, color, typography, multimedia, information visualization, prototyping, user studies, evaluation strategies, and web accessibility."
course_description3 = "Programming in C/C++ as a Second Language: Introduction to the lexical, syntactic, semantic, and pragmatic characteristics of the C/C++ languages for experienced programmers. Emphasis on object-oriented programming, using standard libraries, and programming with manual garbage collection."

course_num = "I&C SCI 9"
course_num2 = "I&C SCI 10"
course_num3 = "I&C SCI 45C"
# Only the third course is used here; `similar_students` is overwritten with its matches.
similar_students = get_similar_docs(vector_db_students, top_skills_course(course_description3))
print(similar_students)
# Inspect the types and the size of the first match.
print(type(similar_students))
print(type(similar_students[0]))
print(type(similar_students[0].page_content))
print(len(similar_students[0].page_content))

To be a successful Teaching Assistant for the course "Programming in C/C++ as a Second Language: Introduction to the lexical, syntactic, semantic, and pragmatic characteristics of the C/C++ languages for experienced programmers," several skills, teaching background, and academic background are required.

Skills:
1. Proficiency in C/C++ programming languages: A strong command over C/C++ programming languages is essential to effectively assist students in understanding the lexical, syntactic, semantic, and pragmatic characteristics of these languages.
2. Object-oriented programming (OOP): In-depth knowledge of OOP concepts and their implementation in C/C++ is crucial, as the course emphasizes object-oriented programming.
3. Familiarity with standard libraries: Being well-versed in the usage of standard libraries in C/C++ is necessary to guide students in utilizing these libraries effectively.
4. Manual garbage collection: Understanding the concept and implementation of manual garbage col

<class 'list'>
<class 'langchain.schema.Document'>
<class 'str'>

In [23]:
# Run the retrieved student documents through the same splitter settings used
# earlier (chunk_size=1000, chunk_overlap=0) and inspect the first chunk.
char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_chunks = char_text_splitter.split_documents(similar_students)
print("length of docs_chunks", len(docs_chunks))
print(type(docs_chunks[0]))
print("page_content", docs_chunks[0].page_content)
print("text length for student_1", len(docs_chunks[0].page_content))
print("metadata", docs_chunks[0].metadata)

length of docs_chunks 20
<class 'langchain.schema.document.Document'>
page_content UUID: 87138c34-949d-4ebb-824d-692f8e72a386
Area of Expertise: 1.I think my area of expertise is software engineering (software testing, empirical software study, static analysis)  2.I have no special qualifications in Computer Science  3.I have experience teaching three online courses.  a)In my sophomore year (Spring 2020), I helped a junior high school student ready for the high school entrance examination, including mathematics, physics, chemistry and English. b)In my senior year (Spring 2022), as a Teaching Assistant of the course “Introduction to Computer Programming (Java)”, I taught the students how to code and debug. c)In my senior year (Spring 2022), as a Teaching Assistant of the course “Software Engineering”, I taught the students how to code and debug, and graded their presentations.
Programming: 1.I have nearly 4-year coding experience of Java. I use Java to complete many course projects, inc

In [24]:
# Helper function for Class StudentsInfoSummarize
from langchain.schema import Document
def generate_new_doc(page_content, metadata):
    """Wrap ``page_content`` and ``metadata`` in a new ``Document``."""
    return Document(page_content=page_content, metadata=metadata)

In [25]:
# Summarize students info within the token limit
import openai
class StudentsInfoSummarize:
    """Summarize each student's record with an OpenAI chat model.

    Args:
        students_info: Sequence of objects with a ``page_content`` string, one
            per student.
        model: Model name sent with each chat completion request.
        max_tokens: Value sent as ``max_tokens`` with each request.

    Attributes:
        students_info: The input passed to the constructor.
        students_info_summary: Summary documents from the latest
            ``get_students_summary`` call (empty until it is called).
    """

    def __init__(self, students_info, model="gpt-3.5-turbo", max_tokens=150):
        self.students_info = students_info
        self.students_info_summary = []
        self.model = model
        self.max_tokens = max_tokens

    def get_students_summary(self):
        """Summarize every student and return the list of summary documents.

        The list is rebuilt on every call, so calling this twice does not
        duplicate entries. Each summary document is created with empty
        metadata.
        """
        self.students_info_summary = [
            generate_new_doc(self.summarize_one_student(student.page_content), {})
            for student in self.students_info
        ]
        return self.students_info_summary

    # pass one student info string to summarize_one_student
    def summarize_one_student(self, student_info):
        """Return the model's summary text for one student's info string."""
        # The API key is read from the environment on each call.
        openai.api_key = os.getenv("OPENAI_API_KEY")

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "Summarize the student's background. Include their UUID, expertise, programming skills, and past experience (include all teaching experience, if any)"
                },
                {
                    "role": "user",
                    "content": student_info
                },
            ],
            temperature=0,
            max_tokens=self.max_tokens,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )
        return response.choices[0].message.content

In [None]:
# from students_info_summarize import StudentsInfoSummarize
# Summarize each retrieved student chunk; every summary is a separate model
# call, so this cell makes one request per entry in docs_chunks.
students_info_summarize = StudentsInfoSummarize(docs_chunks)
concise_context = students_info_summarize.get_students_summary()
print(type(concise_context))
print(len(concise_context))
# concise_context: the similar-students context in a more concise form, intended to fit within the token limit


In [28]:
# Spot-check two summaries and the character length of the first one.
# NOTE(review): the "student2" label prints index 5, i.e. the sixth summary.
print("student1",concise_context[0].page_content, "\n\n")
print(len(concise_context[0].page_content))
print("student2", concise_context[5].page_content)

student1 UUID: 87138c34-949d-4ebb-824d-692f8e72a386
Area of Expertise: The student's area of expertise is software engineering, specifically in software testing, empirical software study, and static analysis. They do not have any special qualifications in Computer Science.

Programming Skills: The student has nearly 4 years of coding experience in Java and has used it for various projects, including backend development, bug fixing in open-source projects, and their undergraduate thesis. They also have some experience with C/C++ and have used C for two projects. They have experience with SQL languages, specifically PostgreSQL. They have about 10 years of experience with Windows OS and nearly 4 years of experience with Linux 


723
student2 UUID: a5a9f0e8-84de-4116-858d-7b7807d6e468
Area of Expertise: The student has a strong background in Computer Science, with expertise in areas such as Digital Logic Design, Data Structures, Computer Organization and Architecture, Machine Learning, DBM

In [30]:
# use the concise summary and query again, with students' background keywords
# Separators are added around the course number and description; plain
# concatenation produced "...for this courseI&C SCI 45CProgramming in...".
query_get_recommendation = (
    "Provided a list of students. Which 10 students would make the best Teaching assistant candidates "
    "(sort from best candidates to worst candidates) for this course: "
    + course_num3
    + " - "
    + course_description3
    + " Include their background key words."
)
res = get_recommendation(concise_context, query_get_recommendation, chain)
print(res)

To determine the best Teaching Assistant candidates for the course I&C SCI 45C, we can consider the students' areas of expertise, programming skills, and teaching experience. Based on the provided information, the following 10 students would make strong candidates, sorted from best to worst:

1. UUID: a57392f2-1cd9-44ec-a15e-e48e4e67cd5a
   - Area of Expertise: Machine learning, digital signal processing, speech recognition, natural language processing, image processing
   - Programming Skills: Java, C++, C, Unix, VHDL, Scheme
   - Teaching Experience: TA for CS 141 (Programming Languages) in Spring 2023, Reader for CS 141 in Spring 2022

2. UUID: 5ae1e953-678e-4474-b8d0-331ed44b7435
   - Area of Expertise: Algorithm Analysis and Data Structures, Discrete Mathematics, Linear Algebra, Computer Graphics, boolean algebra, Theory of Computation.
   - Programming Skills: C++, C, Python
   - Teaching Experience: TA for ICS46 (Data Structure in C++) in Winter 2023, TA for ICS33 (Intermediate 

In [32]:
# use the concise summary and query again, only with the UUID
# Separators are added around the course number and description; plain
# concatenation produced "...for this courseI&C SCI 45CProgramming in...".
query_get_recommendation = (
    "Provided a list of students. Which 10 students would make the best Teaching assistant candidates "
    "(sort from best candidates to worst candidates) for this course: "
    + course_num3
    + " - "
    + course_description3
    + " Only give the UUID."
)
res = get_recommendation(concise_context, query_get_recommendation, chain)
print(res)

Based on the provided information, the following 10 students would make the best Teaching Assistant candidates for the course I&C SCI 45C:

1. UUID: a57392f2-1cd9-44ec-a15e-e48e4e67cd5a
2. UUID: 85231e67-f819-4431-9231-fe8769be7d44
3. UUID: 5ae1e953-678e-4474-b8d0-331ed44b7435
4. UUID: 48ec36be-a6d0-4e7c-9b86-cd2a610e70b3
5. UUID: 72251169-337d-45bf-b1fe-46de54d2ff8c
6. UUID: a5a9f0e8-84de-4116-858d-7b7807d6e468
7. UUID: 68d25662-8c15-42d5-b061-ce10acdb1db8
8. UUID: 06be6af0-d8e6-4d2a-8e4b-e3e9803388e0
9. UUID: 3d6197b3-a5cc-4504-8c3d-46c90c83f595
10. UUID: 1e7c76d9-f68e-4d9e-a637-f05df4f8f7db
